import unittest
import unicodedata

from line import Line


def _make_line(text, byte_offset=123):
    """Build a ``Line`` for *text* whose byte range starts at *byte_offset*.

    ``byte_end`` is derived from the UTF-8 encoded length of *text*, which is
    how every test in this module constructs its fixture.
    """
    byte_end = byte_offset + len(text.encode("utf8"))
    return Line(byte_offset=byte_offset, byte_end=byte_end, line=text)


class MyTestCase(unittest.TestCase):
    """Tests for the column / char / byte index conversions of ``Line``.

    The expectations below correspond to tab stops every 4 columns and to
    nonspacing marks (e.g. U+0308 COMBINING DIAERESIS) occupying no column of
    their own.
    """

    # Rendered as "....abc.d...ef..g" where "." represents a whitespace column.
    TABBED_TEXT = "\tabc\td\tef\tg"

    # Each base letter is followed by U+0308, i.e. three visible columns.
    MARKED_TEXT = "x\u0308y\u0308z\u0308"

    def test_column_to_char(self):
        """Every rendered column maps back to the char that occupies it."""
        line = _make_line(self.TABBED_TEXT)
        expected_chars = [
            0, 0, 0, 0,  # leading tab spans columns 0-3
            1, 2, 3,     # a, b, c
            4,           # tab (only one column wide here)
            5,           # d
            6, 6, 6,     # tab spans columns 9-11
            7, 8,        # e, f
            9, 9,        # tab spans columns 14-15
            10,          # g
        ]
        for column, char_index in enumerate(expected_chars):
            with self.subTest(column=column):
                self.assertEqual(char_index, line.column_to_char(column))

    def test_column_to_char_ignore_nonspacing_mark_charaters(self):
        """Nonspacing marks do not get a column of their own.

        Nonspacing mark characters are the little decorations applied to the
        previous character, e.g. x followed by U+0308 renders as a single
        column. Each column therefore maps to the index of the base letter.
        """
        line = _make_line(self.MARKED_TEXT)
        expected_chars = [
            0,  # x + diaeresis
            2,  # y + diaeresis
            4,  # z + diaeresis
        ]
        for column, char_index in enumerate(expected_chars):
            with self.subTest(column=column):
                self.assertEqual(char_index, line.column_to_char(column))

    def test_char_to_column(self):
        """Every char index maps to the first column it is rendered at."""
        line = _make_line(self.TABBED_TEXT)
        expected_columns = [
            0,   # tab
            4,   # a
            5,   # b
            6,   # c
            7,   # tab
            8,   # d
            9,   # tab
            12,  # e
            13,  # f
            14,  # tab
            16,  # g
        ]
        for char_index, column in enumerate(expected_columns):
            with self.subTest(char_index=char_index):
                self.assertEqual(column, line.char_to_column(char_index))

    def test_char_to_column_ignore_nonspacing_mark_charaters(self):
        """A nonspacing mark shares the column of the letter it decorates.

        Nonspacing mark characters are the little decorations applied to the
        previous character, e.g. x followed by U+0308 renders as a single
        column.
        """
        line = _make_line(self.MARKED_TEXT)
        expected_columns = [
            0, 0,  # x, its diaeresis
            1, 1,  # y, its diaeresis
            2, 2,  # z, its diaeresis
        ]
        for char_index, column in enumerate(expected_columns):
            with self.subTest(char_index=char_index):
                self.assertEqual(column, line.char_to_column(char_index))

    def test_line_tabs_replaced(self):
        """Tabs are expanded to spaces up to the next tab stop."""
        # Rendered as "....a...b" where "." represents a whitespace column.
        text = "\ta\tb"
        expected = "    a   b"
        line = _make_line(text)
        self.assertEqual(expected, line.line_tabs_replaced())

    def test_line_tabs_replaced_performance(self):
        """Tab expansion copes with a line containing 10000 tabs."""
        # Each "a\t" pair fills exactly one 4-column tab stop: "a" + 3 spaces.
        text = "a\t" * 10000
        expected = "a   " * 10000
        line = _make_line(text)
        self.assertEqual(expected, line.line_tabs_replaced())

    def test_byte_index_to_char_index(self):
        """Bytes of a combining mark resolve to the char index of its base."""
        text = "x\u0308y\u0308z\u0308\t\u0308a"
        line = _make_line(text)
        # U+0308 is two bytes in UTF-8, so bytes 1 and 2 both belong to the
        # diacritical mark attached to the leading "x".
        expected_chars = [
            0,  # x
            0,  # first byte of the diacritical mark belonging to x
            0,  # second byte of the diacritical mark belonging to x
        ]
        for byte_index, char_index in enumerate(expected_chars):
            with self.subTest(byte_index=byte_index):
                self.assertEqual(
                    char_index, line.byte_index_to_char_index(byte_index)
                )

    def test_diacritical_marks(self):
        """Print the NFD decomposition of assorted marked sequences.

        Purely diagnostic: there are no assertions, the test only dumps each
        code point together with its Unicode name and general category.
        """
        text = (
            "\U0001F9D9\u200D\u2642\uFE0F - \U0001F44D\U0001F3FF - a\u02c3"
            " - ẍ - y\u0308 - w\u200D\u00A8"
        )
        text = unicodedata.normalize("NFD", text)
        print("%s" % text)
        print("length: %s" % len(text))
        for c in text:
            print(
                "%s %s cat: %s"
                % (c, unicodedata.name(c), unicodedata.category(c))
            )


if __name__ == '__main__':
    unittest.main()