Handle characters in the category 'nonspacing mark'
These characters are ignored because they decorate the previous character.
This commit is contained in:
55
testline.py
55
testline.py
@@ -1,5 +1,7 @@
|
||||
import unittest
|
||||
|
||||
import unicodedata
|
||||
|
||||
from line import Line
|
||||
|
||||
|
||||
@@ -27,6 +29,20 @@ class MyTestCase(unittest.TestCase):
|
||||
self.assertEqual(9, line.column_to_char(15)) # tab
|
||||
self.assertEqual(10, line.column_to_char(16)) # g
|
||||
|
||||
def test_column_to_char_ignore_nonspacing_mark_charaters(self):
    """
    Check column_to_char on text that contains nonspacing marks.

    Nonspacing marks are the little decorations applied to the previous character.
    The sample follows each of x, y and z with U+0308 (combining diaeresis),
    e.g. x + U+0308 renders as ẍ.

    The expectations below require columns 0, 1 and 2 to map to char indexes
    0, 2 and 4, which are the indexes of the letters x, y and z in the sample.
    """
    start = 123
    sample = "x\u0308y\u0308z\u0308"
    encoded_length = len(sample.encode("utf8"))
    line = Line(byte_offset=start, byte_end=start + encoded_length, line=sample)

    # (expected char index, column), in the order x, y, z
    cases = (
        (0, 0),
        (2, 1),
        (4, 2),
    )
    for expected_char, column in cases:
        self.assertEqual(expected_char, line.column_to_char(column))
|
||||
|
||||
def test_char_to_column(self):
|
||||
byte_offset = 123
|
||||
text = "\tabc\td\tef\tg" # will be rendered as: ....abc.d...ef..g where . represents a whitespace column
|
||||
@@ -43,6 +59,23 @@ class MyTestCase(unittest.TestCase):
|
||||
self.assertEqual(14, line.char_to_column(9)) # tab
|
||||
self.assertEqual(16, line.char_to_column(10)) # g
|
||||
|
||||
def test_char_to_column_ignore_nonspacing_mark_charaters(self):
    """
    Check char_to_column on text that contains nonspacing marks.

    Nonspacing marks are the little decorations applied to the previous character.
    The sample follows each of x, y and z with U+0308 (combining diaeresis),
    e.g. x + U+0308 renders as ẍ.

    The expectations below require a mark to report the same column as the
    letter it decorates.
    """
    byte_offset = 123
    text = "x\u0308y\u0308z\u0308"
    line = Line(byte_offset=byte_offset, byte_end=byte_offset + len(text.encode("utf8")), line=text)

    self.assertEqual(0, line.char_to_column(0))  # x
    self.assertEqual(0, line.char_to_column(1))  # mark on x
    self.assertEqual(1, line.char_to_column(2))  # y
    self.assertEqual(1, line.char_to_column(3))  # mark on y
    self.assertEqual(2, line.char_to_column(4))  # z
    self.assertEqual(2, line.char_to_column(5))  # mark on z
|
||||
|
||||
def test_line_tabs_replaced(self):
|
||||
byte_offset = 123
|
||||
text = "\ta\tb" # will be rendered as: ....abc where . represents a whitespace column
|
||||
@@ -57,5 +90,27 @@ class MyTestCase(unittest.TestCase):
|
||||
line = Line(byte_offset=byte_offset, byte_end=byte_offset + len(text.encode("utf8")), line=text)
|
||||
self.assertEqual(expected, line.line_tabs_replaced())
|
||||
|
||||
def test_byte_index_to_char_index(self):
    """
    Check byte_index_to_char_index for the bytes of a letter and its nonspacing mark.

    Encoded as UTF-8, the letter x takes one byte and U+0308 takes two, so byte
    indexes 0 to 2 cover the first letter plus its mark. All three are expected
    to map to char index 0.
    """
    byte_offset = 123
    text = "x\u0308y\u0308z\u0308\t\u0308a"
    line = Line(byte_offset=byte_offset, byte_end=byte_offset + len(text.encode("utf8")), line=text)

    # NOTE(review): only the first three byte indexes are asserted; the remainder
    # of the sample (y, z, the tab with a mark, and a) is not exercised here.
    self.assertEqual(0, line.byte_index_to_char_index(0))  # x
    self.assertEqual(0, line.byte_index_to_char_index(1))  # first byte of diacritical mark belonging to x
    self.assertEqual(0, line.byte_index_to_char_index(2))  # second byte of diacritical mark belonging to x
|
||||
|
||||
def test_diacritical_marks(self):
    """
    Print every code point of an NFD-normalized sample with its Unicode name and category.

    This is an exploratory dump for inspecting how emoji sequences and
    diacritical marks break down into code points; it makes no assertions.
    """
    text = "\U0001F9D9\u200D\u2642\uFE0F - \U0001F44D\U0001F3FF - a\u02c3 - ẍ - y\u0308 - w\u200D\u00A8"
    text = unicodedata.normalize("NFD", text)
    print("%s" % text)
    print("length: %s" % len(text))
    for c in text:
        # unicodedata.name raises ValueError for code points without a name
        # unless a default is supplied, so pass one to keep the dump going.
        name = unicodedata.name(c, "<unnamed>")
        print("%s %s cat: %s" % (c, name, unicodedata.category(c)))
|
||||
|
||||
|
||||
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user