Handle characters in the Unicode category 'nonspacing mark' (Mn)

These characters are ignored when computing columns, because they decorate the previous character and do not take up space of their own.
2022-01-29 10:43:23 +01:00
parent a413134f68
commit f0c3d34995
7 changed files with 132 additions and 26 deletions
--- a/line.py
+++ b/line.py
@@ -1,3 +1,4 @@
+import unicodedata
 from typing import Dict

 import constants
@@ -8,7 +9,8 @@ class Line:
        self._byte_offset = byte_offset
        self._byte_end = byte_end
        self._line = line
-        self._char_to_column_cache = self._cache_char_to_column()
+
+        self._cache_char_to_column()

    def byte_offset(self) -> int:
        return self._byte_offset
@@ -23,9 +25,11 @@ class Line:
        return len(self._line)

    def length_in_columns(self) -> int:
-        return self.char_to_column(len(self._line))
+        return self.char_to_column(len(self._line) - 1)

    def char_index_to_byte(self, char_in_line: int) -> int:
+        # todo this does not work with multibyte characters
+        # should probably be len(self.prefix(char_in_line-1).encode("utf8"))
        return len(self.prefix(char_in_line).encode("utf8"))

    def byte_index_to_char_index(self, byte_index: int) -> int:
@@ -52,19 +56,7 @@ class Line:
        return result

    def column_to_char(self, column_in_line: int) -> int:
-        i = 0
-        result = 0
-        while i < column_in_line:
-            char = self._line[result]
-            if char == "\t":
-                i = i + constants.tab_width - i % constants.tab_width  # jump the additional columns to complete the tab
-                if i > column_in_line:
-                    break;
-            else:
-                i = i + 1
-            result = result + 1
-
-        return result
+        return self._column_to_char_cache[column_in_line]

    def char_to_column(self, char_in_line: int) -> int:
        if not char_in_line in self._char_to_column_cache:
@@ -72,19 +64,34 @@ class Line:
            return -1
        return self._char_to_column_cache[char_in_line]

-    def _cache_char_to_column(self) -> Dict[int, int]:
-        char_to_column_cache = {}
+    def _cache_char_to_column(self):
+        self._char_to_column_cache = {}
+        self._column_to_char_cache = {}
        result = 0
        i = 0
-        char_to_column_cache[0] = 0
+        self._char_to_column_cache[0] = 0
        while i < len(self._line):
-            if i < len(self._line) and self._line[i] == "\t":
+            self._char_to_column_cache[i] = result
+            if not result in self._column_to_char_cache:
+                self._column_to_char_cache[result] = i
+            current_char = self._line[i]
+            if current_char == "\t":
                result = result + constants.tab_width - result % constants.tab_width
            else:
                result = result + 1
            i = i + 1
-            char_to_column_cache[i] = result
-        return char_to_column_cache
+
+            # ignore: Nonspacing Mark characters are decorations for the previous character.
+            # They do not take up space.
+            # For example the character Combining Diaeresis (U+0308, %CC%88) that adds two
+            # dots above the previous character. It can be used to create an 'ä' from an 'a'+'◌̈'.
+            # In url encoding this looks like: a%CC%88.
+            # todo there are many other character combinations that should be skipped
+            while i < len(self._line) and unicodedata.category(self._line[i]) == "Mn":
+                self._char_to_column_cache[i] = result - 1
+                if not result in self._column_to_char_cache:
+                    self._column_to_char_cache[result - 1] = i
+                i = i + 1

    def includes_byte(self, byte: int) -> bool:
        return self._byte_offset <= byte <= self._byte_end