Handle characters in the category 'nonspacing mark'
These characters are ignored because they decorate the previous character and do not take up space.
This commit is contained in:
49
line.py
49
line.py
@@ -1,3 +1,4 @@
|
||||
import unicodedata
|
||||
from typing import Dict
|
||||
|
||||
import constants
|
||||
@@ -8,7 +9,8 @@ class Line:
|
||||
self._byte_offset = byte_offset
|
||||
self._byte_end = byte_end
|
||||
self._line = line
|
||||
self._char_to_column_cache = self._cache_char_to_column()
|
||||
|
||||
self._cache_char_to_column()
|
||||
|
||||
def byte_offset(self) -> int:
|
||||
return self._byte_offset
|
||||
@@ -23,9 +25,11 @@ class Line:
|
||||
return len(self._line)
|
||||
|
||||
def length_in_columns(self) -> int:
|
||||
return self.char_to_column(len(self._line))
|
||||
return self.char_to_column(len(self._line) - 1)
|
||||
|
||||
def char_index_to_byte(self, char_in_line: int) -> int:
|
||||
# TODO: this does not work with multibyte characters
|
||||
# should probably be len(self.prefix(char_in_line-1).encode("utf8"))
|
||||
return len(self.prefix(char_in_line).encode("utf8"))
|
||||
|
||||
def byte_index_to_char_index(self, byte_index: int) -> int:
|
||||
@@ -52,19 +56,7 @@ class Line:
|
||||
return result
|
||||
|
||||
def column_to_char(self, column_in_line: int) -> int:
|
||||
i = 0
|
||||
result = 0
|
||||
while i < column_in_line:
|
||||
char = self._line[result]
|
||||
if char == "\t":
|
||||
i = i + constants.tab_width - i % constants.tab_width # jump the additional columns to complete the tab
|
||||
if i > column_in_line:
|
||||
break;
|
||||
else:
|
||||
i = i + 1
|
||||
result = result + 1
|
||||
|
||||
return result
|
||||
return self._column_to_char_cache[column_in_line]
|
||||
|
||||
def char_to_column(self, char_in_line: int) -> int:
|
||||
if not char_in_line in self._char_to_column_cache:
|
||||
@@ -72,19 +64,34 @@ class Line:
|
||||
return -1
|
||||
return self._char_to_column_cache[char_in_line]
|
||||
|
||||
def _cache_char_to_column(self) -> Dict[int, int]:
|
||||
char_to_column_cache = {}
|
||||
def _cache_char_to_column(self):
|
||||
self._char_to_column_cache = {}
|
||||
self._column_to_char_cache = {}
|
||||
result = 0
|
||||
i = 0
|
||||
char_to_column_cache[0] = 0
|
||||
self._char_to_column_cache[0] = 0
|
||||
while i < len(self._line):
|
||||
if i < len(self._line) and self._line[i] == "\t":
|
||||
self._char_to_column_cache[i] = result
|
||||
if not result in self._column_to_char_cache:
|
||||
self._column_to_char_cache[result] = i
|
||||
current_char = self._line[i]
|
||||
if current_char == "\t":
|
||||
result = result + constants.tab_width - result % constants.tab_width
|
||||
else:
|
||||
result = result + 1
|
||||
i = i + 1
|
||||
char_to_column_cache[i] = result
|
||||
return char_to_column_cache
|
||||
|
||||
# ignore: Nonspacing Mark characters are decorations for the previous character.
|
||||
# They do not take up space.
|
||||
# For example the character Combining Diaeresis (U+0308, %CC%88) that adds two
|
||||
# dots above the previous character. It can be used to create an 'ä' from an 'a'+'◌̈'.
|
||||
# In URL encoding this looks like: a%CC%88.
|
||||
# TODO: there are many other character combinations that should be skipped
|
||||
while i < len(self._line) and unicodedata.category(self._line[i]) == "Mn":
|
||||
self._char_to_column_cache[i] = result - 1
|
||||
if not result in self._column_to_char_cache:
|
||||
self._column_to_char_cache[result - 1] = i
|
||||
i = i + 1
|
||||
|
||||
def includes_byte(self, byte: int) -> bool:
|
||||
return self._byte_offset <= byte <= self._byte_end
|
||||
|
||||
Reference in New Issue
Block a user