115 lines
4.1 KiB
Python
115 lines
4.1 KiB
Python
import unicodedata
|
|
|
|
import constants
|
|
|
|
|
|
class Line:
    """One line of text together with the byte range it was created with.

    Besides plain accessors, the class converts between three positions
    inside the line:

    * character index - index into the ``str``,
    * byte index      - index into the UTF-8 encoding of the line,
    * column          - display column, where a tab advances to the next
      multiple of ``constants.tab_width`` and nonspacing marks (Unicode
      category ``Mn``) occupy no column of their own.

    The character <-> column tables are computed once in ``__init__``.
    """

    def __init__(self, byte_offset: int, byte_end: int, line: str):
        """Store the byte range and text, then build the column caches."""
        self._byte_offset = byte_offset
        self._byte_end = byte_end
        self._line = line

        self._cache_char_to_column()

    def byte_offset(self) -> int:
        """Return the ``byte_offset`` passed to the constructor."""
        return self._byte_offset

    def byte_end(self) -> int:
        """Return the ``byte_end`` passed to the constructor."""
        return self._byte_end

    def line(self) -> str:
        """Return the text of the line."""
        return self._line

    def length_in_characters(self) -> int:
        """Return the number of characters (code points) in the line."""
        return len(self._line)

    def length_in_charaters(self) -> int:
        """Misspelled name of ``length_in_characters``; kept for callers."""
        return len(self._line)

    def length_in_columns(self) -> int:
        """Return the column of the last character in the line.

        For an empty line the lookup is done for index -1, which is not
        cached, so the result is -1.

        NOTE(review): this is the column *index* of the last character, not
        a width (``"abc"`` yields 2) - confirm that callers expect this.
        """
        return self.char_to_column(len(self._line) - 1)

    def char_index_to_byte(self, char_in_line: int) -> int:
        """Return the UTF-8 byte length of the first ``char_in_line`` chars.

        Because the prefix is encoded before it is measured, multibyte
        characters are accounted for.
        """
        return len(self.prefix(char_in_line).encode("utf8"))

    def byte_index_to_char_index(self, byte_index: int) -> int:
        """Return how many whole characters fit in the first ``byte_index``
        bytes of the line's UTF-8 encoding.

        A byte index that falls inside a multibyte character cuts that
        character in half; the incomplete tail is ignored when decoding.
        """
        prefix_bytes = self._line.encode("utf8")[:byte_index]
        prefix_chars = prefix_bytes.decode("utf8", errors="ignore")
        return len(prefix_chars)

    def line_tabs_replaced(self) -> str:
        """Return the line with every tab expanded to spaces.

        Each tab is replaced by as many spaces as are needed to reach the
        next multiple of ``constants.tab_width``, measured in characters of
        the text produced so far.
        """
        text = self._line
        pieces = []
        produced = 0  # length of the text collected in ``pieces`` so far
        start = 0
        while True:
            tab_index = text.find("\t", start)
            if tab_index < 0:
                break
            segment = text[start:tab_index]
            produced += len(segment)
            padding = " " * (
                constants.tab_width - produced % constants.tab_width
            )
            produced += len(padding)
            pieces.append(segment)
            pieces.append(padding)
            start = tab_index + 1

        pieces.append(text[start:])
        return "".join(pieces)

    def column_to_char(self, column_in_line: int) -> int:
        """Return the character index cached for ``column_in_line``.

        Columns without a cached entry map to 0.
        """
        return self._column_to_char_cache.get(column_in_line, 0)

    def char_to_column(self, char_in_line: int) -> int:
        """Return the column of character ``char_in_line``.

        Indices without a cached entry map to -1.
        """
        return self._char_to_column_cache.get(char_in_line, -1)

    def _cache_char_to_column(self) -> None:
        """Build the character -> column and column -> character tables."""
        # Index 0 is always present so that an empty line still maps
        # character 0 to column 0.
        self._char_to_column_cache = {0: 0}
        self._column_to_char_cache = {}
        char_to_column = self._char_to_column_cache
        column_to_char = self._column_to_char_cache

        text = self._line
        length = len(text)
        column = 0
        i = 0
        while i < length:
            char_to_column[i] = column
            # Keep the first character that starts at this column.
            column_to_char.setdefault(column, i)

            if text[i] == "\t":
                # A tab jumps to the next multiple of the tab width.
                column = column + constants.tab_width - column % constants.tab_width
            else:
                column += 1
            i += 1

            # Nonspacing marks (category "Mn") decorate the previous
            # character and take up no space of their own, e.g. COMBINING
            # DIAERESIS (U+0308, UTF-8 CC 88) turns 'a' into an a-umlaut.
            # They share the last column of the character before them.
            # TODO: other zero-width character combinations are not skipped.
            while i < length and unicodedata.category(text[i]) == "Mn":
                char_to_column[i] = column - 1
                # NOTE(review): the guard tests ``column`` while the write
                # targets ``column - 1``, so when the guard holds the mark
                # replaces the column -> char entry of its base character.
                # Kept as in the original; confirm this is intended.
                if column not in column_to_char:
                    column_to_char[column - 1] = i
                i += 1

    def includes_byte(self, byte: int) -> bool:
        """Return True if ``byte_offset <= byte <= byte_end``."""
        return self._byte_offset <= byte <= self._byte_end

    def intersects(self, start_byte: int, end_byte: int) -> bool:
        """Return True if ``start_byte < byte_end`` and
        ``end_byte > byte_offset`` (both comparisons are strict)."""
        return start_byte < self._byte_end and end_byte > self._byte_offset

    def prefix(self, index: int) -> str:
        """Return the characters before ``index``."""
        return self._line[0:index]

    def substr(self, offset: int, length: int) -> str:
        """Return up to ``length`` characters starting at ``offset``."""
        return self._line[offset:offset + length]

    def suffix(self, index: int) -> str:
        """Return the characters from ``index`` to the end of the line."""
        return self._line[index:]

    def __str__(self) -> str:
        """Return the text followed by ``(byte_offset->byte_end)``."""
        return "%s (%d->%d)" % (self._line, self._byte_offset, self._byte_end)