import unicodedata

import constants


class Line:
    """One line of text plus the byte range it was constructed with.

    The class converts between three ways of addressing a position inside
    the line:

    * character index -- index into the Python ``str``;
    * byte index      -- index into the UTF-8 encoding of the line;
    * column          -- display column, where a tab advances to the next
      multiple of ``constants.tab_width`` and a Unicode non-spacing mark
      (category ``Mn``) shares the column of the character before it.

    The character <-> column tables are built once in the constructor.

    NOTE(review): ``byte_offset`` / ``byte_end`` are presumably offsets of
    this line inside the enclosing document's UTF-8 bytes -- confirm with
    the code that creates ``Line`` objects.
    """

    def __init__(self, byte_offset: int, byte_end: int, line: str):
        """Store the byte range and text, then build the column caches.

        Args:
            byte_offset: First byte of the range covered by this line.
            byte_end: Last byte of the range covered by this line.
            line: The text of the line.
        """
        self._byte_offset = byte_offset
        self._byte_end = byte_end
        self._line = line
        self._cache_char_to_column()

    def byte_offset(self) -> int:
        """Return the ``byte_offset`` passed to the constructor."""
        return self._byte_offset

    def byte_end(self) -> int:
        """Return the ``byte_end`` passed to the constructor."""
        return self._byte_end

    def line(self) -> str:
        """Return the text of the line."""
        return self._line

    def length_in_characters(self) -> int:
        """Return the number of characters (code points) in the line."""
        return len(self._line)

    def length_in_charaters(self) -> int:
        """Misspelled alias of :meth:`length_in_characters`.

        Kept so that existing callers keep working.
        """
        return self.length_in_characters()

    def length_in_columns(self) -> int:
        """Return the column of the last character of the line.

        Returns -1 for an empty line, because index -1 is not a cached
        character index.

        NOTE(review): this is the column *of* the last character, not the
        column after it, so it equals the visible width only when the last
        character is not displayed (e.g. a trailing newline) -- confirm
        with callers before changing.
        """
        return self.char_to_column(len(self._line) - 1)

    def char_index_to_byte(self, char_in_line: int) -> int:
        """Return the UTF-8 byte index at which character ``char_in_line`` starts.

        Args:
            char_in_line: Character index into the line.

        Returns:
            The number of bytes the preceding characters occupy in UTF-8.
        """
        # Encoding the prefix (rather than multiplying by a fixed width)
        # is what makes this correct for multi-byte characters.
        return len(self.prefix(char_in_line).encode("utf8"))

    def byte_index_to_char_index(self, byte_index: int) -> int:
        """Return the character index that corresponds to ``byte_index``.

        Args:
            byte_index: Byte index into the UTF-8 encoding of the line.

        Returns:
            The number of complete characters encoded in the first
            ``byte_index`` bytes.
        """
        prefix_bytes = self._line.encode("utf8")[:byte_index]
        # errors="ignore" drops a trailing partial sequence, so a byte index
        # in the middle of a multi-byte character maps to that character.
        return len(prefix_bytes.decode("utf8", errors="ignore"))

    def line_tabs_replaced(self) -> str:
        """Return the line with every tab expanded to spaces.

        Each tab is replaced by as many spaces as are needed to reach the
        next multiple of ``constants.tab_width``, measured in characters of
        the expanded text.
        """
        *tab_terminated, tail = self._line.split("\t")
        pieces = []
        width = 0  # length of the expanded text built so far
        for segment in tab_terminated:
            width += len(segment)
            padding = constants.tab_width - width % constants.tab_width
            pieces.append(segment)
            pieces.append(" " * padding)
            width += padding
        pieces.append(tail)
        return "".join(pieces)

    def column_to_char(self, column_in_line: int) -> int:
        """Return the index of the character that starts at ``column_in_line``.

        Returns 0 when no character starts at that column (for example a
        column in the middle of a tab, or past the end of the line).
        """
        return self._column_to_char_cache.get(column_in_line, 0)

    def char_to_column(self, char_in_line: int) -> int:
        """Return the display column of character ``char_in_line``.

        Returns -1 when ``char_in_line`` is not a cached character index.
        """
        return self._char_to_column_cache.get(char_in_line, -1)

    def _cache_char_to_column(self) -> None:
        """Build the character -> column and column -> character tables."""
        self._char_to_column_cache = {}
        self._column_to_char_cache = {}
        text = self._line
        length = len(text)
        column = 0
        i = 0
        # Character 0 always maps to column 0, even for an empty line.
        self._char_to_column_cache[0] = 0
        while i < length:
            self._char_to_column_cache[i] = column
            # The first character to claim a column keeps it.
            self._column_to_char_cache.setdefault(column, i)

            if text[i] == "\t":
                # A tab advances to the next tab stop.
                column = column + constants.tab_width - column % constants.tab_width
            else:
                column = column + 1
            i = i + 1

            # Non-spacing marks (category "Mn") decorate the previous
            # character and take up no space of their own. For example
            # Combining Diaeresis (U+0308, %CC%88) adds two dots above the
            # previous character and turns an 'a' into an 'ä'; URL-encoded
            # the pair looks like a%CC%88.
            # TODO: there are many other character combinations that
            # should be skipped.
            while i < length and unicodedata.category(text[i]) == "Mn":
                self._char_to_column_cache[i] = column - 1
                # Same first-writer-wins rule as above, applied to the
                # column the mark actually shares. The previous code tested
                # `column` but wrote `column - 1`, so the base character's
                # entry was always overwritten by its combining mark.
                self._column_to_char_cache.setdefault(column - 1, i)
                i = i + 1

    def includes_byte(self, byte: int) -> bool:
        """Return True if ``byte`` lies in the line's byte range.

        Both ends of the range are inclusive.
        """
        return self._byte_offset <= byte <= self._byte_end

    def intersects(self, start_byte: int, end_byte: int) -> bool:
        """Return True if ``start_byte``..``end_byte`` overlaps the line's range.

        The comparison is strict: ranges that only touch at an end point do
        not intersect.
        """
        return start_byte < self._byte_end and end_byte > self._byte_offset

    def prefix(self, index: int) -> str:
        """Return the characters before ``index``."""
        return self._line[0:index]

    def substr(self, offset: int, length: int) -> str:
        """Return up to ``length`` characters starting at ``offset``."""
        return self._line[offset:offset + length]

    def suffix(self, index: int) -> str:
        """Return the characters from ``index`` to the end of the line."""
        return self._line[index:]

    def __str__(self) -> str:
        """Return the text followed by its byte range, e.g. ``abc (0->3)``."""
        return "%s (%d->%d)" % (self._line, self._byte_offset, self._byte_end)