Graphemes don't all have the same width, not even when you use a monospace font. For Latin characters it usually works fine to assume the same width, but emojis and Japanese or Chinese characters have different widths. There are even some ultra-wide characters like 𒐫 or ﷽. There are also so-called 'half-width' characters, e.g. the Japanese 'a' can be written as ア (full-width) or ｱ (half-width). Fixed by actually computing the width of graphemes and using pixels.
156 lines
5.7 KiB
Python
156 lines
5.7 KiB
Python
import unicodedata
|
|
|
|
from PySide6.QtGui import QFontMetrics
|
|
|
|
import constants
|
|
|
|
|
|
class Line:
    """One line of text plus its byte range and character/column lookup tables.

    Three index spaces are used by the methods of this class:

    * *character index* - index into the ``str`` held in ``self._line``;
    * *byte index* - index into the UTF-8 encoding of that string;
    * *column* - display cell index, where a tab advances to the next multiple
      of ``constants.tab_width`` and nonspacing marks (Unicode category ``Mn``)
      take no extra cell.
    """

    # Table for ``str.translate`` used to make control characters visible.
    # C0 controls map to their glyph in the Unicode block "Control Pictures"
    # (U+2400 + code point); tab, line feed and carriage return are left alone.
    # DEL and the C1 controls (U+007F-U+009F) map to the DELETE picture U+2421.
    # NOTE(review): vertical tab (U+000B) is also shown as U+2421 rather than
    # as its own picture U+240B - confirm that this is intended.
    _CONTROL_PICTURES = {
        **{
            code: 0x2400 + code
            for code in range(0x20)
            if code not in (0x09, 0x0A, 0x0B, 0x0D)
        },
        **{code: 0x2421 for code in (0x0B, *range(0x7F, 0xA0))},
    }

    def __init__(self, byte_offset: int, byte_end: int, line: str, bytes: str):
        """Store the line and precompute the character/column lookup tables.

        Args:
            byte_offset: Byte position at which the line starts.
            byte_end: Byte position at which the line ends.
            line: The decoded text of the line.
            bytes: The raw form of the line; it is only ever sliced by the
                ``*_bytes`` methods. NOTE(review): annotated ``str`` although
                the name suggests a ``bytes`` object - confirm with callers.
        """
        self._byte_offset = byte_offset
        self._byte_end = byte_end
        self._line = line
        self._bytes = bytes

        self._cache_char_to_column()

    def get_width_in_px(self, font_metric: "QFontMetrics"):
        """Return ``font_metric.horizontalAdvance`` of the unmodified line text."""
        return font_metric.horizontalAdvance(self._line)

    def byte_offset(self) -> int:
        """Return the byte position at which the line starts."""
        return self._byte_offset

    def byte_end(self) -> int:
        """Return the byte position at which the line ends."""
        return self._byte_end

    def line(self) -> str:
        """Return the text of the line exactly as it was passed in."""
        return self._line

    def length_in_charaters(self) -> int:
        """Return the number of characters (code points) in the line.

        The misspelled name is kept for existing callers; new code should use
        ``length_in_characters``.
        """
        return len(self._line)

    def length_in_characters(self) -> int:
        """Return the number of characters (code points) in the line."""
        return len(self._line)

    def length_in_columns(self) -> int:
        """Return the column of the last character of the line.

        For a line that ends with a line break this equals the number of
        columns in front of the line break. An empty line yields ``-1``
        because index ``-1`` is not present in the lookup table.
        """
        return self.char_to_column(len(self._line) - 1)

    def char_index_to_byte(self, char_in_line: int) -> int:
        """Return the byte index, relative to the line start, of a character.

        The first ``char_in_line`` characters are encoded as UTF-8 and counted,
        so multi-byte characters in front of the index are accounted for.
        """
        return len(self.prefix(char_in_line).encode("utf8"))

    def byte_index_to_char_index(self, byte_index: int) -> int:
        """Return the character index that corresponds to a byte index.

        The UTF-8 encoding of the line is cut at ``byte_index`` and decoded
        again. If the cut lands inside a multi-byte character, the incomplete
        trailing bytes are ignored, so the result is the index of that
        character.
        """
        prefix_bytes = self._line.encode("utf8")[:byte_index]
        prefix_chars = prefix_bytes.decode("utf8", errors="ignore")
        return len(prefix_chars)

    def line_prepared_for_display(self) -> str:
        """Return the line with tabs expanded and control characters made visible."""
        line = self._line_tabs_replaced()
        return self._replace_control_chars_with_pictures(line)

    def _replace_control_chars_with_pictures(self, line: str) -> str:
        """Replace control characters in ``line`` by visible symbols.

        Tab, line feed and carriage return are kept. The mapping is defined by
        ``_CONTROL_PICTURES``; every replacement is one character for one
        character, so the length of the string does not change.
        """
        return line.translate(self._CONTROL_PICTURES)

    def _line_tabs_replaced(self) -> str:
        """Return the line with every tab expanded to spaces.

        Each tab is replaced by as many spaces as are needed to reach the next
        multiple of ``constants.tab_width``, measured by the length of the text
        produced so far.
        """
        line = self._line
        if "\t" not in line:
            return line

        tab_width = constants.tab_width
        segments = line.split("\t")
        pieces = []
        width = 0  # length of the text collected in ``pieces``
        for segment in segments[:-1]:
            width += len(segment)
            padding = tab_width - width % tab_width
            pieces.append(segment)
            pieces.append(" " * padding)
            width += padding
        # The text after the last tab is copied without padding.
        pieces.append(segments[-1])
        return "".join(pieces)

    def column_to_char(self, column_in_line: int) -> int:
        """Return the index of the character shown at ``column_in_line``.

        Columns that are not the start of a character (inside an expanded tab
        or past the end of the line) fall back to the nearest known column to
        the left. ``0`` is returned when no known column is found.
        """
        cache = self._column_to_char_cache
        while column_in_line not in cache and column_in_line > 0:
            column_in_line -= 1

        return cache.get(column_in_line, 0)

    def char_to_column(self, char_in_line: int) -> int:
        """Return the column of the character at ``char_in_line``.

        Returns ``-1`` for an index that is not in the lookup table.
        """
        return self._char_to_column_cache.get(char_in_line, -1)

    def _cache_char_to_column(self):
        """Build the character-to-column and column-to-character tables.

        When several characters share a column, the column-to-character table
        keeps the first of them.
        """
        line = self._line
        length = len(line)
        # Index 0 is always known, even for an empty line.
        char_to_column = {0: 0}
        column_to_char = {}
        self._char_to_column_cache = char_to_column
        self._column_to_char_cache = column_to_char

        column = 0
        index = 0
        while index < length:
            char_to_column[index] = column
            column_to_char.setdefault(column, index)
            if line[index] == "\t":
                # A tab jumps to the next multiple of the tab width.
                tab_width = constants.tab_width
                column += tab_width - column % tab_width
            else:
                column += 1
            index += 1

            # Nonspacing Mark characters are decorations for the previous
            # character and do not take up space. For example the character
            # Combining Diaeresis (U+0308, %CC%88) adds two dots above the
            # previous character, which turns an 'a' into an 'ä'.
            # TODO: there are many other character combinations that should
            # be skipped as well.
            while index < length and unicodedata.category(line[index]) == "Mn":
                char_to_column[index] = column - 1
                column_to_char.setdefault(column - 1, index)
                index += 1

    def includes_byte(self, byte: int) -> bool:
        """Return whether ``byte`` lies in the byte range, both ends inclusive."""
        return self._byte_offset <= byte <= self._byte_end

    def intersects(self, start_byte: int, end_byte: int):
        """Return whether ``start_byte..end_byte`` overlaps the line's byte range.

        Ranges that only touch at a boundary do not count as overlapping.
        NOTE(review): ``includes_byte`` treats ``byte_end`` as inclusive while
        this check treats it as exclusive - confirm that both are intended.
        """
        return start_byte < self._byte_end and end_byte > self._byte_offset

    def prefix(self, index: int) -> str:
        """Return the characters in front of character index ``index``."""
        return self._line[0:index]

    def prefix_bytes(self, byte_index: int) -> str:
        """Return the raw line data in front of ``byte_index``."""
        return self._bytes[0:byte_index]

    def substr(self, offset: int, length: int) -> str:
        """Return ``length`` characters starting at character index ``offset``."""
        return self._line[offset:offset + length]

    def substr_bytes(self, byte_offset: int, byte_length: int) -> str:
        """Return ``byte_length`` items of the raw line data from ``byte_offset``."""
        return self._bytes[byte_offset:byte_offset + byte_length]

    def suffix(self, index: int) -> str:
        """Return the characters from character index ``index`` to the end."""
        return self._line[index:]

    def suffix_bytes(self, byte_index: int) -> str:
        """Return the raw line data from ``byte_index`` to the end."""
        return self._bytes[byte_index:]

    def __str__(self):
        """Return the line text followed by its byte range."""
        return "%s (%d->%d)" % (self._line, self._byte_offset, self._byte_end)