Files
krowlog/src/ui/bigtext/line.py
Andreas Huber 61132d242f fix: graphemes are not correctly highlighted
Graphemes don't all have the same width, not even when you use a monospace font.
For Latin characters it usually works fine to assume the same width. But emojis and
Japanese or Chinese characters have different widths. There are even some
ultra-wide characters like 𒐫 or ﷽. There are also so-called
'half-width' characters, e.g. the Japanese 'a' can be ア (full-width) or ｱ (half-width).

Fixed by actually computing the width of graphemes and using pixels.
2025-03-24 17:49:27 +01:00

156 lines
5.7 KiB
Python

import unicodedata
from PySide6.QtGui import QFontMetrics
import constants
class Line:
    """One line of text plus helpers to convert between index spaces.

    The class keeps the decoded text, the raw data it was created from and a
    byte range (presumably the position of the line inside the underlying
    file -- confirm against the caller). It offers conversions between
    character indices, UTF-8 byte indices and display columns, where a tab
    advances to the next multiple of ``constants.tab_width`` and nonspacing
    marks (Unicode category ``Mn``) occupy no column of their own.
    """

    def __init__(self, byte_offset: int, byte_end: int, line: str, bytes: str):
        """Store the line data and precompute the char <-> column lookup tables.

        :param byte_offset: first byte of the line (see ``includes_byte``).
        :param byte_end: last byte of the line (inclusive in ``includes_byte``).
        :param line: the decoded text of the line.
        :param bytes: the raw data of the line, sliced by the ``*_bytes`` helpers.
            The parameter name shadows the builtin but is kept for callers.
        """
        self._byte_offset = byte_offset
        self._byte_end = byte_end
        self._line = line
        self._bytes = bytes
        self._cache_char_to_column()

    # The annotation is quoted so that it is not evaluated when the class is
    # defined; type checkers still resolve it through the module import.
    def get_width_in_px(self, font_metric: "QFontMetrics"):
        """Return ``font_metric.horizontalAdvance`` of the unmodified line text."""
        return font_metric.horizontalAdvance(self._line)

    def byte_offset(self) -> int:
        """Return the byte offset passed to the constructor."""
        return self._byte_offset

    def byte_end(self) -> int:
        """Return the byte end passed to the constructor."""
        return self._byte_end

    def line(self) -> str:
        """Return the unmodified line text."""
        return self._line

    def length_in_characters(self) -> int:
        """Return the number of characters (code points) in the line."""
        return len(self._line)

    def length_in_charaters(self) -> int:
        """Misspelled variant of ``length_in_characters``, kept for existing callers."""
        return self.length_in_characters()

    def length_in_columns(self) -> int:
        """Return the display column of the last character of the line.

        An empty line yields 0. Without the guard the "not cached" sentinel
        ``-1`` of ``char_to_column`` would leak out as a negative length.
        """
        if not self._line:
            return 0
        return self.char_to_column(len(self._line) - 1)

    def char_index_to_byte(self, char_in_line: int) -> int:
        """Return the UTF-8 encoded size of the first ``char_in_line`` characters.

        NOTE(review): an earlier todo claimed this does not work with multibyte
        characters. The prefix is encoded as UTF-8 before its length is taken,
        so multibyte characters are counted with their encoded size; confirm
        whether the todo referred to something else (e.g. an off-by-one).
        """
        return len(self.prefix(char_in_line).encode("utf8"))

    def byte_index_to_char_index(self, byte_index: int) -> int:
        """Return how many complete characters fit into the first ``byte_index`` bytes.

        The line is encoded as UTF-8 and cut at ``byte_index``; a character that
        is cut in the middle is dropped by ``errors="ignore"``.
        """
        encoded_prefix = self._line.encode("utf8")[:byte_index]
        return len(encoded_prefix.decode("utf8", errors="ignore"))

    def line_prepared_for_display(self) -> str:
        """Return the line with tabs expanded and control characters made visible."""
        expanded = self._line_tabs_replaced()
        return self._replace_control_chars_with_pictures(expanded)

    def _replace_control_chars_with_pictures(self, line: str) -> str:
        """Replace control characters in ``line`` by visible substitute symbols.

        Every character is mapped independently in a single pass, so the
        result has the same number of characters as the input.
        """
        return "".join(self._control_picture(char) for char in line)

    @staticmethod
    def _control_picture(char: str) -> str:
        """Return the visible replacement for a single character.

        Carriage return, line feed, tab and everything that is not in the
        Unicode category ``Cc`` are returned unchanged.
        """
        if char in "\r\n\t" or unicodedata.category(char) != "Cc":
            return char
        code = ord(char)
        # NOTE(review): vertical tab (11) is excluded here exactly as in the
        # previous implementation and therefore ends up as the delete symbol;
        # confirm whether that is intended.
        if code < 32 and code != 11:
            return chr(0x2400 + code)  # see Unicode Block "Control Pictures"
        return "\u2421"  # symbol for delete (␡)

    def _line_tabs_replaced(self) -> str:
        """Return the line with each tab replaced by spaces up to the next tab stop.

        Tab stops are multiples of ``constants.tab_width``, measured in
        characters of the already expanded text.
        """
        if "\t" not in self._line:
            return self._line

        tab_width = constants.tab_width
        first, *remaining = self._line.split("\t")
        parts = [first]
        expanded_length = len(first)
        for segment in remaining:
            # Each boundary between two segments is one tab of the original line.
            padding = tab_width - expanded_length % tab_width
            parts.append(" " * padding)
            parts.append(segment)
            expanded_length += padding + len(segment)
        return "".join(parts)

    def column_to_char(self, column_in_line: int) -> int:
        """Return the index of the character displayed at ``column_in_line``.

        If no character starts at that column (e.g. inside a tab or behind the
        end of the line) the nearest known column to the left is used. Returns
        0 if no column is known at all.
        """
        cache = self._column_to_char_cache
        while column_in_line > 0 and column_in_line not in cache:
            column_in_line -= 1
        return cache.get(column_in_line, 0)

    def char_to_column(self, char_in_line: int) -> int:
        """Return the display column of the character, or -1 if the index is unknown."""
        return self._char_to_column_cache.get(char_in_line, -1)

    def _cache_char_to_column(self):
        """Build the lookup tables used by ``char_to_column`` and ``column_to_char``."""
        line = self._line
        length = len(line)
        # Index 0 is always known, even for an empty line.
        char_to_column = {0: 0}
        column_to_char = {}
        column = 0
        i = 0
        while i < length:
            char_to_column[i] = column
            # Only the first character that starts at a column is remembered.
            column_to_char.setdefault(column, i)
            if line[i] == "\t":
                # A tab jumps to the next multiple of the tab width.
                column += constants.tab_width - column % constants.tab_width
            else:
                column += 1
            i += 1

            # Nonspacing Mark characters are decorations for the previous character.
            # They do not take up space.
            # For example the character Combining Diaeresis (U+0308, %CC%88) adds two
            # dots above the previous character. It can be used to create an 'ä' from
            # an 'a'+'◌̈'. In url encoding this looks like: a%CC%88.
            # todo there are many other character combinations that should be skipped
            while i < length and unicodedata.category(line[i]) == "Mn":
                char_to_column[i] = column - 1
                column_to_char.setdefault(column - 1, i)
                i += 1

        self._char_to_column_cache = char_to_column
        self._column_to_char_cache = column_to_char

    def includes_byte(self, byte: int) -> bool:
        """Return True if ``byte`` lies between byte offset and byte end (both inclusive)."""
        return self._byte_offset <= byte <= self._byte_end

    def intersects(self, start_byte: int, end_byte: int):
        """Return True if the range overlaps the line's byte range.

        Both comparisons are strict, so ranges that merely touch do not intersect.
        """
        return start_byte < self._byte_end and end_byte > self._byte_offset

    def prefix(self, index: int) -> str:
        """Return the characters before ``index``."""
        return self._line[:index]

    def prefix_bytes(self, byte_index: int) -> str:
        """Return the raw data before ``byte_index``."""
        return self._bytes[:byte_index]

    def substr(self, offset: int, length: int) -> str:
        """Return ``length`` characters starting at ``offset``."""
        return self._line[offset:offset + length]

    def substr_bytes(self, byte_offset: int, byte_length: int) -> str:
        """Return ``byte_length`` items of the raw data starting at ``byte_offset``."""
        return self._bytes[byte_offset:byte_offset + byte_length]

    def suffix(self, index: int) -> str:
        """Return the characters from ``index`` to the end of the line."""
        return self._line[index:]

    def suffix_bytes(self, byte_index: int) -> str:
        """Return the raw data from ``byte_index`` to the end."""
        return self._bytes[byte_index:]

    def __str__(self):
        """Return the line text followed by its byte range."""
        return "%s (%d->%d)" % (self._line, self._byte_offset, self._byte_end)