fix: graphemes are not correctly highlighted

Graphemes don't all have the same width, not even when you use a monospace font.
For Latin characters it usually works fine to assume the same width. But emojis and
Japanese or Chinese characters have different widths. There are even some
ultra-wide characters like 𒐫 or ﷽. There are also so-called
'half-width' characters. E.g. the Japanese 'a' can be ア (full-width) or ｱ (half-width).

Fixed by actually computing the width of graphemes and using pixels instead of character columns.
This commit is contained in:
2025-03-23 21:00:53 +01:00
parent 21b2da1e69
commit 61132d242f
6 changed files with 181 additions and 86 deletions

View File

@@ -18,6 +18,7 @@ from src.ui.bigtext.highlighted_range import HighlightedRange
from src.ui.bigtext.line import Line
from src.ui.bigtext.logFileModel import LogFileModel
from src.ui.bigtext.newhighlightingdialog import NewHighlightingDialog
from src.ui.bigtext.selectionPos import SelectionPos
from src.ui.icon import Icon
from src.ui.rangeslider import RangeSlider
from src.util.conversion import humanbytes
@@ -30,6 +31,7 @@ from src.i18n import _
log = logging.getLogger("bigtext")
class FileObserver(FileSystemEventHandler):
def __init__(self, big_text):
@@ -145,7 +147,7 @@ class BigText(QWidget):
# noinspection PyArgumentList,PyTypeChecker
class InnerBigText(QWidget):
_byte_offset = 0
_left_offset = 0 # number of characters the horizontal scrollbar was moved to the right
_left_offset = 0 # number of pixels the horizontal scrollbar was moved to the right
scroll_lines = 0
longest_line = 0
@@ -320,8 +322,8 @@ class InnerBigText(QWidget):
# noinspection PyTypeChecker
def mousePressEvent(self, e: QtGui.QMouseEvent) -> None:
if e.buttons() == Qt.MouseButton.LeftButton and e.modifiers() == Qt.KeyboardModifier.ShiftModifier:
offset = self.to_byte_offset(e)
self.selection_highlight.set_end_byte(offset)
selection_pos = self.to_byte_offset(e)
self.selection_highlight.set_end_byte(selection_pos)
self._update_highlight_selected_text()
self.update()
return
@@ -331,16 +333,16 @@ class InnerBigText(QWidget):
line_number = self.y_pos_to_line(e.pos().y())
if line_number == self._last_double_click_line_number and line_number < len(self.lines):
line: Line = self.lines[line_number]
self.selection_highlight.set_start(line.byte_offset())
self.selection_highlight.set_end_byte(line.byte_end())
self.selection_highlight.set_start(SelectionPos(line.byte_offset(), True, 1))
self.selection_highlight.set_end_byte(SelectionPos(line.byte_end() - 1, False, 1))
self._update_highlight_selected_text()
self.update()
return
if e.buttons() == Qt.MouseButton.LeftButton and e.modifiers() == Qt.KeyboardModifier.NoModifier:
offset = self.to_byte_offset(e)
self.selection_highlight.set_start(offset)
self.selection_highlight.set_end_byte(offset)
selection_pos = self.to_byte_offset(e)
self.selection_highlight.set_start(selection_pos)
self.selection_highlight.set_end_byte(selection_pos)
self._update_highlight_selected_text()
self.update()
@@ -355,14 +357,17 @@ class InnerBigText(QWidget):
self._last_double_click_time = time.time()
self._last_double_click_line_number = self.y_pos_to_line(e.pos().y())
offset = self.to_byte_offset(e)
(_word, start_byte, end_byte) = self.model.read_word_at(offset)
selection_pos = self.to_byte_offset(e)
(word, start_byte, end_byte) = self.model.read_word_at(selection_pos.pos())
if start_byte >= 0 and end_byte >= 0:
self.selection_highlight.set_start(start_byte)
self.selection_highlight.set_end_byte(end_byte)
bytes_of_first_char = len(f"{word[0]}".encode("utf8"))
self.selection_highlight.set_start(SelectionPos(start_byte, True, bytes_of_first_char))
bytes_of_last_char = len(f"{word[-1]}".encode("utf8"))
self.selection_highlight.set_end_byte(
SelectionPos(end_byte - bytes_of_last_char, False, bytes_of_last_char))
else:
self.selection_highlight.set_start(offset)
self.selection_highlight.set_end_byte(offset)
self.selection_highlight.set_start(selection_pos)
self.selection_highlight.set_end_byte(selection_pos)
self._update_highlight_selected_text()
self.update()
@@ -372,10 +377,10 @@ class InnerBigText(QWidget):
if e.buttons() != Qt.MouseButton.LeftButton:
return
current_byte = self.to_byte_offset(e)
selection_pos = self.to_byte_offset(e)
if self.selection_highlight.end_byte != current_byte:
self.selection_highlight.set_end_byte(current_byte)
if self.selection_highlight.max_byte() != selection_pos.pos():
self.selection_highlight.set_end_byte(selection_pos)
self._update_highlight_selected_text()
self.update()
# print("-> %s,%s" %(self._selection_start_byte, self._selection_end_byte))
@@ -386,10 +391,16 @@ class InnerBigText(QWidget):
self.scroll_by_lines(-1)
if line_number > int(self.lines_shown()):
self.scroll_by_lines(1)
if column_in_line <= 1:
# if column_in_line <= 1:
# self._left_offset = max(0, self._left_offset - 2)
# self.update()
if e.pos().x() <= 1:
self._left_offset = max(0, self._left_offset - 2)
self.update()
if column_in_line + 1 >= self.columns_shown():
# if column_in_line + 1 >= self.columns_shown():
# self._left_offset = self._left_offset + 2
# self.update()
if e.pos().x() + 1 >= self.width():
self._left_offset = self._left_offset + 2
self.update()
@@ -416,12 +427,13 @@ class InnerBigText(QWidget):
case BigScrollBar.ScrollEvent.PageDown:
self.scroll_by_lines(int(self.lines_shown()) - 1)
def update_longest_line(self, length: int):
width_in_chars = self.width() / self.char_width
def update_longest_line(self, line: Line):
# print("width_in_chars: %d" % width_in_chars)
if self.longest_line < length:
self.longest_line = length
maximum = max(0, length - width_in_chars + 1)
text_width_in_px = line.get_width_in_px(self.font_metric);
if self.longest_line < text_width_in_px:
self.longest_line = text_width_in_px
maximum = max(0, text_width_in_px - self.width() + 1)
self.parent.h_scroll_bar.setMaximum(round(maximum))
def y_pos_to_line(self, y: int) -> int:
@@ -436,29 +448,63 @@ class InnerBigText(QWidget):
def columns_shown(self) -> float:
    """Return the widget width divided by the width of one character cell.

    The result is fractional: a partially visible column counts as a
    fraction of a column.
    """
    widget_width = self.width()
    # presumably char_width is the pixel width of a single monospace glyph
    return widget_width / float(self.char_width)
def to_byte_offset(self, e: QMouseEvent) -> int:
def to_byte_offset(self, e: QMouseEvent) -> SelectionPos:
x = e.pos().x() + self._left_offset
line_number = self.y_pos_to_line(e.pos().y())
if line_number < len(self.lines):
line: Line = self.lines[line_number]
column_in_line = self.x_pos_to_column(e.pos().x()) + self._left_offset
column_in_line = min(column_in_line, line.length_in_columns()) # x was behind the last column of this line
char_in_line = line.column_to_char(column_in_line)
# print("%s in line %s column_in_line=%s" % (char_in_line, line_number, column_in_line))
byte_in_line = line.char_index_to_byte(char_in_line)
current_byte = line.byte_offset() + byte_in_line
# print("%s + %s = %s" % (line.byte_offset(), char_in_line, current_byte))
text: str = line.line()
text = text.replace("\n", "").replace("\r", "")
elided_text: str = self.elided_text(text, x)
byte_offset = line.byte_offset() + len(elided_text.encode("utf8"))
left_x_offset = self.font_metric.horizontalAdvance((elided_text))
next_char = ""
pos_is_in_left_half = False
bytes_of_char = 0
if len(text) > len(elided_text): # has another character
next_char = text[len(elided_text)]
char_width = self.font_metric.horizontalAdvance(next_char)
pos_is_in_left_half = x < (left_x_offset + char_width / 2)
bytes_of_char = len(next_char.encode("utf8"))
else:
# print(f"{x} is after last char, elided_text={elided_text}")
# the position is after the last character / behind the end of the line
pass
# print(f"{x} -> {byte_offset} {'left' if pos_is_in_left_half else 'right'} bytes_of_char={bytes_of_char}")
return SelectionPos(byte_offset, pos_is_in_left_half, bytes_of_char)
# column_in_line = self.x_pos_to_column(e.pos().x()) + self._left_offset
# column_in_line = min(column_in_line, line.length_in_columns()) # x was behind the last column of this line
# char_in_line = line.column_to_char(column_in_line)
# # print("%s in line %s column_in_line=%s" % (char_in_line, line_number, column_in_line))
# byte_in_line = line.char_index_to_byte(char_in_line)
# current_byte = line.byte_offset() + byte_in_line
# # print("%s + %s = %s" % (line.byte_offset(), char_in_line, current_byte))
else:
current_byte = self.model.byte_count()
return current_byte
def elided_text(self, text: str, width: int):
w = width + self.font_metric.horizontalAdvance("")
elided_text = self.font_metric.elidedText(text + "", Qt.TextElideMode.ElideRight, w,
Qt.TextFlag.TextWrapAnywhere)
elided_text = elided_text[0:-1] if elided_text.endswith('') else elided_text # remove the trailing '…'
return elided_text
def _has_selection(self):
return self.selection_highlight.start_byte != self.selection_highlight.end_byte
return self.selection_highlight.min_byte() != self.selection_highlight.max_byte()
def copy_selection(self):
if self._has_selection():
start = min(self.selection_highlight.start_byte, self.selection_highlight.end_byte)
end = max(self.selection_highlight.start_byte, self.selection_highlight.end_byte)
start = self.selection_highlight.min_byte()
end = self.selection_highlight.max_byte()
bytes_human_readable = humanbytes(end - start)
if end - start > (1024 ** 2) * 5:
you_sure = QMessageBox(
@@ -492,8 +538,8 @@ class InnerBigText(QWidget):
def _copy_selection_to_file(self):
if self._has_selection():
start = min(self.selection_highlight.start_byte, self.selection_highlight.end_byte)
end = max(self.selection_highlight.start_byte, self.selection_highlight.end_byte)
start = self.selection_highlight.min_byte()
end = self.selection_highlight.max_byte()
dialog = QFileDialog(self)
(selected_file, _filter) = dialog.getSaveFileName(
parent=self,
@@ -507,18 +553,21 @@ class InnerBigText(QWidget):
PluginRegistry.execute("open_file", selected_file)
def _select_all(self):
self.selection_highlight.start_byte = self.model.get_line_start_at(self._range_start)
start_byte = self.model.get_line_start_at(self._range_start)
if self._range_end < 0 or self.model.byte_count() <= self._range_end:
self.selection_highlight.end_byte = self.model.byte_count()
end_byte = self.model.byte_count()
else:
self.selection_highlight.end_byte = self.model.get_line_start_at(self._range_end)
end_byte = self.model.get_line_start_at(self._range_end)
self.selection_highlight.set_start(SelectionPos(start_byte, True, 1))
self.selection_highlight.set_end_byte(SelectionPos(end_byte, False, 1))
self._update_highlight_selected_text()
self.update()
def _update_highlight_selected_text(self):
start_byte = min(self.selection_highlight.start_byte, self.selection_highlight.end_byte)
end_byte = max(self.selection_highlight.start_byte, self.selection_highlight.end_byte)
start_byte = self.selection_highlight.min_byte()
end_byte = self.selection_highlight.max_byte()
self._update_status_bar(start_byte, end_byte)
@@ -545,8 +594,19 @@ class InnerBigText(QWidget):
def paintEvent(self, event: QPaintEvent) -> None:
start_ns = time.process_time_ns()
painter = QPainter(self)
# font = "Courier New" if sys.platform == 'win32' or sys.platform == 'cygwin' else "Monospace"
painter.setFont(QFont("Courier New", self.model.settings.getint_session('general', "font_size")))
# "Courier New"
# "JetBrains Mono"
# "Noto Sans Mono"
# "Noto Color Emoji"
# "Andale Mono"
qfont = QFont("Noto Sans Mono", self.model.settings.getint_session('general', "font_size"))
qfont.setStyleHint(QFont.StyleHint.Monospace)
painter.setFont(qfont)
self.font_metric = painter.fontMetrics()
painter.setPen(QColor(0, 0, 0))
self.update_font_metrics(painter)
@@ -567,7 +627,7 @@ class InnerBigText(QWidget):
self.parent.range_limit.set_maximum(byte_count)
for line in self.lines:
self.update_longest_line(len(line.line()))
self.update_longest_line(line)
highlighters = self.model.highlighters()
if self.model.get_query_highlight():
@@ -586,23 +646,24 @@ class InnerBigText(QWidget):
if optional_highlight_range:
highlight_ranges = highlight_ranges + optional_highlight_range
self.draw_highlights(highlight_ranges, painter, y_line_offset)
self.draw_highlights(highlight_ranges, painter, y_line_offset, line)
y_line_offset = y_line_offset + self.char_height
left_offset = int(-1 * self._left_offset * self.char_width)
# left_offset = int(-1 * self._left_offset * self.char_width)
left_offset = int(-1 * self._left_offset)
y_line_offset = self.char_height
for line in self.lines:
text = line.line_prepared_for_display()
text = text[self._left_offset:self._left_offset + math.ceil(
self.columns_shown())] # reduce string to the visible section before drawing
painter.drawText(0, y_line_offset, text)
# text = text[self._left_offset:self._left_offset + math.ceil(
# self.columns_shown())] # reduce string to the visible section before drawing
painter.drawText(-self._left_offset, y_line_offset, text)
y_line_offset = y_line_offset + self.char_height
painter.end()
end_ns = time.process_time_ns()
# print(f"paint took {(end_ns - start_ns) / 1000000.0}")
def draw_highlights(self, highlights: [HighlightedRange], painter: QPainter, y_line_offset: int):
def draw_highlights(self, highlights: [HighlightedRange], painter: QPainter, y_line_offset: int, line: Line):
for highlight in highlights:
if highlight.is_highlight_full_line():
@@ -612,15 +673,17 @@ class InnerBigText(QWidget):
self.highlight_background(painter, rect, highlight.get_brush_full_line())
for highlight in highlights:
left_offset = self._left_offset * self.char_width
x1 = highlight.get_start() * self.char_width
width = highlight.get_width() * self.char_width
x1 = self.font_metric.horizontalAdvance(
line.prefix_bytes(highlight.get_start()).decode("utf8", errors="replace"))
width = self.font_metric.horizontalAdvance(
line.substr_bytes(highlight.get_start(), highlight.get_width()).decode("utf8", errors="replace"))
y1 = y_line_offset - self.char_height + self.char_height / 7
height = self.char_height
left = round(x1 - left_offset)
if x1 + width < left_offset \
or x1 > left_offset + self.width():
left = round(x1 - self._left_offset)
if x1 + width < self._left_offset \
or x1 > self._left_offset + self.width():
# too far left or too far right
continue

View File

@@ -71,12 +71,15 @@ class HighlightRegex(Highlight):
# but we only want to highlight the groups
first_group = 1 if len(match.groups()) > 0 else 0
for i in range(first_group, len(match.groups()) + 1):
start_column = line.char_to_column(match.start(i))
end_column = line.char_to_column(match.end(i))
start_char_index = match.start(i)
start_byte_index = len(line.prefix(start_char_index).encode("utf8"))
end_char_index = match.end(i)
width = len(line.substr(start_char_index, end_char_index - start_char_index).encode("utf8"))
# print(f"highlight: {start_column}:{end_column} - {match.group(i)}")
result.append(HighlightedRange(
start_column,
end_column - start_column,
start_byte_index,
width,
highlight_full_line=True,
brush=self._brush_hit,
brush_full_line=self._brush_line

View File

@@ -7,50 +7,45 @@ from PySide6.QtCore import Qt
from PySide6.QtGui import QBrush, QColor
from src.settings.settings import Settings
from src.ui.bigtext.selectionPos import SelectionPos
class HighlightSelection(Highlight):
start_byte = 0
end_byte = 0
start = SelectionPos(0, False, 0)
end = SelectionPos(0, False, 0)
def set_start(self, start_byte):
self.start_byte = start_byte
def set_start(self, start: SelectionPos):
self.start = start
def set_end_byte(self, end_byte):
self.end_byte = end_byte
def set_end_byte(self, end: SelectionPos):
self.end = end
def min_byte(self) -> int:
    """Return the smaller of the selection's start and end positions.

    The selection may have been dragged backwards, so ``start`` is not
    necessarily the lower bound.
    """
    first, second = self.start.pos(), self.end.pos()
    return first if first <= second else second
def max_byte(self) -> int:
    """Return the larger of the selection's start and end positions.

    The selection may have been dragged backwards, so ``end`` is not
    necessarily the upper bound.
    """
    first, second = self.start.pos(), self.end.pos()
    return second if second > first else first
def compute_highlight(self, line: Line) -> Optional[List[HighlightedRange]]:
begin = min(self.start_byte, self.end_byte)
end = max(self.start_byte, self.end_byte)
begin = self.min_byte()
end = self.max_byte()
if line.intersects(begin, end):
if line.includes_byte(begin):
start_byte_in_line = begin - line.byte_offset()
else:
start_byte_in_line = 0
start_char = line.byte_index_to_char_index(start_byte_in_line)
if line.includes_byte(end):
length_in_bytes = end - line.byte_offset() - start_byte_in_line
end_char = line.byte_index_to_char_index(start_byte_in_line + length_in_bytes)
else:
# renders the highlighting to the end of the line
# this is how selections usually behave
length_in_bytes = Settings.max_line_length() - start_byte_in_line
# note: this mixes chars and bytes, but that should not matter, because
# it just means that we render the highlight into the invisible range on the right
end_char = start_char + length_in_bytes
start_column = line.char_to_column(start_char)
end_column = line.char_to_column(end_char)
if end_column >= 0:
length_in_columns = end_column - start_column
else:
length_in_columns = 4096
return [HighlightedRange(start_column, length_in_columns, brush=QBrush(QColor(156, 215, 255, 192)),
# print(f"compute_highlight: {line.substr_bytes(begin, end)} begin={begin} end={end} start_byte_in_line={start_byte_in_line} length_in_bytes={length_in_bytes}")
return [HighlightedRange(start_byte_in_line, length_in_bytes, brush=QBrush(QColor(156, 215, 255, 192)),
pen=Qt.PenStyle.NoPen)]
else:
return None

View File

@@ -1,16 +1,22 @@
import unicodedata
from PySide6.QtGui import QFontMetrics
import constants
class Line:
def __init__(self, byte_offset: int, byte_end: int, line: str):
def __init__(self, byte_offset: int, byte_end: int, line: str, bytes: str):
self._byte_offset = byte_offset
self._byte_end = byte_end
self._line = line
self._bytes = bytes
self._cache_char_to_column()
def get_width_in_px(self, font_metric: QFontMetrics):
    """Return the horizontal advance of the whole decoded line text.

    The width is measured by ``font_metric``, so it reflects the real glyph
    widths of the font rather than a fixed per-character width.
    """
    text = self._line
    return font_metric.horizontalAdvance(text)
def byte_offset(self) -> int:
    """Return the byte offset that was passed to the constructor for this line."""
    offset = self._byte_offset
    return offset
@@ -131,11 +137,20 @@ class Line:
def prefix(self, index: int) -> str:
    """Return the decoded line text up to, but excluding, character ``index``."""
    return self._line[:index]
def prefix_bytes(self, byte_index: int) -> bytes:
    """Return the first ``byte_index`` bytes of the line's raw, undecoded content.

    The result is a ``bytes`` slice, not text. The cut may fall inside a
    multi-byte UTF-8 sequence, so callers should decode it leniently
    (e.g. ``errors="replace"``) rather than strictly.

    :param byte_index: number of bytes to take, counted from the line start.
    :return: the raw bytes ``[0, byte_index)`` of the line.
    """
    return self._bytes[:byte_index]
def substr(self, offset: int, length: int) -> str:
    """Return ``length`` characters of the decoded line text starting at ``offset``.

    Fewer characters are returned when the requested range runs past the
    end of the line.
    """
    stop = offset + length
    return self._line[offset:stop]
def substr_bytes(self, byte_offset: int, byte_length: int) -> bytes:
    """Return ``byte_length`` raw bytes of the line starting at ``byte_offset``.

    The result is a ``bytes`` slice, not text. Either edge may fall inside a
    multi-byte UTF-8 sequence, so callers should decode it leniently
    (e.g. ``errors="replace"``) rather than strictly.

    :param byte_offset: index of the first byte, relative to the line start.
    :param byte_length: number of bytes to return; truncated at the line end.
    :return: the raw bytes ``[byte_offset, byte_offset + byte_length)``.
    """
    return self._bytes[byte_offset:byte_offset + byte_length]
def suffix(self, index: int) -> str:
    """Return the decoded line text from character ``index`` to the end."""
    tail = self._line[index:]
    return tail
def suffix_bytes(self, byte_index: int) -> bytes:
    """Return the line's raw, undecoded bytes from ``byte_index`` to the end.

    The result is a ``bytes`` slice, not text. The cut may fall inside a
    multi-byte UTF-8 sequence, so callers should decode it leniently
    (e.g. ``errors="replace"``) rather than strictly.

    :param byte_index: index of the first byte to keep, relative to the line start.
    :return: the raw bytes from ``byte_index`` to the end of the line.
    """
    return self._bytes[byte_index:]
def __str__(self):
    """Render the line as ``<text> (<byte_offset>-><byte_end>)`` for debugging."""
    parts = (self._line, self._byte_offset, self._byte_end)
    return "%s (%d->%d)" % parts

View File

@@ -136,7 +136,7 @@ class LogFileModel:
return re.match(r"\w", char) is not None
def prune_cache(self, range_start: int, range_end: int):
print(f"cache size: {len(self._line_cache.keys())}")
# print(f"cache size: {len(self._line_cache.keys())}")
for key in list(self._line_cache.keys()):
line = self._line_cache[key]
if range_start > line.byte_end() or line.byte_offset() > range_end:
@@ -171,7 +171,7 @@ class LogFileModel:
new_offset = f.tell()
if 0 <= range_end < new_offset:
break
line = Line(offset, new_offset, line_bytes.decode("utf8", errors="ignore"))
line = Line(offset, new_offset, line_bytes.decode("utf8", errors="ignore"), line_bytes)
if previous_line_is_complete: # only cache lines when we know they are complete
self._line_cache[offset] = line
offset = new_offset