From 61132d242ff1071a66b7f635921b591f690e011d Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Sun, 23 Mar 2025 21:00:53 +0100 Subject: [PATCH] fix: graphemes are not correctly highlighted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Graphemes don't all have the same width, not even when you use a monospace font. For Latin characters it usually works fine to assume the same width. But emojis, Japanese or Chinese characters have different widths. There are even some ultra wide characters like 𒐫 or ﷽. There is also a thing called a 'half-width' character. E.g. the Japanese 'a' can be ア or ア. Fixed by actually computing the width of graphemes and using pixels. --- src/ui/bigtext/bigtext.py | 175 +++++++++++++++++--------- src/ui/bigtext/highlight_regex.py | 11 +- src/ui/bigtext/highlight_selection.py | 41 +++--- src/ui/bigtext/line.py | 17 ++- src/ui/bigtext/logFileModel.py | 4 +- testbed/example.log | 19 +++ 6 files changed, 181 insertions(+), 86 deletions(-) diff --git a/src/ui/bigtext/bigtext.py b/src/ui/bigtext/bigtext.py index e72b85f..88a2ec5 100644 --- a/src/ui/bigtext/bigtext.py +++ b/src/ui/bigtext/bigtext.py @@ -18,6 +18,7 @@ from src.ui.bigtext.highlighted_range import HighlightedRange from src.ui.bigtext.line import Line from src.ui.bigtext.logFileModel import LogFileModel from src.ui.bigtext.newhighlightingdialog import NewHighlightingDialog +from src.ui.bigtext.selectionPos import SelectionPos from src.ui.icon import Icon from src.ui.rangeslider import RangeSlider from src.util.conversion import humanbytes @@ -30,6 +31,7 @@ from src.i18n import _ log = logging.getLogger("bigtext") + class FileObserver(FileSystemEventHandler): def __init__(self, big_text): @@ -145,7 +147,7 @@ class BigText(QWidget): # noinspection PyArgumentList,PyTypeChecker class InnerBigText(QWidget): _byte_offset = 0 - _left_offset = 0 # number of characters the horizontal scrollbar was moved to the right + _left_offset = 0 # number of 
pixels the horizontal scrollbar was moved to the right scroll_lines = 0 longest_line = 0 @@ -320,8 +322,8 @@ class InnerBigText(QWidget): # noinspection PyTypeChecker def mousePressEvent(self, e: QtGui.QMouseEvent) -> None: if e.buttons() == Qt.MouseButton.LeftButton and e.modifiers() == Qt.KeyboardModifier.ShiftModifier: - offset = self.to_byte_offset(e) - self.selection_highlight.set_end_byte(offset) + selection_pos = self.to_byte_offset(e) + self.selection_highlight.set_end_byte(selection_pos) self._update_highlight_selected_text() self.update() return @@ -331,16 +333,16 @@ class InnerBigText(QWidget): line_number = self.y_pos_to_line(e.pos().y()) if line_number == self._last_double_click_line_number and line_number < len(self.lines): line: Line = self.lines[line_number] - self.selection_highlight.set_start(line.byte_offset()) - self.selection_highlight.set_end_byte(line.byte_end()) + self.selection_highlight.set_start(SelectionPos(line.byte_offset(), True, 1)) + self.selection_highlight.set_end_byte(SelectionPos(line.byte_end() - 1, False, 1)) self._update_highlight_selected_text() self.update() return if e.buttons() == Qt.MouseButton.LeftButton and e.modifiers() == Qt.KeyboardModifier.NoModifier: - offset = self.to_byte_offset(e) - self.selection_highlight.set_start(offset) - self.selection_highlight.set_end_byte(offset) + selection_pos = self.to_byte_offset(e) + self.selection_highlight.set_start(selection_pos) + self.selection_highlight.set_end_byte(selection_pos) self._update_highlight_selected_text() self.update() @@ -355,14 +357,17 @@ class InnerBigText(QWidget): self._last_double_click_time = time.time() self._last_double_click_line_number = self.y_pos_to_line(e.pos().y()) - offset = self.to_byte_offset(e) - (_word, start_byte, end_byte) = self.model.read_word_at(offset) + selection_pos = self.to_byte_offset(e) + (word, start_byte, end_byte) = self.model.read_word_at(selection_pos.pos()) if start_byte >= 0 and end_byte >= 0: - 
self.selection_highlight.set_start(start_byte) - self.selection_highlight.set_end_byte(end_byte) + bytes_of_first_char = len(f"{word[0]}".encode("utf8")) + self.selection_highlight.set_start(SelectionPos(start_byte, True, bytes_of_first_char)) + bytes_of_last_char = len(f"{word[-1]}".encode("utf8")) + self.selection_highlight.set_end_byte( + SelectionPos(end_byte - bytes_of_last_char, False, bytes_of_last_char)) else: - self.selection_highlight.set_start(offset) - self.selection_highlight.set_end_byte(offset) + self.selection_highlight.set_start(selection_pos) + self.selection_highlight.set_end_byte(selection_pos) self._update_highlight_selected_text() self.update() @@ -372,10 +377,10 @@ class InnerBigText(QWidget): if e.buttons() != Qt.MouseButton.LeftButton: return - current_byte = self.to_byte_offset(e) + selection_pos = self.to_byte_offset(e) - if self.selection_highlight.end_byte != current_byte: - self.selection_highlight.set_end_byte(current_byte) + if self.selection_highlight.max_byte() != selection_pos.pos(): + self.selection_highlight.set_end_byte(selection_pos) self._update_highlight_selected_text() self.update() # print("-> %s,%s" %(self._selection_start_byte, self._selection_end_byte)) @@ -386,10 +391,16 @@ class InnerBigText(QWidget): self.scroll_by_lines(-1) if line_number > int(self.lines_shown()): self.scroll_by_lines(1) - if column_in_line <= 1: + # if column_in_line <= 1: + # self._left_offset = max(0, self._left_offset - 2) + # self.update() + if e.pos().x() <= 1: self._left_offset = max(0, self._left_offset - 2) self.update() - if column_in_line + 1 >= self.columns_shown(): + # if column_in_line + 1 >= self.columns_shown(): + # self._left_offset = self._left_offset + 2 + # self.update() + if e.pos().x() + 1 >= self.width(): self._left_offset = self._left_offset + 2 self.update() @@ -416,12 +427,13 @@ class InnerBigText(QWidget): case BigScrollBar.ScrollEvent.PageDown: self.scroll_by_lines(int(self.lines_shown()) - 1) - def 
update_longest_line(self, length: int): - width_in_chars = self.width() / self.char_width + def update_longest_line(self, line: Line): + # print("width_in_chars: %d" % width_in_chars) - if self.longest_line < length: - self.longest_line = length - maximum = max(0, length - width_in_chars + 1) + text_width_in_px = line.get_width_in_px(self.font_metric); + if self.longest_line < text_width_in_px: + self.longest_line = text_width_in_px + maximum = max(0, text_width_in_px - self.width() + 1) self.parent.h_scroll_bar.setMaximum(round(maximum)) def y_pos_to_line(self, y: int) -> int: @@ -436,29 +448,63 @@ class InnerBigText(QWidget): def columns_shown(self) -> float: return self.width() / float(self.char_width) - def to_byte_offset(self, e: QMouseEvent) -> int: + def to_byte_offset(self, e: QMouseEvent) -> SelectionPos: + + x = e.pos().x() + self._left_offset line_number = self.y_pos_to_line(e.pos().y()) if line_number < len(self.lines): line: Line = self.lines[line_number] - column_in_line = self.x_pos_to_column(e.pos().x()) + self._left_offset - column_in_line = min(column_in_line, line.length_in_columns()) # x was behind the last column of this line - char_in_line = line.column_to_char(column_in_line) - # print("%s in line %s column_in_line=%s" % (char_in_line, line_number, column_in_line)) - byte_in_line = line.char_index_to_byte(char_in_line) - current_byte = line.byte_offset() + byte_in_line - # print("%s + %s = %s" % (line.byte_offset(), char_in_line, current_byte)) + + text: str = line.line() + text = text.replace("\n", "").replace("\r", "") + + elided_text: str = self.elided_text(text, x) + byte_offset = line.byte_offset() + len(elided_text.encode("utf8")) + + left_x_offset = self.font_metric.horizontalAdvance((elided_text)) + + next_char = "" + pos_is_in_left_half = False + bytes_of_char = 0 + if len(text) > len(elided_text): # has another character + next_char = text[len(elided_text)] + char_width = self.font_metric.horizontalAdvance(next_char) + 
pos_is_in_left_half = x < (left_x_offset + char_width / 2) + bytes_of_char = len(next_char.encode("utf8")) + else: + # print(f"{x} is after last char, elided_text={elided_text}") + # the position is after the last character / behind the end of the line + pass + + # print(f"{x} -> {byte_offset} {'left' if pos_is_in_left_half else 'right'} bytes_of_char={bytes_of_char}") + return SelectionPos(byte_offset, pos_is_in_left_half, bytes_of_char) + + # column_in_line = self.x_pos_to_column(e.pos().x()) + self._left_offset + # column_in_line = min(column_in_line, line.length_in_columns()) # x was behind the last column of this line + # char_in_line = line.column_to_char(column_in_line) + # # print("%s in line %s column_in_line=%s" % (char_in_line, line_number, column_in_line)) + # byte_in_line = line.char_index_to_byte(char_in_line) + # current_byte = line.byte_offset() + byte_in_line + # # print("%s + %s = %s" % (line.byte_offset(), char_in_line, current_byte)) else: current_byte = self.model.byte_count() return current_byte + def elided_text(self, text: str, width: int): + w = width + self.font_metric.horizontalAdvance("…") + elided_text = self.font_metric.elidedText(text + "…", Qt.TextElideMode.ElideRight, w, + Qt.TextFlag.TextWrapAnywhere) + elided_text = elided_text[0:-1] if elided_text.endswith('…') else elided_text # remove the trailing '…' + return elided_text + def _has_selection(self): - return self.selection_highlight.start_byte != self.selection_highlight.end_byte + return self.selection_highlight.min_byte() != self.selection_highlight.max_byte() def copy_selection(self): if self._has_selection(): - start = min(self.selection_highlight.start_byte, self.selection_highlight.end_byte) - end = max(self.selection_highlight.start_byte, self.selection_highlight.end_byte) + start = self.selection_highlight.min_byte() + end = self.selection_highlight.max_byte() bytes_human_readable = humanbytes(end - start) if end - start > (1024 ** 2) * 5: you_sure = QMessageBox( @@ 
-492,8 +538,8 @@ class InnerBigText(QWidget): def _copy_selection_to_file(self): if self._has_selection(): - start = min(self.selection_highlight.start_byte, self.selection_highlight.end_byte) - end = max(self.selection_highlight.start_byte, self.selection_highlight.end_byte) + start = self.selection_highlight.min_byte() + end = self.selection_highlight.max_byte() dialog = QFileDialog(self) (selected_file, _filter) = dialog.getSaveFileName( parent=self, @@ -507,18 +553,21 @@ class InnerBigText(QWidget): PluginRegistry.execute("open_file", selected_file) def _select_all(self): - self.selection_highlight.start_byte = self.model.get_line_start_at(self._range_start) + start_byte = self.model.get_line_start_at(self._range_start) if self._range_end < 0 or self.model.byte_count() <= self._range_end: - self.selection_highlight.end_byte = self.model.byte_count() + end_byte = self.model.byte_count() else: - self.selection_highlight.end_byte = self.model.get_line_start_at(self._range_end) + end_byte = self.model.get_line_start_at(self._range_end) + + self.selection_highlight.set_start(SelectionPos(start_byte, True, 1)) + self.selection_highlight.set_end_byte(SelectionPos(end_byte, False, 1)) self._update_highlight_selected_text() self.update() def _update_highlight_selected_text(self): - start_byte = min(self.selection_highlight.start_byte, self.selection_highlight.end_byte) - end_byte = max(self.selection_highlight.start_byte, self.selection_highlight.end_byte) + start_byte = self.selection_highlight.min_byte() + end_byte = self.selection_highlight.max_byte() self._update_status_bar(start_byte, end_byte) @@ -545,8 +594,19 @@ class InnerBigText(QWidget): def paintEvent(self, event: QPaintEvent) -> None: start_ns = time.process_time_ns() painter = QPainter(self) + # font = "Courier New" if sys.platform == 'win32' or sys.platform == 'cygwin' else "Monospace" - painter.setFont(QFont("Courier New", self.model.settings.getint_session('general', "font_size"))) + # "Courier New" + # 
"JetBrains Mono" + # "Noto Sans Mono" + # "Noto Color Emoji" + # "Andale Mono" + qfont = QFont("Noto Sans Mono", self.model.settings.getint_session('general', "font_size")) + qfont.setStyleHint(QFont.StyleHint.Monospace) + painter.setFont(qfont) + self.font_metric = painter.fontMetrics() + + painter.setPen(QColor(0, 0, 0)) self.update_font_metrics(painter) @@ -567,7 +627,7 @@ class InnerBigText(QWidget): self.parent.range_limit.set_maximum(byte_count) for line in self.lines: - self.update_longest_line(len(line.line())) + self.update_longest_line(line) highlighters = self.model.highlighters() if self.model.get_query_highlight(): @@ -586,23 +646,24 @@ class InnerBigText(QWidget): if optional_highlight_range: highlight_ranges = highlight_ranges + optional_highlight_range - self.draw_highlights(highlight_ranges, painter, y_line_offset) + self.draw_highlights(highlight_ranges, painter, y_line_offset, line) y_line_offset = y_line_offset + self.char_height - left_offset = int(-1 * self._left_offset * self.char_width) + # left_offset = int(-1 * self._left_offset * self.char_width) + left_offset = int(-1 * self._left_offset) y_line_offset = self.char_height for line in self.lines: text = line.line_prepared_for_display() - text = text[self._left_offset:self._left_offset + math.ceil( - self.columns_shown())] # reduce string to the visible section before drawing - painter.drawText(0, y_line_offset, text) + # text = text[self._left_offset:self._left_offset + math.ceil( + # self.columns_shown())] # reduce string to the visible section before drawing + painter.drawText(-self._left_offset, y_line_offset, text) y_line_offset = y_line_offset + self.char_height painter.end() end_ns = time.process_time_ns() # print(f"paint took {(end_ns - start_ns) / 1000000.0}") - def draw_highlights(self, highlights: [HighlightedRange], painter: QPainter, y_line_offset: int): + def draw_highlights(self, highlights: [HighlightedRange], painter: QPainter, y_line_offset: int, line: Line): for highlight 
in highlights: if highlight.is_highlight_full_line(): @@ -612,15 +673,17 @@ class InnerBigText(QWidget): self.highlight_background(painter, rect, highlight.get_brush_full_line()) for highlight in highlights: - left_offset = self._left_offset * self.char_width - x1 = highlight.get_start() * self.char_width - width = highlight.get_width() * self.char_width + + x1 = self.font_metric.horizontalAdvance( + line.prefix_bytes(highlight.get_start()).decode("utf8", errors="replace")) + width = self.font_metric.horizontalAdvance( + line.substr_bytes(highlight.get_start(), highlight.get_width()).decode("utf8", errors="replace")) y1 = y_line_offset - self.char_height + self.char_height / 7 height = self.char_height - left = round(x1 - left_offset) - if x1 + width < left_offset \ - or x1 > left_offset + self.width(): + left = round(x1 - self._left_offset) + if x1 + width < self._left_offset \ + or x1 > self._left_offset + self.width(): # too far left or too far right continue diff --git a/src/ui/bigtext/highlight_regex.py b/src/ui/bigtext/highlight_regex.py index 0d85007..c401fb5 100644 --- a/src/ui/bigtext/highlight_regex.py +++ b/src/ui/bigtext/highlight_regex.py @@ -71,12 +71,15 @@ class HighlightRegex(Highlight): # but we only want to highlight the groups first_group = 1 if len(match.groups()) > 0 else 0 for i in range(first_group, len(match.groups()) + 1): - start_column = line.char_to_column(match.start(i)) - end_column = line.char_to_column(match.end(i)) + start_char_index = match.start(i) + start_byte_index = len(line.prefix(start_char_index).encode("utf8")) + end_char_index = match.end(i) + width = len(line.substr(start_char_index, end_char_index - start_char_index).encode("utf8")) + # print(f"highlight: {start_column}:{end_column} - {match.group(i)}") result.append(HighlightedRange( - start_column, - end_column - start_column, + start_byte_index, + width, highlight_full_line=True, brush=self._brush_hit, brush_full_line=self._brush_line diff --git 
a/src/ui/bigtext/highlight_selection.py b/src/ui/bigtext/highlight_selection.py index 14bdf2f..2132109 100644 --- a/src/ui/bigtext/highlight_selection.py +++ b/src/ui/bigtext/highlight_selection.py @@ -7,50 +7,45 @@ from PySide6.QtCore import Qt from PySide6.QtGui import QBrush, QColor from src.settings.settings import Settings +from src.ui.bigtext.selectionPos import SelectionPos class HighlightSelection(Highlight): - start_byte = 0 - end_byte = 0 + start = SelectionPos(0, False, 0) + end = SelectionPos(0, False, 0) - def set_start(self, start_byte): - self.start_byte = start_byte + def set_start(self, start: SelectionPos): + self.start = start - def set_end_byte(self, end_byte): - self.end_byte = end_byte + def set_end_byte(self, end: SelectionPos): + self.end = end + + def min_byte(self) -> int: + return min(self.start.pos(), self.end.pos()) + + def max_byte(self) -> int: + return max(self.start.pos(), self.end.pos()) def compute_highlight(self, line: Line) -> Optional[List[HighlightedRange]]: - begin = min(self.start_byte, self.end_byte) - end = max(self.start_byte, self.end_byte) + begin = self.min_byte() + end = self.max_byte() if line.intersects(begin, end): + if line.includes_byte(begin): start_byte_in_line = begin - line.byte_offset() else: start_byte_in_line = 0 - start_char = line.byte_index_to_char_index(start_byte_in_line) - if line.includes_byte(end): length_in_bytes = end - line.byte_offset() - start_byte_in_line - end_char = line.byte_index_to_char_index(start_byte_in_line + length_in_bytes) else: # renders the highlighting to the end of the line # this is how selections usually behave length_in_bytes = Settings.max_line_length() - start_byte_in_line - # note: this mixes chars and bytes, but that should not matter, because - # it just means that we render the highlight into the invisible range on the right - end_char = start_char + length_in_bytes - - start_column = line.char_to_column(start_char) - end_column = line.char_to_column(end_char) - if 
end_column >= 0: - length_in_columns = end_column - start_column - else: - length_in_columns = 4096 - - return [HighlightedRange(start_column, length_in_columns, brush=QBrush(QColor(156, 215, 255, 192)), + # print(f"compute_highlight: {line.substr_bytes(begin, end)} begin={begin} end={end} start_byte_in_line={start_byte_in_line} length_in_bytes={length_in_bytes}") + return [HighlightedRange(start_byte_in_line, length_in_bytes, brush=QBrush(QColor(156, 215, 255, 192)), pen=Qt.PenStyle.NoPen)] else: return None diff --git a/src/ui/bigtext/line.py b/src/ui/bigtext/line.py index 8bc7951..93d8983 100644 --- a/src/ui/bigtext/line.py +++ b/src/ui/bigtext/line.py @@ -1,16 +1,22 @@ import unicodedata +from PySide6.QtGui import QFontMetrics + import constants class Line: - def __init__(self, byte_offset: int, byte_end: int, line: str): + def __init__(self, byte_offset: int, byte_end: int, line: str, bytes: str): self._byte_offset = byte_offset self._byte_end = byte_end self._line = line + self._bytes = bytes self._cache_char_to_column() + def get_width_in_px(self, font_metric: QFontMetrics): + return font_metric.horizontalAdvance(self._line) + def byte_offset(self) -> int: return self._byte_offset @@ -131,11 +137,20 @@ class Line: def prefix(self, index: int) -> str: return self._line[0:index] + def prefix_bytes(self, byte_index: int) -> str: + return self._bytes[0:byte_index] + def substr(self, offset: int, length: int) -> str: return self._line[offset:offset+length] + def substr_bytes(self, byte_offset: int, byte_length: int) -> str: + return self._bytes[byte_offset:byte_offset + byte_length] + def suffix(self, index: int) -> str: return self._line[index:] + def suffix_bytes(self, byte_index: int) -> str: + return self._bytes[byte_index:] + def __str__(self): return "%s (%d->%d)" % (self._line, self._byte_offset, self._byte_end) \ No newline at end of file diff --git a/src/ui/bigtext/logFileModel.py b/src/ui/bigtext/logFileModel.py index b12e26e..8a8154a 100644 --- 
a/src/ui/bigtext/logFileModel.py +++ b/src/ui/bigtext/logFileModel.py @@ -136,7 +136,7 @@ class LogFileModel: return re.match(r"\w", char) is not None def prune_cache(self, range_start: int, range_end: int): - print(f"cache size: {len(self._line_cache.keys())}") + # print(f"cache size: {len(self._line_cache.keys())}") for key in list(self._line_cache.keys()): line = self._line_cache[key] if range_start > line.byte_end() or line.byte_offset() > range_end: @@ -171,7 +171,7 @@ class LogFileModel: new_offset = f.tell() if 0 <= range_end < new_offset: break - line = Line(offset, new_offset, line_bytes.decode("utf8", errors="ignore")) + line = Line(offset, new_offset, line_bytes.decode("utf8", errors="ignore"), line_bytes) if previous_line_is_complete: # only cache lines when we know they are complete self._line_cache[offset] = line offset = new_offset diff --git a/testbed/example.log b/testbed/example.log index c4b2882..59e0196 100644 --- a/testbed/example.log +++ b/testbed/example.log @@ -22,6 +22,25 @@ x◌᷍◌◌᷍◌x Control characters:  ------------------------------ +wide and half width characters: +12345678 +123456789 +アンドレアス +アンドレアス アンドレアス アンドレアス アンドレアス アンドレアス アンドレアス アンドレアス +アンドレアス +canadian aboriginal: +ᑭᓇᑐᐃᓐᓇᑦᑎᐊᖅᒥᒃ +simplified chinese: +任何人不得使为奴隶或奴 +Thai: +ทุกคนมีสิทธิที่จะได้ +Nastaliq Urdu (rl): +چونکہ یہ تمام +Braille: +⠑⠧⠑⠗⠽⠕⠝⠑ +Arabic (rl): +ولما كانت +------------------------------ 👍🏿 dark thumbs up (U+1F44D + U+1F3FF - THUMBS UP SIGN + EMOJI MODIFIER FITZPATRICK TYPE-6) 
ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------ä---------ä----------< 2019-08-07 00:00:10,391 [catalina-exec-40] INFO c.r.c.u.l.PerformancePointcut - Executed HealthCheckController.checkOperativeness in 1 ms successful. [jv3fw7r2.m1u5]