From f0c3d34995eaa3ecc171e5ea15fca3b1aad24a4f Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Sat, 29 Jan 2022 10:43:23 +0100 Subject: [PATCH] handle character in the category 'nonspacing mark' Those characters are ignored, because they decorate the previous character. --- bigtext.py | 6 ++++- example.log | 5 ++++- highlight_regex.py | 6 +++-- line.py | 49 +++++++++++++++++++++++------------------ main.py | 2 ++ scribble.py | 35 ++++++++++++++++++++++++++++- testline.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 132 insertions(+), 26 deletions(-) diff --git a/bigtext.py b/bigtext.py index 29b8c79..ce06b64 100644 --- a/bigtext.py +++ b/bigtext.py @@ -308,7 +308,7 @@ class InnerBigText(QWidget): column_in_line = self.x_pos_to_column(e.pos().x()) + self._left_offset column_in_line = min(column_in_line, line.length_in_columns()) # x was behind the last column of this line char_in_line = line.column_to_char(column_in_line) - print("%s in line %s lcolumn_in_line=%s" % (char_in_line, line_number, column_in_line)) + # print("%s in line %s lcolumn_in_line=%s" % (char_in_line, line_number, column_in_line)) byte_in_line = line.char_index_to_byte(char_in_line) current_byte = line.byte_offset() + byte_in_line # print("%s + %s = %s" % (line.byte_offset(), char_in_line, current_byte)) @@ -401,6 +401,7 @@ class InnerBigText(QWidget): # draw hightlights first - some characters may overlap to the next line # by drawing the background hightlights first we prevent that the hightlight # draws over a character + start = time.time() y_line_offset = self.char_height; for l in self.lines: highlight_ranges = [] @@ -412,6 +413,9 @@ class InnerBigText(QWidget): self.draw_highlights(highlight_ranges, painter, y_line_offset) y_line_offset = y_line_offset + self.char_height + end = time.time() + # print("highlight duration: %.3f" %((end-start)*1000)) + left_offset = int(-1 * self._left_offset * self.char_width) y_line_offset = self.char_height; for l in self.lines: 
diff --git a/example.log b/example.log index 8499413..29a7837 100644 --- a/example.log +++ b/example.log @@ -1,7 +1,6 @@ 01234 01234567890123456789 012345678901234567890123456789012345678901234567890123456789 -0123456789012345678901234567890123456789 tab indentation: 1 2 3 4 5 - 1-- 2-- 3- 4---5 @@ -15,8 +14,12 @@ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa| ...............................| | ääääääääääääää♥ääääääääääääääää| +nonspacing marks: next line consists of x%CC%88 ẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍẍ| +Combining Double Circumflex Above: “◌᷍◌” (U+1DCD) +x◌᷍◌◌᷍◌x +👍🏿 dark thumbs up (U+1F44D + U+1F3FF - THUMBS UP SIGN + EMOJI MODIFIER FITZPATRICK TYPE-6) 2019-08-07 00:00:10,391 [catalina-exec-40] INFO c.r.c.u.l.PerformancePointcut - Executed HealthCheckController.checkOperativeness in 1 ms successful. [jv3fw7r2.m1u5] 2019-08-07 00:00:16,377 [catalina-exec-56] INFO c.r.c.u.l.PerformancePointcut - Executed HealthCheckController.checkOperativeness in 1 ms successful. [jv3fw7r2.m1u6] 2019-08-07 00:00:40,403 [catalina-exec-26] ERROR c.r.c.u.l.PerformancePointcut - Executed HealthCheckController.checkOperativeness in 1 ms successful. 
[jv3fw7r2.m1ud] diff --git a/highlight_regex.py b/highlight_regex.py index 0b1f5f8..735b26b 100644 --- a/highlight_regex.py +++ b/highlight_regex.py @@ -22,6 +22,8 @@ class HighlightRegex(Highlight): self.regex = self._get_regex() self.hit_background_color = hit_background_color self.line_background_color = line_background_color + self._brush_hit = self.brush(self.hit_background_color) + self._brush_line = self.brush(self.line_background_color) def _get_regex(self): flags = re.IGNORECASE if self.ignore_case else 0 @@ -46,8 +48,8 @@ class HighlightRegex(Highlight): start_column, end_column - start_column, highlight_full_line=True, - brush=self.brush(self.hit_background_color), - brush_full_line=self.brush(self.line_background_color) + brush=self._brush_hit, + brush_full_line=self._brush_line )) return result diff --git a/line.py b/line.py index f2f4e64..4a1e19c 100644 --- a/line.py +++ b/line.py @@ -1,3 +1,4 @@ +import unicodedata from typing import Dict import constants @@ -8,7 +9,8 @@ class Line: self._byte_offset = byte_offset self._byte_end = byte_end self._line = line - self._char_to_column_cache = self._cache_char_to_column() + + self._cache_char_to_column() def byte_offset(self) -> int: return self._byte_offset @@ -23,9 +25,11 @@ class Line: return len(self._line) def length_in_columns(self) -> int: - return self.char_to_column(len(self._line)) + return self.char_to_column(len(self._line) - 1) def char_index_to_byte(self, char_in_line: int) -> int: + # todo this does not work with multibyte characters + # should probably be len(self.prefix(char_in_line-1).encode("utf8")) return len(self.prefix(char_in_line).encode("utf8")) def byte_index_to_char_index(self, byte_index: int) -> int: @@ -52,19 +56,7 @@ class Line: return result def column_to_char(self, column_in_line: int) -> int: - i = 0 - result = 0 - while i < column_in_line: - char = self._line[result] - if char == "\t": - i = i + constants.tab_width - i % constants.tab_width # jump the additional columns 
to complete the tab - if i > column_in_line: - break; - else: - i = i + 1 - result = result + 1 - - return result + return self._column_to_char_cache[column_in_line] def char_to_column(self, char_in_line: int) -> int: if not char_in_line in self._char_to_column_cache: @@ -72,19 +64,34 @@ class Line: return -1 return self._char_to_column_cache[char_in_line] - def _cache_char_to_column(self) -> Dict[int, int]: - char_to_column_cache = {} + def _cache_char_to_column(self): + self._char_to_column_cache = {} + self._column_to_char_cache = {} result = 0 i = 0 - char_to_column_cache[0] = 0 + self._char_to_column_cache[0] = 0 while i < len(self._line): - if i < len(self._line) and self._line[i] == "\t": + self._char_to_column_cache[i] = result + if not result in self._column_to_char_cache: + self._column_to_char_cache[result] = i + current_char = self._line[i] + if current_char == "\t": result = result + constants.tab_width - result % constants.tab_width else: result = result + 1 i = i + 1 - char_to_column_cache[i] = result - return char_to_column_cache + + # ignore: Nonspacing Mark characters are decorations for the previous character. + # They do not take up space. + # For example the character Combining Diaeresis (U+0308, %CC%88) that adds two + # dots above the previous character. It can be used to create an 'ä' from an 'a'+'◌̈'. + # In url encoding this looks like: a%CC%88. 
+ # todo there are many other character combinations that should be skipped + while i < len(self._line) and unicodedata.category(self._line[i]) == "Mn": + self._char_to_column_cache[i] = result - 1 + if not result in self._column_to_char_cache: + self._column_to_char_cache[result - 1] = i + i = i + 1 def includes_byte(self, byte: int) -> bool: return self._byte_offset <= byte <= self._byte_end diff --git a/main.py b/main.py index 0c9323f..dc02690 100644 --- a/main.py +++ b/main.py @@ -64,6 +64,8 @@ if __name__ == "__main__": # PluginRegistry.execute("open_file", "/home/andi/ws/ravenlog/example.log") PluginRegistry.execute("open_file", "/home/andi/ws/performanceDb/data/production/lt_axc_21.4_133.02_maxInstance/lt_axc_21.4_133.02_maxInstance/app/axcng-service_i-0a69bd43d3624a5bc_172_28_60_222_VADPERFO01AA001_2021-09-21_091717/service/service.log"); + PluginRegistry.execute("open_file", + "/home/andi/ws/ravenlog/example.log"); # window.open_file("C:\\Users\\andi\\ws\\some.log") signal.signal(signal.SIGINT, stop_signal) diff --git a/scribble.py b/scribble.py index cdfcd68..2b3959f 100644 --- a/scribble.py +++ b/scribble.py @@ -1,4 +1,37 @@ # extract icons from dll on windows # https://mail.python.org/pipermail/python-win32/2009-April/009078.html +import re +import time -print(min(2290538861, 2342622222)) +from highlighted_range import HighlightedRange +from line import Line + +result = [] + +byte_offset = 123 +text = "2021-09-21 08:40:38,187 [catalina-exec-37] INFO c.r.c.s.l.SearchAdapter - Fetched 0 fields, 1 folders, 0 content requests for 1 documents; took 763ms (including 0ms to fetch field names). [project=axcelerate.lds_5m_review2, session=37891bc0-a67e-4c43-90c0-c20da567f491, user=r-162] [kttsjx2h.48z.9ls] 2021-09-21 08:47:16,529 [BravaJobHandler-12] INFO c.r.b.c.f.i.DoneNotifierRunnable - CORE job for 'complete document request with output format xdl (source n.pdf)' complete. Notifying Brava server about completion for numId=LDS_001:00095883.. 
Extracting and moving XDL and SVG/Thumbs took 31ms (from remote stream from 172.28.60.208:51048,com.recommind.rmi.ssl.SslRmiExporter$CheckedSslRmiClientSocketFactory). Notifying with URL https://localhost:8889/BravaServer/done/xc_E4E99FE32A313D2FBA8D29F846C0EF439E8AE2BE159164D04B2AFD862F714BED_ (context switch time 0ms) [project=axcelerate.lds_5m_review2, session=500380b9-94c5-4740-b30a-81e9f6cd071d, user=r-377] [kttsjx2h.8ys.kai]" + +start = time.time() +line = Line(byte_offset=byte_offset, byte_end=byte_offset + len(text.encode("utf8")), line=text) +regex = re.compile(r"\w", flags=re.IGNORECASE) +match_iter = re.finditer(regex, line.line()) +for match in match_iter: + start_char = match.start(0) + end_char = match.end(0) + + start_column = line.char_to_column(start_char) + end_column = line.char_to_column(end_char) + + result.append(HighlightedRange(start_column, end_column - start_column, highlight_full_line=True, brush=None, + brush_full_line=None)) + +end = time.time() +print("duration: %.3f" % ((end - start) * 1000)) + +result = [] +start = time.time() +for i in range(0, 10000): + result.append(i) + +end = time.time() +print("duration: %.3f" % ((end - start) * 1000)) diff --git a/testline.py b/testline.py index 917ebda..7297b01 100644 --- a/testline.py +++ b/testline.py @@ -1,5 +1,7 @@ import unittest +import unicodedata + from line import Line @@ -27,6 +29,20 @@ class MyTestCase(unittest.TestCase): self.assertEqual(9, line.column_to_char(15)) # tab self.assertEqual(10, line.column_to_char(16)) # g + def test_column_to_char_ignore_nonspacing_mark_charaters(self): + """ + nonspacing mark characters are those little decorations that are applied to the previous character, + e.g. 
x\u0308 to make ẍ + :return: + """ + byte_offset = 123 + text = "x\u0308y\u0308z\u0308" + line = Line(byte_offset=byte_offset, byte_end=byte_offset + len(text.encode("utf8")), line=text) + + self.assertEqual(0, line.column_to_char(0)) # ẍ + self.assertEqual(2, line.column_to_char(1)) # ÿ + self.assertEqual(4, line.column_to_char(2)) # z̈ + def test_char_to_column(self): byte_offset = 123 text = "\tabc\td\tef\tg" # will be rendered as: ....abc.d...ef..g where . represents a whitespace column @@ -43,6 +59,23 @@ class MyTestCase(unittest.TestCase): self.assertEqual(14, line.char_to_column(9)) # tab self.assertEqual(16, line.char_to_column(10)) # g + def test_char_to_column_ignore_nonspacing_mark_charaters(self): + """ + nonspacing mark characters are those little decorations that are applied to the previous character, + e.g. x\u0308 to make ẍ + :return: + """ + byte_offset = 123 + text = "x\u0308y\u0308z\u0308" + print(text) + line = Line(byte_offset=byte_offset, byte_end=byte_offset + len(text.encode("utf8")), line=text) + self.assertEqual(0, line.char_to_column(0)) # ẍ + self.assertEqual(0, line.char_to_column(1)) # ẍ + self.assertEqual(1, line.char_to_column(2)) # ÿ + self.assertEqual(1, line.char_to_column(3)) # ÿ + self.assertEqual(2, line.char_to_column(4)) # z̈ + self.assertEqual(2, line.char_to_column(5)) # z̈ + def test_line_tabs_replaced(self): byte_offset = 123 text = "\ta\tb" # will be rendered as: ....abc where . 
represents a whitespace column @@ -57,5 +90,27 @@ class MyTestCase(unittest.TestCase): line = Line(byte_offset=byte_offset, byte_end=byte_offset + len(text.encode("utf8")), line=text) self.assertEqual(expected, line.line_tabs_replaced()) + def test_byte_index_to_char_index(self): + byte_offset = 123 + text = "x\u0308y\u0308z\u0308\t\u0308a" + print(text) + line = Line(byte_offset=byte_offset, byte_end=byte_offset + len(text.encode("utf8")), line=text) + self.assertEqual(0, line.byte_index_to_char_index(0)) # x + self.assertEqual(0, line.byte_index_to_char_index(1)) # first byte of diacritical mark belonging to x + self.assertEqual(0, line.byte_index_to_char_index(2)) # second byte of diacritical mark belonging to x + + def test_diacritical_marks(self): + text = "̈ẍỏôŏ̮👍🏿" + text = "\U0001F9D9\u200D\u2642\uFE0F - \U0001F44D\U0001F3FF - a\u02c3 - ẍ - y\u0308 - w\u200D\u00A8" + text = unicodedata.normalize("NFD", text) + i = 0 + print("%s" % text) + print("length: %s" % len(text)) + while i < len(text): + c = text[i] + print("%s %s cat: %s" % (c, unicodedata.name(c), unicodedata.category(c))) + i = i + 1 + + if __name__ == '__main__': unittest.main()