krowlog/src/ui/bigtext/line.py

import unicodedata

from PySide6.QtGui import QFontMetrics

import constants


class Line:
    """A single decoded line of a file together with its byte range.

    Three coordinate systems are used by the methods of this class:

    * *char index*: index into the decoded string (``self._line``),
    * *byte index*: index into the UTF-8 encoding of the line,
    * *column*: display column, where a tab advances to the next multiple of
      ``constants.tab_width`` and nonspacing marks (Unicode category ``Mn``)
      share the column of the character before them.
    """

    def __init__(self, byte_offset: int, byte_end: int, line: str, bytes: str):
        """Store the line and precompute the char <-> column lookup tables.

        :param byte_offset: byte position at which the line starts
        :param byte_end: byte position at which the line ends
        :param line: the decoded text of the line
        :param bytes: backing data sliced by the ``*_bytes`` accessors.
            NOTE(review): annotated as ``str``, but the name suggests the raw
            encoded line -- confirm against the callers.
        """
        self._byte_offset = byte_offset
        self._byte_end = byte_end
        self._line = line
        self._bytes = bytes

        self._cache_char_to_column()

    # The annotation is quoted so that it is not evaluated when the class is defined.
    def get_width_in_px(self, font_metric: "QFontMetrics"):
        """Return ``font_metric.horizontalAdvance`` of the raw (unprepared) line text."""
        return font_metric.horizontalAdvance(self._line)

    def byte_offset(self) -> int:
        """Return the byte position at which the line starts."""
        return self._byte_offset

    def byte_end(self) -> int:
        """Return the byte position at which the line ends."""
        return self._byte_end

    def line(self) -> str:
        """Return the decoded text of the line."""
        return self._line

    def length_in_charaters(self) -> int:
        """Return the number of characters (code points) in the line."""
        return len(self._line)

    def length_in_columns(self) -> int:
        """Return the display column of the last character of the line.

        An empty line has no last character; 0 is returned in that case
        instead of the -1 that ``char_to_column`` reports for unknown indices.
        """
        return max(self.char_to_column(len(self._line) - 1), 0)

    def char_index_to_byte(self, char_in_line: int) -> int:
        """Return the number of UTF-8 bytes that precede the given char index.

        The prefix is encoded before it is measured, so characters that need
        more than one byte are counted with their full encoded length.
        """
        return len(self.prefix(char_in_line).encode("utf8"))

    def byte_index_to_char_index(self, byte_index: int) -> int:
        """Return the char index that corresponds to a UTF-8 byte index.

        A byte index that points into the middle of a multi-byte character
        yields the index of that character, because the incomplete trailing
        sequence is dropped while decoding.
        """
        prefix_bytes = self._line.encode("utf8")[:byte_index]
        prefix_chars = prefix_bytes.decode("utf8", errors="ignore")
        return len(prefix_chars)

    def line_prepared_for_display(self) -> str:
        """Return the line with tabs expanded and control characters made visible."""
        line = self._line_tabs_replaced()
        line = self._replace_control_chars_with_pictures(line)
        return line

    @staticmethod
    def _picture_for(c: str) -> str:
        """Return the visible stand-in for a single character.

        Everything that is not a control character (category ``Cc``) is
        returned unchanged, and so are tab, line feed and carriage return.
        """
        if c in ("\t", "\n", "\r") or unicodedata.category(c) != "Cc":
            return c
        ordinal_value = ord(c)
        if ordinal_value < 32:
            # C0 controls have a dedicated symbol in the Unicode block
            # "Control Pictures", which starts at U+2400. This includes the
            # vertical tab (U+000B -> U+240B).
            return chr(0x2400 + ordinal_value)
        # DEL (U+007F) and the C1 controls are shown as the symbol for delete.
        return "\u2421"

    def _replace_control_chars_with_pictures(self, line: str) -> str:
        """Return ``line`` with control characters replaced by visible symbols.

        Every character is replaced by exactly one character, so the length of
        the string does not change.
        """
        return "".join(self._picture_for(c) for c in line)

    def _line_tabs_replaced(self) -> str:
        """Return the line with every tab expanded to spaces up to the next tab stop.

        Tab stops are the multiples of ``constants.tab_width``, measured in
        characters of the already expanded text.
        """
        if "\t" not in self._line:
            return self._line

        *tab_terminated, tail = self._line.split("\t")
        parts = []
        expanded_length = 0
        for segment in tab_terminated:
            expanded_length += len(segment)
            # a tab always adds at least one space, even exactly on a tab stop
            padding = constants.tab_width - expanded_length % constants.tab_width
            parts.append(segment)
            parts.append(" " * padding)
            expanded_length += padding
        parts.append(tail)

        return "".join(parts)

    def column_to_char(self, column_in_line: int) -> int:
        """Return the char index of the character shown at the given column.

        Columns that no character starts at (inside a tab or behind the end of
        the line) are resolved to the closest known column to the left.
        0 is returned if there is none.
        """
        while column_in_line > 0 and column_in_line not in self._column_to_char_cache:
            column_in_line = column_in_line - 1

        return self._column_to_char_cache.get(column_in_line, 0)

    def char_to_column(self, char_in_line: int) -> int:
        """Return the column of the given char index, or -1 for an unknown index."""
        return self._char_to_column_cache.get(char_in_line, -1)

    def _cache_char_to_column(self):
        """Build the lookup tables used by ``char_to_column`` and ``column_to_char``.

        When several characters map to the same column, the column maps back
        to the first of them.
        """
        line = self._line
        length = len(line)
        # index 0 is always known, even for an empty line
        char_to_column = {0: 0}
        column_to_char = {}
        column = 0
        i = 0
        while i < length:
            char_to_column[i] = column
            column_to_char.setdefault(column, i)
            if line[i] == "\t":
                # advance to the next tab stop
                column = column + constants.tab_width - column % constants.tab_width
            else:
                column = column + 1
            i = i + 1

            # Nonspacing Mark characters are decorations for the previous character.
            # They do not take up space.
            # For example the character Combining Diaeresis (U+0308, %CC%88) adds two
            # dots above the previous character. It can be used to create an 'ä' from
            # an 'a' followed by U+0308. In url encoding this looks like: a%CC%88.
            # todo there are many other character combinations that should be skipped
            while i < length and unicodedata.category(line[i]) == "Mn":
                char_to_column[i] = column - 1
                column_to_char.setdefault(column - 1, i)
                i = i + 1

        self._char_to_column_cache = char_to_column
        self._column_to_char_cache = column_to_char

    def includes_byte(self, byte: int) -> bool:
        """Return True if ``byte`` lies between byte_offset and byte_end (both inclusive)."""
        return self._byte_offset <= byte <= self._byte_end

    def intersects(self, start_byte: int, end_byte: int) -> bool:
        """Return True if the given byte range overlaps the range of this line.

        Ranges that only touch at a boundary do not count as overlapping.
        """
        return start_byte < self._byte_end and end_byte > self._byte_offset

    def prefix(self, index: int) -> str:
        """Return the characters of the line before char index ``index``."""
        return self._line[0:index]

    def prefix_bytes(self, byte_index: int) -> str:
        """Return the part of the stored ``bytes`` value before ``byte_index``."""
        return self._bytes[0:byte_index]

    def substr(self, offset: int, length: int) -> str:
        """Return up to ``length`` characters of the line starting at char index ``offset``."""
        return self._line[offset:offset + length]

    def substr_bytes(self, byte_offset: int, byte_length: int) -> str:
        """Return up to ``byte_length`` items of the stored ``bytes`` value from ``byte_offset``."""
        return self._bytes[byte_offset:byte_offset + byte_length]

    def suffix(self, index: int) -> str:
        """Return the characters of the line from char index ``index`` on."""
        return self._line[index:]

    def suffix_bytes(self, byte_index: int) -> str:
        """Return the part of the stored ``bytes`` value from ``byte_index`` on."""
        return self._bytes[byte_index:]

    def __str__(self):
        return "%s (%d->%d)" % (self._line, self._byte_offset, self._byte_end)