Skip to content

markdown_node

trestle.core.markdown.markdown_node ¤

A markdown node.

logger ¤

Classes¤

MarkdownNode ¤

Markdown is read into a tree of nodes.

Source code in trestle/core/markdown/markdown_node.py
class MarkdownNode:
    """
    A node of the tree that a markdown document is read into.

    Each node is keyed by the header line that opens its section; the synthetic top node
    created by `build_tree_from_markdown` uses the key 'root'.

    Attributes:
        key: header line of the section, or 'root'
        content: SectionContent gathered for the section, including what its subsections contribute
        starting_line: index in the parsed lines at which the raw text of the node begins
        subnodes: direct child nodes in the order they appear in the document
    """

    def __init__(self, key: str, content: SectionContent, starting_line: int):
        """
        Initialize markdown node.

        Args:
            key: header line of the section this node represents
            content: content collected for the section
            starting_line: index of the first line of the node's raw text
        """
        self.subnodes: List[MarkdownNode] = []
        self.key = key
        self.content = content
        self.starting_line = starting_line

    @classmethod
    def build_tree_from_markdown(cls, lines: List[str], governed_header: Optional[str] = None) -> MarkdownNode:
        """
        Construct a tree out of the given markdown.

        Args:
            lines: the markdown document, one item per line
            governed_header: title of the header whose section is scanned for governed document entries

        Returns: The root node of the tree, keyed 'root'
        """
        # _build_tree only relies on helper methods, so an uninitialized instance is enough to drive it
        ob = cls.__new__(cls)
        start_level = ob._get_max_header_lvl(lines)
        ob, _ = ob._build_tree(lines, 'root', 0, start_level, governed_header)
        return ob

    def get_all_headers_for_level(self, level: int) -> Iterable[str]:
        """Return all headers per specified level of hierarchy."""
        return iter([
            header for header in self.content.subnodes_keys if self._get_header_level_if_valid(header) == level
        ])

    def get_node_for_key(self, key: str, strict_matching: bool = True) -> Optional[MarkdownNode]:
        """
        Return the first node for the given key, substring matching is supported.

        Args:
            key: header to look for
            strict_matching: Force exact match of key with header vs. simple substring match

        Returns: The first matching node, or None when no header recorded in this node matches.
            A warning is logged when more than one header matches.
        """
        if not strict_matching:
            if not any(key in el for el in self.content.subnodes_keys):
                return None
            elif len(as_filtered_list(self.content.subnodes_keys, lambda el: key in el)) > 1:
                logger.warning(f'Multiple nodes for {key} were found, only the first one will be returned.')
        else:
            if key not in self.content.subnodes_keys:
                return None
            elif len(as_filtered_list(self.content.subnodes_keys, lambda el: el == key)) > 1:
                logger.warning(f'Multiple nodes for {key} were found, only the first one will be returned.')

        return self._rec_traverse(self, key, strict_matching)

    def get_all_nodes_for_keys(
        self,
        keys: List[str],
        strict_matching: bool = True,
        stop_recurse_on_first_match: bool = False
    ) -> List[MarkdownNode]:
        """
        Return all nodes for the given keys, substring matching is supported.

        Args:
            keys: List of strings for the headers being collected
            strict_matching: Force exact match of key with header vs. simple substring match
            stop_recurse_on_first_match: Return first match of any of the keys and don't search subnodes

        Returns: List of found markdown nodes
        """
        # cheap pre-check against the flattened header list before walking the tree
        if not strict_matching:
            if not any(key in el for el in self.content.subnodes_keys for key in keys):
                return []
        elif not set(keys).intersection(self.content.subnodes_keys):
            return []

        return self._rec_traverse_all(self, keys, strict_matching, stop_recurse_on_first_match)

    def get_all_headers_for_key(self, key: str, strict_matching: bool = True) -> Iterable[str]:
        """Return all headers recorded in this node that equal the key, or contain it if matching is not strict."""
        if strict_matching:
            return iter([header for header in self.content.subnodes_keys if key == header])
        return iter([header for header in self.content.subnodes_keys if key in header])

    def get_node_header_lvl(self) -> Optional[int]:
        """Return current node header level, or None when the key is not a valid header."""
        return self._get_header_level_if_valid(self.key)

    def change_header_level_by(self, delta_level: int) -> None:
        """
        Change all headers in the tree by specified level up or down.

        All children nodes will be modified by specified level as well.

        Args:
            delta_level: each header will be modified by this number, can be negative.
        """
        # map every old header (own key unless root, plus all recorded subnode headers) to its new form
        header_map = {}
        if self.key != 'root':
            header_map[self.key] = self._modify_header_level(self.key, delta_level)
        for key in self.content.subnodes_keys:
            header_map[key] = self._modify_header_level(key, delta_level)

        # go through all contents and modify headers
        self._rec_traverse_header_update(self, header_map)

    def get_count_of_subnodes(self, recurse=True) -> int:
        """Get count of subnodes with optional recursion."""
        count = len(self.subnodes)
        if recurse:
            for subnode in self.subnodes:
                count += subnode.get_count_of_subnodes(True)
        return count

    def delete_nodes_text(self, keys: List[str], strict_matching: bool = True) -> List[str]:
        """
        Remove text from this node that is found in matching subnodes.

        Args:
            keys: headers of the subnodes whose text should be removed
            strict_matching: Force exact match of key with header vs. simple substring match

        Returns: The lines of this node's raw text without the lines of the matching subnodes.
            The node itself is not modified.
        """
        text_lines = self.content.raw_text.split('\n')
        matching_nodes = self.get_all_nodes_for_keys(keys, strict_matching, True)
        # starting_line is an index into the whole document while text_lines begins at this node's
        # own starting_line, so shift every index (the shift is 0 for the root node)
        offset = self.starting_line
        # need to delete from end and proceed backwards
        sorted_nodes = sorted(matching_nodes, key=lambda node: node.starting_line, reverse=True)
        for node in sorted_nodes:
            first_line = node.starting_line - offset
            last_line = first_line + len(node.content.raw_text.split('\n'))
            delete_list_from_list(text_lines, list(range(first_line, last_line)))
        return text_lines

    def _build_tree(
        self,
        lines: List[str],
        root_key: str,
        starting_line: int,
        level: int,
        governed_header: Optional[str] = None
    ) -> Tuple[MarkdownNode, int]:
        """
        Build a tree from the markdown recursively.

        The tree is constructed with valid headers as node's keys
        and node's content contains everything that is under that header.
        The subsections are placed into node's children with the same structure.

        A header is valid iff the line starts with # and it is not:
          1. Inside of the html blocks
          2. Inside single lined in the <> tags
          3. Inside the html comment
          4. Inside any table, code block or blockquotes

        Args:
            lines: all lines of the document
            root_key: key given to the node being built (its header line, or 'root')
            starting_line: index of the first line to process, i.e. the line after the header
            level: header level of the node being built
            governed_header: title of the header whose section is scanned for governed document entries

        Returns: The built node and the index of the first line that was not consumed
        """
        content = SectionContent()
        node_children = []
        i = starting_line

        while i < len(lines):
            line = lines[i].strip(' ')
            header_lvl = self._get_header_level_if_valid(line)

            if header_lvl is not None:
                if header_lvl >= level + 1:
                    # build subtree, the recursion reports the line to resume from
                    subtree, i = self._build_tree(lines, line, i + 1, level + 1, governed_header)
                    node_children.append(subtree)
                    content.union(subtree)
                else:
                    break  # level of the header is above or equal to the current level, subtree is over
            elif self._does_start_with(line, md_const.CODEBLOCK_DEF):
                code_lines, i = self._read_code_lines(lines, line, i + 1)
                content.code_lines.extend(code_lines)
            elif self._does_start_with(line, md_const.HTML_COMMENT_START):
                html_lines, i = self._read_html_block(lines, line, i + 1, md_const.HTML_COMMENT_END_REGEX)
                content.html_lines.extend(html_lines)
            elif self._does_contain(line, md_const.HTML_TAG_REGEX_START):
                html_lines, i = self._read_html_block(lines, line, i + 1, md_const.HTML_TAG_REGEX_END)
                content.html_lines.extend(html_lines)
            elif self._does_start_with(line, md_const.TABLE_SYMBOL):
                table_block, i = self._read_table_block(lines, line, i + 1)
                content.tables.extend(table_block)
            elif self._does_start_with(line, md_const.BLOCKQUOTE_CHAR):
                content.blockquotes.append(line)
                i += 1
            elif governed_header is not None and self._does_contain(
                    root_key, fr'^[#]+ {governed_header}$') and self._does_contain(line, md_const.GOVERNED_DOC_REGEX):
                # only lines directly under the governed header are recorded as governed document entries
                match = re.search(md_const.GOVERNED_DOC_REGEX, line)
                header = match.group(0).strip('*').strip(':')
                content.governed_document.append(header)
                i += 1
            else:
                content.text.append(line)
                i += 1

        # the raw text includes the header line itself, which sits right before starting_line;
        # the root node has no header line and starts at 0
        first_line_to_grab = starting_line - 1 if starting_line else 0
        content.raw_text = '\n'.join(lines[first_line_to_grab:i])
        md_node = MarkdownNode(key=root_key, content=content, starting_line=first_line_to_grab)
        md_node.subnodes = node_children
        return (md_node, i)

    def _modify_header_level(self, header: str, delta_level: int) -> str:
        """
        Modify header level by specified level.

        Args:
            header: header line to modify
            delta_level: number of levels to add, can be negative

        Returns: The header with its leading run of # adjusted and surrounding spaces stripped.
            If the level would drop below zero all leading # are removed and a warning is logged.
        """
        if delta_level == 0:
            logger.debug('Nothing to modify in header, level 0 is given.')
            return header

        current_level = self._get_header_level_if_valid(header) or 0
        if current_level + delta_level < 0:
            logger.warning(
                f'Cannot substract {delta_level} as level of {header} is {current_level}. All `#` will be removed.'
            )
            delta_level = current_level * -1

        replacement = '#' * (current_level + delta_level)
        if current_level:
            # count=1 limits the change to the leading run, leaving any '#' in the title untouched
            header = header.replace('#' * current_level, replacement, 1)
        elif replacement:
            # not a header yet: replacing the empty string would scatter '#' between all characters
            header = f'{replacement} {header}'

        return header.strip(' ')

    def _get_header_level_if_valid(self, line: str) -> Optional[int]:
        """
        Return a level of the header if the given line is indeed a header.

        Level of the header is determined by the number of # symbols.
        """
        # re.match only matches at the beginning of the line, so the end of the match is the level
        header_symbols = re.match(md_const.HEADER_REGEX, line)
        if header_symbols is not None:
            return header_symbols.end()
        return None

    def _does_start_with(self, line: str, start_chars: str) -> bool:
        """Determine whether the line starts with given characters."""
        return line.startswith(start_chars)

    def _does_contain(self, line: str, reg: str) -> bool:
        """Determine if the regex is found anywhere in the line; an empty line only matches an empty regex."""
        if len(line) == 0 and reg != r'':
            return False
        return re.search(reg, line) is not None

    def _read_code_lines(self, lines: List[str], line: str, i: int) -> Tuple[List[str], int]:
        """
        Read code block.

        Args:
            lines: all lines of the document
            line: the line that opened the code block
            i: index of the line following the opening line

        Returns: The lines of the block including both delimiters and the index of the line after it

        Raises:
            TrestleError: if the document ends before the code block is closed
        """
        code_lines = [line]
        while True:
            if i >= len(lines):
                raise TrestleError(f'Code block is not closed: {code_lines}')

            line = lines[i]
            code_lines.append(line)
            i += 1
            if self._does_contain(line, md_const.CODEBLOCK_DEF):
                break
        return code_lines, i

    def _read_html_block(self, lines: List[str], line: str, i: int, ending_regex: str) -> Tuple[List[str], int]:
        """
        Read html block.

        Args:
            lines: all lines of the document
            line: the line that opened the html block
            i: index of the line following the opening line
            ending_regex: regex that identifies the line closing the block

        Returns: The lines of the block and the index of the line after it

        Raises:
            TrestleError: if the document ends before the html block is closed
        """
        html_block = [line]
        # a line break tag or a block closed on its opening line is complete already
        if self._does_contain(line, r'<br[ /]*>'):
            return html_block, i
        if self._does_contain(line, ending_regex):
            return html_block, i
        while True:
            if i >= len(lines):
                raise TrestleError(f'HTML block is not closed: {html_block}')

            line = lines[i]
            html_block.append(line)
            i += 1
            if self._does_contain(line, ending_regex):
                break
        return html_block, i

    def _read_table_block(self, lines: List[str], line: str, i: int) -> Tuple[List[str], int]:
        """
        Read table.

        Args:
            lines: all lines of the document
            line: the first line of the table
            i: index of the line following the first table line

        Returns: The collected lines and the index at which the caller should resume
        """
        table_block = [line]
        while True:
            if i >= len(lines):
                return table_block, i

            line = lines[i]
            if not self._does_contain(line, md_const.TABLE_REGEX):
                # NOTE(review): the line ending the table is stored in the block and, because i is not
                # advanced, it is processed again by the caller - confirm this is intended
                table_block.append(line)
                break
            table_block.append(line)
            i += 1
        return table_block, i

    def _rec_traverse(self, node: MarkdownNode, key: str, strict_matching: bool) -> Optional[MarkdownNode]:
        """
        Recursively traverses the tree and searches for the given key.

        If strict matching is turned off, node will be matched if key is a substring of the node's header.
        """
        if key == node.key or (not strict_matching and key in node.key):
            return node
        # descend only when the flattened header list says a match exists somewhere below
        if (not strict_matching and any(key in el for el in node.content.subnodes_keys)
            ) or (key in node.content.subnodes_keys):
            for subnode in node.subnodes:
                matched_node = self._rec_traverse(subnode, key, strict_matching)
                if matched_node is not None:
                    return matched_node

        return None

    def _rec_traverse_all(
        self, node: MarkdownNode, keys: List[str], strict_matching: bool, stop_recurse_on_first_match: bool
    ) -> List[MarkdownNode]:
        """
        Recursively traverse the tree and find all nodes matching the keys.

        If strict matching is turned off, nodes will be matched if key is a substring of the node's header.
        stop_recurse_on_first_match will return only the highest level key match and not any subnodes
        """
        found_nodes: List[MarkdownNode] = []
        # a node is reported once even when several keys match it
        if any(key == node.key or (not strict_matching and key in node.key) for key in keys):
            found_nodes.append(node)
            if stop_recurse_on_first_match:
                return found_nodes
        for subnode in node.subnodes:
            matched_nodes = self._rec_traverse_all(subnode, keys, strict_matching, stop_recurse_on_first_match)
            found_nodes.extend(matched_nodes)
        return found_nodes

    def _rec_traverse_header_update(self, node: MarkdownNode, header_map: Dict[str, str]) -> None:
        """
        Recursively traverse tree and update the contents.

        Args:
            node: node whose key, raw text and recorded subnode headers are rewritten
            header_map: mapping of the current header lines to their replacements
        """
        if not node:
            return

        if node.key != 'root':
            node.key = header_map[node.key]

        # update text
        lines = node.content.raw_text.split('\n')
        for i, line in enumerate(lines):
            stripped = line.strip(' ')
            if line in header_map:
                lines[i] = header_map[line]
            elif stripped in header_map:
                # keep spaces if any
                lines[i] = line.replace(stripped, header_map[stripped])
        node.content.raw_text = '\n'.join(lines)

        # update subnodes
        subnodes_keys = node.content.subnodes_keys
        for i, subnode_key in enumerate(subnodes_keys or []):
            if subnode_key in header_map:
                subnodes_keys[i] = header_map[subnode_key]

        for subnode in node.subnodes:
            self._rec_traverse_header_update(subnode, header_map)

    def _get_max_header_lvl(self, lines: List[str]) -> int:
        """
        Go through all lines to determine highest header level. Less # means higher.

        Returns: The level just above the highest header found, or 0 when there is no header at all
        """
        levels = []
        for line in lines:
            header_lvl = self._get_header_level_if_valid(line.strip(' '))
            if header_lvl is not None:
                levels.append(header_lvl)

        # without headers the level is never compared against anything, 0 keeps the result an int
        return min(levels) - 1 if levels else 0
Methods¤
__init__(self, key, content, starting_line) special ¤

Initialize markdown node.

Source code in trestle/core/markdown/markdown_node.py
def __init__(self, key: str, content: SectionContent, starting_line: int):
    """Store the header key, the section content and the starting line; the node starts without subnodes."""
    # plain attribute setup, children are attached later by whoever builds the tree
    self.key, self.content, self.starting_line = key, content, starting_line
    self.subnodes: List[MarkdownNode] = []
build_tree_from_markdown(lines, governed_header=None) classmethod ¤

Construct a tree out of the given markdown.

Source code in trestle/core/markdown/markdown_node.py
@classmethod
def build_tree_from_markdown(cls, lines: List[str], governed_header: Optional[str] = None):
    """Parse the markdown lines into a tree and return its node keyed 'root'."""
    # the helpers need an instance but no initialized state, so __init__ is bypassed
    builder = cls.__new__(cls)
    top_level = builder._get_max_header_lvl(lines)
    root_node, _next_line = builder._build_tree(lines, 'root', 0, top_level, governed_header)
    return root_node
change_header_level_by(self, delta_level) ¤

Change all headers in the tree by specified level up or down.

All children nodes will be modified by specified level as well.

Parameters:

Name Type Description Default
delta_level int

each header will be modified by this number, can be negative.

required
Source code in trestle/core/markdown/markdown_node.py
def change_header_level_by(self, delta_level: int) -> None:
    """
    Shift the level of every header in this subtree.

    The node's own header (unless the node is the root) and all headers recorded in its
    content are remapped, then the new headers are written through the whole subtree.

    Args:
        delta_level: number of levels added to each header, negative values are allowed.
    """
    # own header first, then every recorded subnode header
    headers = list(self.content.subnodes_keys)
    if self.key != 'root':
        headers.insert(0, self.key)
    header_map = {header: self._modify_header_level(header, delta_level) for header in headers}

    # rewrite keys, raw text and recorded headers across the subtree
    self._rec_traverse_header_update(self, header_map)
delete_nodes_text(self, keys, strict_matching=True) ¤

Remove text from this node that is found in matching subnodes.

Source code in trestle/core/markdown/markdown_node.py
def delete_nodes_text(self, keys: List[str], strict_matching: bool = True) -> List[str]:
    """
    Remove text from this node that is found in matching subnodes.

    Args:
        keys: headers of the subnodes whose text should be removed
        strict_matching: Force exact match of key with header vs. simple substring match

    Returns: The lines of this node's raw text without the lines of the matching subnodes.
        The node itself is not modified.
    """
    text_lines = self.content.raw_text.split('\n')
    matching_nodes = self.get_all_nodes_for_keys(keys, strict_matching, True)
    # starting_line is an index into the whole document while text_lines begins at this node's
    # own starting_line, so shift every index (the shift is 0 for the root node)
    offset = self.starting_line
    # need to delete from end and proceed backwards
    sorted_nodes = sorted(matching_nodes, key=lambda node: node.starting_line, reverse=True)
    for node in sorted_nodes:
        first_line = node.starting_line - offset
        last_line = first_line + len(node.content.raw_text.split('\n'))
        delete_list_from_list(text_lines, list(range(first_line, last_line)))
    return text_lines
get_all_headers_for_key(self, key, strict_matching=True) ¤

Return all headers contained in the node with a given key.

Source code in trestle/core/markdown/markdown_node.py
def get_all_headers_for_key(self, key: str, strict_matching: bool = True) -> Iterable[str]:
    """Iterate over the recorded subnode headers that equal the key, or contain it when matching is not strict."""
    recorded = self.content.subnodes_keys
    if strict_matching:
        selected = [header for header in recorded if key == header]
    else:
        selected = [header for header in recorded if key in header]
    # the selection is made up front, callers get a plain list iterator
    return iter(selected)
get_all_headers_for_level(self, level) ¤

Return all headers per specified level of hierarchy.

Source code in trestle/core/markdown/markdown_node.py
def get_all_headers_for_level(self, level: int) -> Iterable[str]:
    """Iterate over the recorded subnode headers whose header level equals the requested level."""
    at_level = [
        header for header in self.content.subnodes_keys if self._get_header_level_if_valid(header) == level
    ]
    # the selection is made up front, callers get a plain list iterator
    return iter(at_level)
get_all_nodes_for_keys(self, keys, strict_matching=True, stop_recurse_on_first_match=False) ¤

Return all nodes for the given keys, substring matching is supported.

Parameters:

Name Type Description Default
keys List[str]

List of strings for the headers being collected

required
strict_matching bool

Force exact match of key with header vs. simple substring match

True
stop_recurse_on_first_match bool

Return first match of any of the keys and don't search subnodes

False

Returns: List of found markdown nodes

Source code in trestle/core/markdown/markdown_node.py
def get_all_nodes_for_keys(
    self,
    keys: List[str],
    strict_matching: bool = True,
    stop_recurse_on_first_match: bool = False
) -> List[MarkdownNode]:
    """
    Collect every node whose header matches one of the keys.

    Args:
        keys: List of strings for the headers being collected
        strict_matching: Force exact match of key with header vs. simple substring match
        stop_recurse_on_first_match: Return first match of any of the keys and don't search subnodes

    Returns: List of found markdown nodes, empty when no recorded header matches any key
    """
    recorded = self.content.subnodes_keys
    # quick look at the flattened header list decides whether walking the tree is worthwhile
    if strict_matching:
        worth_searching = bool(set(keys).intersection(recorded))
    else:
        worth_searching = any(key in header for header in recorded for key in keys)
    if not worth_searching:
        return []

    return self._rec_traverse_all(self, keys, strict_matching, stop_recurse_on_first_match)
get_count_of_subnodes(self, recurse=True) ¤

Get count of subnodes with optional recursion.

Source code in trestle/core/markdown/markdown_node.py
def get_count_of_subnodes(self, recurse=True) -> int:
    """Count the direct subnodes and, when recurse is set, all of their descendants too."""
    direct = len(self.subnodes)
    if not recurse:
        return direct
    # every child reports the size of its own subtree
    return direct + sum(child.get_count_of_subnodes(True) for child in self.subnodes)
get_node_for_key(self, key, strict_matching=True) ¤

Return the first node matching the given key; substring matching is supported.

Source code in trestle/core/markdown/markdown_node.py
def get_node_for_key(self, key: str, strict_matching: bool = True) -> Optional[MarkdownNode]:
    """
    Find the first node whose header matches the key.

    With strict matching the header has to equal the key, otherwise it is enough for the key
    to be a substring of the header. None is returned when no recorded header matches, and a
    warning is logged when several do.
    """
    recorded = self.content.subnodes_keys
    if strict_matching:
        if key not in recorded:
            return None
        candidates = as_filtered_list(recorded, lambda el: el == key)
    else:
        if not any(key in el for el in recorded):
            return None
        candidates = as_filtered_list(recorded, lambda el: key in el)

    if len(candidates) > 1:
        logger.warning(f'Multiple nodes for {key} were found, only the first one will be returned.')

    return self._rec_traverse(self, key, strict_matching)
get_node_header_lvl(self) ¤

Return current node header level.

Source code in trestle/core/markdown/markdown_node.py
def get_node_header_lvl(self) -> Optional[int]:
    """Return the header level of this node's key, as reported by the header validation helper."""
    own_header = self.key
    return self._get_header_level_if_valid(own_header)

SectionContent ¤

The content of a node.

Source code in trestle/core/markdown/markdown_node.py
class SectionContent:
    """Everything collected for one section of the markdown, grouped by kind of content."""

    def __init__(self):
        """Start with empty collections and empty raw text."""
        self.tables, self.text, self.code_lines = [], [], []
        self.html_lines, self.blockquotes = [], []
        self.raw_text = ''
        self.subnodes_keys, self.governed_document = [], []

    def union(self, node: MarkdownNode) -> None:
        """
        Merge the content of a node into this content.

        The key of the node is recorded first, then its subnode keys, code lines, html lines,
        tables and blockquotes are appended. Text, raw text and governed document entries
        are left untouched.
        """
        self.subnodes_keys.append(node.key)
        for kind in ('subnodes_keys', 'code_lines', 'html_lines', 'tables', 'blockquotes'):
            getattr(self, kind).extend(getattr(node.content, kind))
Methods¤
__init__(self) special ¤

Initialize section content.

Source code in trestle/core/markdown/markdown_node.py
def __init__(self):
    """Start with empty collections for every kind of content and empty raw text."""
    self.tables, self.text, self.code_lines = [], [], []
    self.html_lines, self.blockquotes = [], []
    self.raw_text = ''
    self.subnodes_keys, self.governed_document = [], []
union(self, node) ¤

Unites contents together.

Source code in trestle/core/markdown/markdown_node.py
def union(self, node: MarkdownNode) -> None:
    """
    Merge the content of a node into this content.

    The key of the node is recorded first, then its subnode keys, code lines, html lines,
    tables and blockquotes are appended. Other kinds of content are left untouched.
    """
    self.subnodes_keys.append(node.key)
    for kind in ('subnodes_keys', 'code_lines', 'html_lines', 'tables', 'blockquotes'):
        getattr(self, kind).extend(getattr(node.content, kind))

handler: python