markdown_node
trestle.core.markdown.markdown_node
¤
A markdown node.
logger
¤
Classes¤
MarkdownNode
¤
Markdown is read into a tree of nodes, one node per header section.
Source code in trestle/core/markdown/markdown_node.py
class MarkdownNode:
    """Tree node for one section of a markdown document.

    The node's key is the section's header line ('root' for the top of the tree), its content
    holds what was found under that header, and its subnodes are the nested subsections.
    """

    def __init__(self, key: str, content: SectionContent, starting_line: int):
        """
        Initialize markdown node.

        Args:
            key: header line of the section, or 'root' for the top of the tree
            content: parsed content of the section
            starting_line: index of the first line of the section in the parsed list of lines
        """
        self.subnodes: List[MarkdownNode] = []
        self.key = key
        self.content = content
        self.starting_line = starting_line

    @classmethod
    def build_tree_from_markdown(cls, lines: List[str], governed_header: Optional[str] = None):
        """
        Construct a tree out of the given markdown.

        Args:
            lines: lines of the markdown document
            governed_header: optional header text, lines matching the governed document regex
                under that header are collected into the section's governed_document list

        Returns: root node of the constructed tree
        """
        # an uninitialized instance is enough to reach the parsing helpers
        ob = cls.__new__(cls)
        start_level = ob._get_max_header_lvl(lines)
        ob, _ = ob._build_tree(lines, 'root', 0, start_level, governed_header)
        return ob

    def get_all_headers_for_level(self, level: int) -> Iterable[str]:
        """Return all headers per specified level of hierarchy."""
        return iter(
            [header for header in self.content.subnodes_keys if self._get_header_level_if_valid(header) == level]
        )

    def get_node_for_key(self, key: str, strict_matching: bool = True) -> Optional[MarkdownNode]:
        """
        Return the first node for the given key, substring matching is supported.

        Args:
            key: header to look for
            strict_matching: Force exact match of key with header vs. simple substring match

        Returns: the first matching node or None if no header matches
        """
        if not strict_matching:
            if not any(key in el for el in self.content.subnodes_keys):
                return None
            elif len(as_filtered_list(self.content.subnodes_keys, lambda el: key in el)) > 1:
                logger.warning(f'Multiple nodes for {key} were found, only the first one will be returned.')
        else:
            if key not in self.content.subnodes_keys:
                return None
            elif len(as_filtered_list(self.content.subnodes_keys, lambda el: el == key)) > 1:
                logger.warning(f'Multiple nodes for {key} were found, only the first one will be returned.')

        return self._rec_traverse(self, key, strict_matching)

    def get_all_nodes_for_keys(
        self,
        keys: List[str],
        strict_matching: bool = True,
        stop_recurse_on_first_match: bool = False
    ) -> List[MarkdownNode]:
        """
        Return all nodes for the given keys, substring matching is supported.

        Args:
            keys: List of strings for the headers being collected
            strict_matching: Force exact match of key with header vs. simple substring match
            stop_recurse_on_first_match: Return first match of any of the keys and don't search subnodes

        Returns: List of found markdown nodes
        """
        if not strict_matching:
            if not any(key in el for el in self.content.subnodes_keys for key in keys):
                return []
        elif not set(keys).intersection(self.content.subnodes_keys):
            return []

        return self._rec_traverse_all(self, keys, strict_matching, stop_recurse_on_first_match)

    def get_all_headers_for_key(self, key: str, strict_matching: bool = True) -> Iterable[str]:
        """Return all headers contained in the node with a given key."""
        if strict_matching:
            return iter([header for header in self.content.subnodes_keys if key == header])
        return iter([header for header in self.content.subnodes_keys if key in header])

    def get_node_header_lvl(self) -> Optional[int]:
        """Return current node header level, None if the key is not a valid header."""
        return self._get_header_level_if_valid(self.key)

    def change_header_level_by(self, delta_level: int) -> None:
        """
        Change all headers in the tree by specified level up or down.

        All children nodes will be modified by specified level as well.

        Args:
            delta_level: each header will be modified by this number, can be negative.
        """
        # construct a map of old header -> new header
        header_map = {}
        if self.key != 'root':
            header_map[self.key] = self._modify_header_level(self.key, delta_level)

        for key in self.content.subnodes_keys:
            header_map[key] = self._modify_header_level(key, delta_level)

        # go through all contents and modify headers
        self._rec_traverse_header_update(self, header_map)

    def get_count_of_subnodes(self, recurse=True) -> int:
        """Get count of subnodes with optional recursion."""
        count = len(self.subnodes)
        if recurse:
            for subnode in self.subnodes:
                count += subnode.get_count_of_subnodes(True)
        return count

    def delete_nodes_text(self, keys: List[str], strict_matching: bool = True) -> List[str]:
        """
        Remove text from this node that is found in matching subnodes.

        Args:
            keys: headers of the sections whose text should be removed
            strict_matching: Force exact match of key with header vs. simple substring match

        Returns: lines of this node's raw text without the lines of the matching sections
        """
        text_lines = self.content.raw_text.split('\n')
        matching_nodes = self.get_all_nodes_for_keys(keys, strict_matching, True)
        # need to delete from end and proceed backwards
        sorted_nodes = sorted(matching_nodes, key=lambda node: node.starting_line, reverse=True)
        for node in sorted_nodes:
            # starting_line indexes the whole document while text_lines begins at this node,
            # so shift by this node's own starting line before deleting
            first_line = node.starting_line - self.starting_line
            last_line = first_line + len(node.content.raw_text.split('\n'))
            del text_lines[first_line:last_line]
        return text_lines

    def _build_tree(
        self,
        lines: List[str],
        root_key: str,
        starting_line: int,
        level: int,
        governed_header: Optional[str] = None
    ) -> Tuple[MarkdownNode, int]:
        """
        Build a tree from the markdown recursively.

        The tree is constructed with valid headers as node's keys
        and node's content contains everything that is under that header.
        The subsections are placed into node's children with the same structure.

        A header is valid iff the line starts with # and it is not:
          1. Inside of the html blocks
          2. Inside single lined in the <> tags
          3. Inside the html comment
          4. Inside any table, code block or blockquotes

        Args:
            lines: all lines of the markdown document
            root_key: header of the section being built ('root' for the top level)
            starting_line: index of the first line after the section's header
            level: header level of the section being built
            governed_header: optional header whose governed document lines are collected

        Returns: the built node and the index of the first line that was not consumed
        """
        content = SectionContent()
        node_children = []
        i = starting_line

        while i < len(lines):
            line = lines[i].strip(' ')
            header_lvl = self._get_header_level_if_valid(line)

            if header_lvl is not None:
                if header_lvl < level + 1:
                    break  # level of the header is above or equal to the current level, subtree is over
                # build subtree
                subtree, i = self._build_tree(lines, line, i + 1, level + 1, governed_header)
                node_children.append(subtree)
                content.union(subtree)
            elif self._does_start_with(line, md_const.CODEBLOCK_DEF):
                code_lines, i = self._read_code_lines(lines, line, i + 1)
                content.code_lines.extend(code_lines)
            elif self._does_start_with(line, md_const.HTML_COMMENT_START):
                html_lines, i = self._read_html_block(lines, line, i + 1, md_const.HTML_COMMENT_END_REGEX)
                content.html_lines.extend(html_lines)
            elif self._does_contain(line, md_const.HTML_TAG_REGEX_START):
                html_lines, i = self._read_html_block(lines, line, i + 1, md_const.HTML_TAG_REGEX_END)
                content.html_lines.extend(html_lines)
            elif self._does_start_with(line, md_const.TABLE_SYMBOL):
                table_block, i = self._read_table_block(lines, line, i + 1)
                content.tables.extend(table_block)
            elif self._does_start_with(line, md_const.BLOCKQUOTE_CHAR):
                content.blockquotes.append(line)
                i += 1
            # NOTE(review): governed_header is interpolated into the regex unescaped - confirm callers
            # never pass regex metacharacters or rely on them deliberately
            elif governed_header is not None and self._does_contain(
                    root_key, fr'^[#]+ {governed_header}$') and self._does_contain(line, md_const.GOVERNED_DOC_REGEX):
                regexp = re.compile(md_const.GOVERNED_DOC_REGEX)
                match = regexp.search(line)
                header = match.group(0).strip('*').strip(':')
                content.governed_document.append(header)
                i += 1
            else:
                content.text.append(line)
                i += 1

        # include the header line itself, it sits one line before the section body (root has none)
        first_line_to_grab = starting_line - 1 if starting_line else 0
        content.raw_text = '\n'.join(lines[first_line_to_grab:i])
        md_node = MarkdownNode(key=root_key, content=content, starting_line=first_line_to_grab)
        md_node.subnodes = node_children
        return (md_node, i)

    def _modify_header_level(self, header: str, delta_level: int) -> str:
        """
        Modify header level by specified level.

        Args:
            header: header line to modify
            delta_level: number of levels to add (positive) or remove (negative)

        Returns: header with the adjusted number of leading # symbols, stripped of spaces
        """
        if delta_level == 0:
            logger.debug('Nothing to modify in header, level 0 is given.')
            return header

        current_level = self._get_header_level_if_valid(header)
        if current_level is None:
            current_level = 0

        if current_level + delta_level < 0:
            logger.warning(
                f'Cannot substract {delta_level} as level of {header} is {current_level}. All `#` will be removed.'
            )
            delta_level = current_level * -1

        replacement = '#' * (current_level + delta_level)
        if current_level > 0:
            # only the leading run of # is the header marker, leave any # in the header text alone
            header = header.replace('#' * current_level, replacement, 1)
        elif replacement:
            # no marker to replace, replacing '' would scatter # between every character
            header = f'{replacement} {header}'

        return header.strip(' ')

    def _get_header_level_if_valid(self, line: str) -> Optional[int]:
        """
        Return a level of the header if the given line is indeed a header.

        Level of the header is determined by the number of # symbols.
        """
        # re.match only matches at the beginning of the line, so a match means the line starts with a header
        header_symbols = re.match(md_const.HEADER_REGEX, line)
        if header_symbols is None:
            return None
        return header_symbols.end()

    def _does_start_with(self, line: str, start_chars: str) -> bool:
        """Determine whether the line starts with given characters."""
        return line.startswith(start_chars)

    def _does_contain(self, line: str, reg: str) -> bool:
        """Determine if the line matches regex."""
        if len(line) == 0 and reg != r'':
            return False
        regexp = re.compile(reg)
        return regexp.search(line) is not None

    def _read_code_lines(self, lines: List[str], line: str, i: int) -> Tuple[List[str], int]:
        """
        Read code block.

        Args:
            lines: all lines of the markdown document
            line: opening line of the code block
            i: index of the line following the opening line

        Returns: lines of the code block and the index of the first line after it

        Raises:
            TrestleError: if the document ends before the code block is closed
        """
        code_lines = [line]
        while True:
            if i >= len(lines):
                raise TrestleError(f'Code block is not closed: {code_lines}')

            line = lines[i]
            code_lines.append(line)
            i += 1
            if self._does_contain(line, md_const.CODEBLOCK_DEF):
                break
        return code_lines, i

    def _read_html_block(self, lines: List[str], line: str, i: int, ending_regex: str) -> Tuple[List[str], int]:
        """
        Read html block.

        Args:
            lines: all lines of the markdown document
            line: opening line of the html block
            i: index of the line following the opening line
            ending_regex: regex that marks the closing line of the block

        Returns: lines of the html block and the index of the first line after it

        Raises:
            TrestleError: if the document ends before the html block is closed
        """
        html_block = [line]
        # a line break tag or a block closed on its opening line is a single-line block
        if self._does_contain(line, r'<br[ /]*>'):
            return html_block, i
        if self._does_contain(line, ending_regex):
            return html_block, i
        while True:
            if i >= len(lines):
                raise TrestleError(f'HTML block is not closed: {html_block}')

            line = lines[i]
            html_block.append(line)
            i += 1
            if self._does_contain(line, ending_regex):
                break
        return html_block, i

    def _read_table_block(self, lines: List[str], line: str, i: int) -> Tuple[List[str], int]:
        """
        Read table.

        Args:
            lines: all lines of the markdown document
            line: first line of the table
            i: index of the line following the first line

        Returns: lines of the table and the index of the first line that was not consumed
        """
        table_block = [line]
        while True:
            if i >= len(lines):
                return table_block, i

            line = lines[i]
            if not self._does_contain(line, md_const.TABLE_REGEX):
                # NOTE(review): the first non-table line is stored with the table but not consumed,
                # so the caller parses it again - confirm this is intended
                table_block.append(line)
                break
            table_block.append(line)
            i += 1
        return table_block, i

    def _rec_traverse(self, node: MarkdownNode, key: str, strict_matching: bool) -> Optional[MarkdownNode]:
        """
        Recursively traverse the tree and search for the given key.

        If strict matching is turned off, node will be matched if key is a substring of the node's header.
        """
        if key == node.key or (not strict_matching and key in node.key):
            return node
        # descend only if one of the headers below this node can match
        if (not strict_matching and any(key in el
                                        for el in node.content.subnodes_keys)) or (key in node.content.subnodes_keys):
            for subnode in node.subnodes:
                matched_node = self._rec_traverse(subnode, key, strict_matching)
                if matched_node is not None:
                    return matched_node

        return None

    def _rec_traverse_all(
        self, node: MarkdownNode, keys: List[str], strict_matching: bool, stop_recurse_on_first_match: bool
    ) -> List[MarkdownNode]:
        """
        Recursively traverse the tree and find all nodes matching the keys.

        If strict matching is turned off, nodes will be matched if key is a substring of the node's header.
        stop_recurse_on_first_match will return only the highest level key match and not any subnodes
        """
        found_nodes: List[MarkdownNode] = []
        # a node is reported once even when several keys match it
        if any(key == node.key or (not strict_matching and key in node.key) for key in keys):
            found_nodes.append(node)
            if stop_recurse_on_first_match:
                return found_nodes

        for subnode in node.subnodes:
            matched_nodes = self._rec_traverse_all(subnode, keys, strict_matching, stop_recurse_on_first_match)
            found_nodes.extend(matched_nodes)

        return found_nodes

    def _rec_traverse_header_update(self, node: MarkdownNode, header_map: Dict[str, str]) -> None:
        """
        Recursively traverse tree and update the contents.

        Args:
            node: node to update together with all of its subnodes
            header_map: mapping of the current header lines to the new header lines
        """
        if not node:
            return

        if node.key != 'root':
            node.key = header_map[node.key]

        # update text
        lines = node.content.raw_text.split('\n')
        for i, line in enumerate(lines):
            stripped = line.strip(' ')
            if line in header_map:
                lines[i] = header_map[line]
            elif stripped in header_map:
                # keep spaces if any
                lines[i] = line.replace(stripped, header_map[stripped])
        node.content.raw_text = '\n'.join(lines)

        # update subnodes
        for i, subnode_key in enumerate(node.content.subnodes_keys):
            if subnode_key in header_map:
                node.content.subnodes_keys[i] = header_map[subnode_key]

        for subnode in node.subnodes:
            self._rec_traverse_header_update(subnode, header_map)

    def _get_max_header_lvl(self, lines: List[str]):
        """
        Go through all lines to determine highest header level. Less # means higher.

        Returns: one less than the smallest header level found, math.inf if there are no headers
        """
        min_lvl = math.inf
        for line in lines:
            line = line.strip(' ')
            header_lvl = self._get_header_level_if_valid(line)

            if header_lvl is not None and header_lvl < min_lvl:
                min_lvl = header_lvl

        return min_lvl - 1
Methods¤
__init__(self, key, content, starting_line)
special
¤
Initialize markdown node.
Source code in trestle/core/markdown/markdown_node.py
def __init__(self, key: str, content: SectionContent, starting_line: int):
    """
    Set up a node for one markdown section.

    Args:
        key: key of the node
        content: content of the node
        starting_line: line at which the node starts
    """
    self.key, self.content, self.starting_line = key, content, starting_line
    # children are attached later, a fresh node starts without any
    self.subnodes: List[MarkdownNode] = []
build_tree_from_markdown(lines, governed_header=None)
classmethod
¤
Construct a tree out of the given markdown.
Source code in trestle/core/markdown/markdown_node.py
@classmethod
def build_tree_from_markdown(cls, lines: List[str], governed_header: Optional[str] = None):
    """
    Parse the given markdown lines into a tree and return its root node.

    Args:
        lines: lines of the markdown document
        governed_header: optional header passed through to the tree builder
    """
    # an uninitialized instance is enough to reach the parsing helpers
    builder = cls.__new__(cls)
    top_level = builder._get_max_header_lvl(lines)
    tree, _next_line = builder._build_tree(lines, 'root', 0, top_level, governed_header)
    return tree
change_header_level_by(self, delta_level)
¤
Change all headers in the tree by specified level up or down.
All children nodes will be modified by specified level as well.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
delta_level |
int |
each header will be modified by this number, can be negative. |
required |
Source code in trestle/core/markdown/markdown_node.py
def change_header_level_by(self, delta_level: int) -> None:
    """
    Shift every header of this node and its subnodes by the given number of levels.

    Args:
        delta_level: number of levels to add to each header, can be negative.
    """
    # this node's own header goes first, the root has no header to change
    old_headers = [] if self.key == 'root' else [self.key]
    old_headers.extend(self.content.subnodes_keys)

    # old header -> new header
    header_map = {old: self._modify_header_level(old, delta_level) for old in old_headers}

    # rewrite keys and raw text across the whole subtree
    self._rec_traverse_header_update(self, header_map)
delete_nodes_text(self, keys, strict_matching=True)
¤
Remove text from this node that is found in matching subnodes.
Source code in trestle/core/markdown/markdown_node.py
def delete_nodes_text(self, keys: List[str], strict_matching: bool = True) -> List[str]:
    """
    Remove text from this node that is found in matching subnodes.

    Args:
        keys: headers of the sections whose text should be removed
        strict_matching: Force exact match of key with header vs. simple substring match

    Returns: lines of this node's raw text without the lines of the matching sections
    """
    text_lines = self.content.raw_text.split('\n')
    matching_nodes = self.get_all_nodes_for_keys(keys, strict_matching, True)
    # need to delete from end and proceed backwards
    sorted_nodes = sorted(matching_nodes, key=lambda node: node.starting_line, reverse=True)
    for node in sorted_nodes:
        # starting_line indexes the whole document while text_lines begins at this node,
        # so shift by this node's own starting line before deleting
        first_line = node.starting_line - self.starting_line
        last_line = first_line + len(node.content.raw_text.split('\n'))
        del text_lines[first_line:last_line]
    return text_lines
get_all_headers_for_key(self, key, strict_matching=True)
¤
Return all headers contained in the node with a given key.
Source code in trestle/core/markdown/markdown_node.py
def get_all_headers_for_key(self, key: str, strict_matching: bool = True) -> Iterable[str]:
    """
    Iterate over the headers below this node that match the given key.

    Strict matching keeps headers equal to the key, otherwise headers containing the key.
    """
    headers = self.content.subnodes_keys
    if strict_matching:
        matching = [header for header in headers if key == header]
    else:
        matching = [header for header in headers if key in header]
    return iter(matching)
get_all_headers_for_level(self, level)
¤
Return all headers per specified level of hierarchy.
Source code in trestle/core/markdown/markdown_node.py
def get_all_headers_for_level(self, level: int) -> Iterable[str]:
    """Iterate over the headers below this node whose header level equals the given level."""
    matching = [
        header for header in self.content.subnodes_keys if self._get_header_level_if_valid(header) == level
    ]
    return iter(matching)
get_all_nodes_for_keys(self, keys, strict_matching=True, stop_recurse_on_first_match=False)
¤
Return all nodes for the given keys, substring matching is supported.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
keys |
List[str] |
List of strings for the headers being collected |
required |
strict_matching |
bool |
Force exact match of key with header vs. simple substring match |
True |
stop_recurse_on_first_match |
bool |
Return first match of any of the keys and don't search subnodes |
False |
Returns: List of found markdown nodes
Source code in trestle/core/markdown/markdown_node.py
def get_all_nodes_for_keys(
    self,
    keys: List[str],
    strict_matching: bool = True,
    stop_recurse_on_first_match: bool = False
) -> List[MarkdownNode]:
    """
    Collect every node whose header matches one of the keys.

    Args:
        keys: List of strings for the headers being collected
        strict_matching: Force exact match of key with header vs. simple substring match
        stop_recurse_on_first_match: Return first match of any of the keys and don't search subnodes

    Returns: List of found markdown nodes
    """
    known_headers = self.content.subnodes_keys
    if strict_matching:
        has_candidate = bool(set(keys).intersection(known_headers))
    else:
        has_candidate = any(key in header for header in known_headers for key in keys)

    # nothing below this node can match, skip the traversal
    if not has_candidate:
        return []

    return self._rec_traverse_all(self, keys, strict_matching, stop_recurse_on_first_match)
get_count_of_subnodes(self, recurse=True)
¤
Get count of subnodes with optional recursion.
Source code in trestle/core/markdown/markdown_node.py
def get_count_of_subnodes(self, recurse=True) -> int:
    """Count the direct subnodes, adding the counts of all nested subnodes when recurse is set."""
    direct = len(self.subnodes)
    if not recurse:
        return direct
    return direct + sum(child.get_count_of_subnodes(True) for child in self.subnodes)
get_node_for_key(self, key, strict_matching=True)
¤
Return the first node for the given key; substring matching is supported.
Source code in trestle/core/markdown/markdown_node.py
def get_node_for_key(self, key: str, strict_matching: bool = True) -> Optional[MarkdownNode]:
    """
    Find the first node whose header matches the key.

    Strict matching requires the header to equal the key, otherwise the key only has to be
    a substring of the header. A warning is logged when more than one header matches.
    """
    headers = self.content.subnodes_keys

    if strict_matching:

        def is_match(el):
            return el == key
    else:

        def is_match(el):
            return key in el

    # no header below this node matches, so there is nothing to traverse for
    if not any(is_match(el) for el in headers):
        return None

    if len(as_filtered_list(headers, is_match)) > 1:
        logger.warning(f'Multiple nodes for {key} were found, only the first one will be returned.')

    return self._rec_traverse(self, key, strict_matching)
get_node_header_lvl(self)
¤
Return current node header level.
Source code in trestle/core/markdown/markdown_node.py
def get_node_header_lvl(self) -> Optional[int]:
    """Return the header level of this node's key, as reported by the header validity check."""
    own_header = self.key
    return self._get_header_level_if_valid(own_header)
SectionContent
¤
The content of a node.
Source code in trestle/core/markdown/markdown_node.py
class SectionContent:
    """Container for everything collected under one markdown header."""

    def __init__(self):
        """Start with empty collections and an empty raw text."""
        self.raw_text = ''
        self.text = []
        self.tables = []
        self.code_lines = []
        self.html_lines = []
        self.blockquotes = []
        self.governed_document = []
        self.subnodes_keys = []

    def union(self, node: MarkdownNode) -> None:
        """
        Fold the given node into this content.

        The node's key and the keys below it are added to subnodes_keys, and its code, html,
        table and blockquote lines are appended to the matching lists of this content.
        """
        child = node.content
        self.subnodes_keys.append(node.key)
        self.subnodes_keys.extend(child.subnodes_keys)
        for field in ('code_lines', 'html_lines', 'tables', 'blockquotes'):
            getattr(self, field).extend(getattr(child, field))
Methods¤
__init__(self)
special
¤
Initialize section content.
Source code in trestle/core/markdown/markdown_node.py
def __init__(self):
    """Start with empty collections and an empty raw text."""
    self.raw_text = ''
    self.text = []
    self.tables = []
    self.code_lines = []
    self.html_lines = []
    self.blockquotes = []
    self.governed_document = []
    self.subnodes_keys = []
union(self, node)
¤
Merge the given node's keys and content into this content.
Source code in trestle/core/markdown/markdown_node.py
def union(self, node: MarkdownNode) -> None:
    """
    Fold the given node into this content.

    The node's key and the keys below it are added to subnodes_keys, and its code, html,
    table and blockquote lines are appended to the matching lists of this content.
    """
    child = node.content
    self.subnodes_keys.append(node.key)
    self.subnodes_keys.extend(child.subnodes_keys)
    for field in ('code_lines', 'html_lines', 'tables', 'blockquotes'):
        getattr(self, field).extend(getattr(child, field))
handler: python