Skip to content

Commit

Permalink
fix: in list table parsing
Browse files Browse the repository at this point in the history
The in-list-item table should replace the paragraph node instead of
being a child of the paragraph node.
This is because a paragraph joins its children into a single-line str,
since it assumes all of its children are inline items.
  • Loading branch information
drmingdrmer committed Sep 15, 2024
1 parent 873ae2d commit 9a0bac9
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 48 deletions.
105 changes: 63 additions & 42 deletions md2zhihu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pprint
import re
import shutil
from typing import Optional
from typing import Optional, List

import k3down2
import k3git
Expand All @@ -20,15 +20,21 @@

from .. import mistune

class MarkdownMeta(object):
def __init__(self, meta_text: str):
self.text = meta_text
self.dict_data = yaml.safe_load(meta_text)
class FrontMatter(object):
"""
The front matter is the YAML enclosed between `---` at the top of a markdown file.
"""
def __init__(self, front_matter_text: str):
self.text = front_matter_text
self.data = yaml.safe_load(front_matter_text)

def build_refs(self, platform: str) -> dict:
def get_refs(self, platform: str) -> dict:
"""
Get refs from front matter.
"""
dic = {}

meta = self.dict_data
meta = self.data

# Collect universal refs
if 'refs' in meta:
Expand Down Expand Up @@ -462,51 +468,71 @@ def render_node(self, rnode):
pprint.pprint(n)
return ['***:' + typ]

def render(self, rnode):
def render(self, rnode) -> List[str]:
rst = []
for n in rnode.node['children']:
child = rnode.new_child(n)
rst.extend(self.render_node(child))
lines = self.render_node(child)
rst.extend(lines)

return rst

def msg(self, *args):
msg(*args)


def fix_tables(nodes):
def parse_in_list_tables(nodes) -> List[dict]:
"""
mistune does not parse tables inside list items.
We need to fix them recursively.
"""

rst = []
for n in nodes:
if 'children' in n:
fix_tables(n['children'])
n['children'] = parse_in_list_tables(n['children'])

if n['type'] == 'paragraph':
children = n['children']
nodes = convert_paragraph_table(n)
rst.extend(nodes)

if len(children) == 0:
continue
return rst

c0 = children[0]
if c0['type'] != 'text':
continue
def convert_paragraph_table(node: dict) -> List[dict]:
"""
Parse table text in a paragraph and return the AST of the parsed table.
:return List[dict]: a list of ast nodes.
"""

txt = c0['text']
if node['type'] != 'paragraph':
return [node]

table_reg = r' {0,3}\|(.+)\n *\|( *[-:]+[-| :]*)\n((?: *\|.*(?:\n|$))*)\n*'
children = node['children']

match = re.match(table_reg, txt)
if match:
mdr = MDRender(None, platform='')
partialmd = mdr.render(RenderNode(n))
partialmd = ''.join(partialmd)
if len(children) == 0:
return [node]

c0 = children[0]
if c0['type'] != 'text':
return [node]

txt = c0['text']

table_reg = r' {0,3}\|(.+)\n *\|( *[-:]+[-| :]*)\n((?: *\|.*(?:\n|$))*)\n*'

match = re.match(table_reg, txt)
if match:
mdr = MDRender(None, platform='')
partialmd = mdr.render(RenderNode(node))
partialmd = ''.join(partialmd)

parser = new_parser()
new_children = parser(partialmd)

return new_children
else:
return [node]

parser = new_parser()
new_children = parser(partialmd)
n['children'] = new_children


def join_math_block(nodes):
Expand Down Expand Up @@ -842,14 +868,14 @@ def extract_ref_definitions(cont):
return '\n'.join(rst), refs


def extract_jekyll_meta(cont):
def extract_front_matter(cont):
meta = None
m = re.match(r'^ *--- *\n(.*?)\n---\n', cont,
flags=re.DOTALL | re.UNICODE)
if m:
cont = cont[m.end():]
meta_text = m.groups()[0].strip()
meta = MarkdownMeta(meta_text)
meta = FrontMatter(meta_text)

return cont, meta

Expand Down Expand Up @@ -1402,34 +1428,29 @@ def __init__(self, conf: Config, md_text: str):
# References used in this markdown
self.used_refs = None

self.meta: Optional[MarkdownMeta] = None
self.front_matter: Optional[FrontMatter] = None

# Parsed AST of the markdown
self.ast = None

# extract article meta

self.md_text, self.meta = extract_jekyll_meta(self.md_text)
self.md_text, self.front_matter = extract_front_matter(self.md_text)
self.md_text, article_refs = extract_ref_definitions(self.md_text)

# build refs

self.refs.update(load_external_refs(self.conf))
if self.meta is not None:
self.refs.update(self.meta.build_refs("zhihu"))
if self.front_matter is not None:
self.refs.update(self.front_matter.get_refs("zhihu"))
self.refs.update(article_refs)

# parse to ast and clean up

parse_to_ast = new_parser()
self.ast = parse_to_ast(self.md_text)

# TODO use feature detection to decide if we need to convert table to html
if self.conf.platform == 'minimal_mistake':
# jekyll output does render table well.
pass
else:
fix_tables(self.ast)
self.ast = parse_in_list_tables(self.ast)

self.used_refs = replace_ref_with_def(self.ast, self.refs)

Expand Down Expand Up @@ -1457,8 +1478,8 @@ def render(self):
}
output_lines = mdr.render(RenderNode(root_node))

if self.conf.keep_meta and self.meta is not None:
output_lines = ['---', self.meta.text, '---'] + output_lines
if self.conf.keep_meta and self.front_matter is not None:
output_lines = ['---', self.front_matter.text, '---'] + output_lines

output_lines.append('')

Expand Down
6 changes: 2 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@

import setuptools

import imp

pseudo = "pseudo"
pkg = imp.load_source(pseudo, 'md2zhihu/version.py')
import importlib
pkg = importlib.import_module('md2zhihu.version')

setuptools.setup(
name="md2zhihu",
Expand Down
4 changes: 2 additions & 2 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ python setup.py install
)

# PYTHONPATH="$(cd ..; pwd)" pytest -x -v
PYTHONPATH="$(cd ..; pwd)" pytest -x -v -k test_simple
# PYTHONPATH="$(cd ..; pwd)" pytest -x -v --show-capture=stdout -k test_minimal_mistake
# PYTHONPATH="$(cd ..; pwd)" pytest -x -v -k test_simple
PYTHONPATH="$(cd ..; pwd)" pytest -x -v --show-capture=all -k test_minimal_mistake

0 comments on commit 9a0bac9

Please sign in to comment.