文本生成块 util.py
把纯文本分成一个一个的文本块,以便接下来对每一个文本块进行解析
"""Helpers for plain-text input: generators that split a TXT file into blocks."""
from typing import Iterable, Iterator


def lines(file: Iterable[str]) -> Iterator[str]:
    """Yield every line of *file*, then one extra blank line.

    The extra newline-only line acts as a terminator so that ``blocks``
    also emits the last block when the input does not end with a blank
    line.

    Args:
        file: Any iterable of text lines, e.g. an open text file.

    Yields:
        Each line of *file* unchanged, followed by a single newline string.
    """
    yield from file
    yield '\n'


def blocks(file: Iterable[str]) -> Iterator[str]:
    """Split *file* into text blocks separated by blank lines.

    Consecutive non-blank lines are collected into one block. A blank
    (whitespace-only) line ends the current block; runs of blank lines
    produce no empty blocks.

    Args:
        file: Any iterable of text lines, e.g. an open text file.

    Yields:
        Each block as a single string with its lines joined and the
        surrounding whitespace stripped.
    """
    block = []
    for line in lines(file):
        if line.strip():
            block.append(line)
        elif block:
            # Blank line after collected content: the block is complete.
            yield ''.join(block).strip()
            block = []
blocks
方法在调用时,会将打开文件后得到的 TextIOWrapper
对象作为参数,也就是我们平时使用 open
方法时生成的 f
变量
处理程序 handlers.py
给文本块打上合适的 HTML 标记
"""HTML text handlers, used to print the various HTML tags."""


class Handler:
    """Base handler that dispatches events to methods looked up by name.

    Subclasses provide methods named ``start_<name>``, ``end_<name>`` and
    ``sub_<name>``; names without a matching method are silently ignored.
    """

    def callback(self, prefix, name, *args):
        """Call the method ``prefix + name`` with *args* if it exists.

        Returns:
            The method's return value, or ``None`` when this handler has no
            callable attribute with that name.
        """
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)
        return None

    def start(self, name):
        """Invoke ``start_<name>`` if the handler defines it."""
        self.callback('start_', name)

    def end(self, name):
        """Invoke ``end_<name>`` if the handler defines it."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a replacement function suitable for ``re.sub``.

        The returned function passes the match object to ``sub_<name>``.
        When that method is missing or returns ``None``, the matched text
        is kept unchanged.
        """
        def substitution(match):
            result = self.callback('sub_', name, match)
            if result is None:
                # No substitution available: leave the matched text as is.
                result = match.group(0)
            return result
        return substitution


class HTMLRenderer(Handler):
    """HTML handler that prints the HTML markup wrapped around text blocks."""

    def start_document(self):
        print('<html><head><title>ShiYanLou</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p style="color: #444;">')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2 style="color: #68BE5D;">')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul style="color: #363736;">')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1 style="color: #1ABC9C;">')

    def end_title(self):
        print('</h1>')

    def sub_emphasis(self, match):
        """Wrap the first captured group in an ``<em>`` element."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Turn the captured URL into a link that opens in a new tab."""
        url = match.group(1)
        s = ('<a target="_blank" style="text-decoration: none;'
             'color: #BC1A4B;" href="{}">{}</a>')
        return s.format(url, url)

    def sub_mail(self, match):
        """Turn the captured e-mail address into a ``mailto:`` link."""
        address = match.group(1)
        s = ('<a style="text-decoration: none;color: #BC1A4B;" '
             'href="mailto:{}">{}</a>')
        return s.format(address, address)

    def feed(self, data):
        """Print the block text itself."""
        print(data)
在上面的代码中 callable
方法用于检查一个对象是否可以被调用,Python 内置方法 getattr
用于返回一个对象的属性值。举例来说,getattr(x, 'foo', None)
就相当于是 x.foo
,而如果没有这个属性值 foo
,则返回我们设定的默认值 None
。
规则 rules.py
需要一定的规则来判断每个文本块交给处理程序将要加什么标记
"""Rule classes that decide how each text block is marked up.

The module builds one shared instance of every concrete rule (see
``rule_list``); ``Rule`` itself is only a base class.
"""


class Rule:
    """Base class of all rules.

    Subclasses set the class attribute ``type`` (the name passed to the
    handler) and implement ``condition(block)``.
    """

    def action(self, block, handler):
        """Emit the start tag, the block text and the end tag.

        Returns:
            ``True``, telling the parser to stop trying further rules for
            this block.
        """
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True


class HeadingRule(Rule):
    """Heading rule: sub-headings, rendered by the HTML handler as ``<h2>``."""

    type = 'heading'

    def condition(self, block):
        """Return whether *block* looks like a heading.

        A heading is a non-empty single line of at most 70 characters that
        does not end with a colon. An empty block is never a heading
        (indexing it used to raise ``IndexError``).
        """
        return (
            bool(block)
            and '\n' not in block
            and len(block) <= 70
            and not block.endswith(':')
        )


class TitleRule(HeadingRule):
    """Title rule: the document title, rendered by the HTML handler as ``<h1>``.

    Only the first block offered to this rule can be the title.
    """

    type = 'title'
    first = True

    def condition(self, block):
        """Return whether *block* is the title.

        The first call consumes the one-time chance regardless of the
        outcome; every later call returns ``False``.
        """
        if not self.first:
            return False
        self.first = False
        return super().condition(block)


class ListItemRule(Rule):
    """List item rule, ``<li>`` tag: a block that starts with a hyphen."""

    type = 'listitem'

    def condition(self, block):
        """Return whether *block* starts with ``-`` (``False`` when empty)."""
        return block.startswith('-')

    def action(self, block, handler):
        """Emit the list item with the leading hyphen removed."""
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True


class ListRule(ListItemRule):
    """List rule, ``<ul>`` tag: opens and closes the list around its items.

    ``inside`` records whether a list is currently open.
    """

    type = 'list'
    inside = False

    def condition(self, block):
        """Always match, so ``action`` sees every block."""
        return True

    def action(self, block, handler):
        """Open the list at its first item and close it at the first non-item.

        Returns:
            ``False``, so the parser goes on to the rule that renders the
            block itself.
        """
        is_item = super().condition(block)
        if not self.inside and is_item:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        return False


class ParagraphRule(Rule):
    """Paragraph rule, ``<p>`` tag: the fallback that matches any block."""

    type = 'paragraph'

    def condition(self, block):
        return True


# Order matters: rules are tried front to back, and the first rule whose
# action returns True ends the processing of a block.
rule_list = [
    ListRule(),
    ListItemRule(),
    TitleRule(),
    HeadingRule(),
    ParagraphRule(),
]
解析 markup.py
"""Command-line entry point: parse plain text from stdin and print HTML."""
import re
import sys

from handlers import HTMLRenderer
from util import blocks
from rules import rule_list


class Parser:
    """Base parser: applies filters and rules to each text block.

    Attributes are filled through ``addRule`` and ``addFilter``; output is
    produced by the handler passed to the constructor.
    """

    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Append a rule instance to ``self.rules``."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Append a filter function to ``self.filters``.

        The filter replaces every match of *pattern* in a block with the
        result of the handler's ``sub_<name>`` substitution.
        """
        compiled = re.compile(pattern)  # compiled once, reused for every block

        def apply_filter(block, handler):
            return compiled.sub(handler.sub(name), block)

        self.filters.append(apply_filter)

    def parse(self, file):
        """Parse *file* block by block and emit the markup via the handler.

        Every block first passes through all filters in order. The rules
        are then tried in order; each rule whose condition matches runs its
        action, and the first action that returns a true value ends the
        processing of that block.

        NOTE(review): ``ListRule`` in rules.py only closes a list when a
        later non-list block arrives, so input that ends inside a list
        leaves the list element unclosed - confirm whether that needs
        handling here.
        """
        self.handler.start('document')
        for block in blocks(file):
            for text_filter in self.filters:
                block = text_filter(block, self.handler)
            for rule in self.rules:
                if rule.condition(block):
                    if rule.action(block, self.handler):
                        break
        self.handler.end('document')


class BasicTextParser(Parser):
    """Plain-text parser preloaded with the standard rules and filters."""

    def __init__(self, handler):
        super().__init__(handler)
        for rule in rule_list:
            self.addRule(rule)
        self.addFilter(r'\*(.+?)\*', 'emphasis')
        self.addFilter(r'(http://[\.a-zA-Z/]+)', 'url')
        self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')


def main():
    """Run the program: read text from stdin and print the HTML to stdout."""
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    parser.parse(sys.stdin)


if __name__ == '__main__':
    main()
使用:
python markup.py < test.txt > test.html
<
为重定向命令符,将 test.txt
文件内容作为标准输入
>
也是重定向命令符,将标准输出(即程序运行时 print
方法打印到终端的内容)重定向到文件 test.html
中