#@+leo-ver=5-thin
#@+node:ekr.20120519121124.9919: * @file ../external/leosax.py
#@@language python
#@@killbeautify
#@+others
#@+node:ekr.20120519121124.9920: ** leosax declarations
"""Read .leo files into a simple python data structure with
h, b, u (unknown attribs), gnx and children information.
Clones and derived files are ignored. Useful for scanning
multiple .leo files quickly.
"""
from __future__ import print_function
import leo.core.leoGlobals as g
from xml.sax.handler import ContentHandler
from xml.sax import parseString
from pickle import loads
from binascii import unhexlify
#@+node:ekr.20120519121124.9921: ** class LeoNode
class LeoNode(object):
    """Representation of a Leo node. Root node has itself as parent.

    :IVariables:
        children
            python list of child LeoNode objects
        u
            unknownAttributes dict (decoded)
        unknownAttributes
            alias of ``u`` (the very same dict), kept for compatibility
        h
            headline; a list of text chunks while the node is being
            read, replaced by a single string once the reader has
            finished the node
        b
            body text, stored as the list of text chunks appended by
            the reader (join them to get the full body)
        gnx
            node id
        parent
            node's parent
        path
            list of nodes that lead to this one from root, including
            this one
    """
    #@+others
    #@+node:ekr.20120519121124.9922: *3* __init__
    def __init__(self):
        """Set ivars"""
        self.children = []
        self.u = {}
        self.unknownAttributes = self.u  # for compatibility
        self.h = []
        self.b = []
        self.gnx = None
        self.parent = self  # a node that is its own parent is a root
        self.path = []
    #@+node:ekr.20120519121124.9923: *3* __str__
    def __str__(self, level=0):
        """Return long text representation of node and
        descendents with indentation.

        Each node contributes its headline and gnx, one line per
        unknown attribute, and at most the first five lines of its
        body.  Every emitted line is truncated to 78 characters.

        :param int level: nesting depth, used to compute indentation
        :return: the multi-line representation
        :rtype: str
        """
        indent = ' ' * (level + 1)
        out = [("%s%s (%s)" % (' ' * (level - 1), self.h, self.gnx))[:78]]
        for key in self.u:
            out.append(("%s@%s: %s" % (indent, key, repr(self.u[key])))[:78])
        # self.b holds arbitrary text chunks, not lines: join the chunks
        # first so that "first five" really means the first five lines
        # and no chunk smuggles an embedded newline into the output.
        for text in ''.join(self.b).splitlines()[:5]:
            out.append((indent + '|' + text)[:78])
        for child in self.children:
            out.append(child.__str__(level=level + 1))
        return '\n'.join(out)
    #@+node:ekr.20120519121124.9924: *3* UNL
    def node_pos_count(self, node):
        """node_pos_count - return the position (index) and count of
        preceding siblings with the same name, also return headline

        :param LeoNode node: node to characterize
        :return: h, pos, count
        :rtype: (str, int, int)
        :raises ValueError: if node is not among its parent's children
            (this is the case for a root node)
        """
        siblings = node.parent.children
        pos = siblings.index(node)
        count = sum(1 for sib in siblings[:pos] if sib.h == node.h)
        return node.h, pos, count

    def UNL(self):
        """Return the UNL string leading to this node.

        One ``headline:pos,count`` part is produced for every node in
        ``self.path``; the parts are joined with ``-->``.
        """
        return '-->'.join(
            "%s:%d,%d" % self.node_pos_count(step) for step in self.path
        )
    #@+node:ekr.20120519121124.9925: *3* flat
    def flat(self):
        """iterate this node and all its descendants in a flat list,
        useful for finding things and building an UNL based view.

        The root node (the node that is its own parent) is not
        yielded itself, only its descendants are.
        """
        if self.parent is not self:
            yield self
        for child in self.children:
            for descendant in child.flat():
                yield descendant
    #@-others
#@+node:ekr.20120519121124.9926: ** class LeoReader
class LeoReader(ContentHandler):
    """Read .leo files into a simple python data structure with
    h, b, u (unknown attribs), gnx and children information.
    Clones and derived files are ignored. Useful for scanning
    multiple .leo files quickly.

    :IVariables:
        root
            root node
        cur
            used internally during SAX read
        idx
            mapping from gnx to node
        `in_`
            name of XML element we're currently in, used for SAX read
        in_attrs
            attributes of element tag we're currently in, used for
            SAX read
        path
            list of nodes leading to current node
    """
    #@+others
    #@+node:ekr.20120519121124.9927: *3* __init__
    def __init__(self, *args, **kwargs):
        """Set ivars"""
        ContentHandler.__init__(self, *args, **kwargs)
        self.root = LeoNode()
        self.root.h = g.u('ROOT')
        # changes type from [] to str, done by endElement() for other vnodes
        self.cur = self.root
        self.idx = {}
        self.in_ = None
        self.in_attrs = {}
        self.path = []
    #@+node:ekr.20120519121124.9928: *3* startElement
    def startElement(self, name, attrs):
        """collect information from v and t elements

        A ``v`` element creates a new node below the current one and
        makes it current.  A ``t`` element copies its attributes
        (except ``tx``) into the ``u`` dict of the node whose gnx is
        ``tx``; a ``t`` element with no matching node is ignored.
        """
        self.in_ = name
        self.in_attrs = attrs
        if name == 'v':
            nd = LeoNode()
            self.cur.children.append(nd)
            nd.parent = self.cur
            self.cur = nd
            # When several v elements share a gnx the last one read
            # replaces the earlier ones in the index.
            self.idx[attrs['t']] = nd
            nd.gnx = attrs['t']
            self.path.append(nd)
            nd.path = self.path[:]  # snapshot, self.path keeps changing
        if name == 't':
            nd = self.idx.get(attrs['tx'])
            if nd is None:
                # No v element declared this gnx: there is no node to
                # attach the data to, so skip it instead of aborting
                # the whole parse with a KeyError.
                return
            for k in attrs.keys():
                if k != 'tx':
                    nd.u[k] = attrs[k]
    #@+node:ekr.20120519121124.9929: *3* endElement
    def endElement(self, name):
        """decode unknownAttributes when t element is done

        Closing a ``v`` element also joins the collected headline
        chunks into one string and moves back up to the parent node.
        """
        self.in_ = None
        # could maintain a stack, but we only need to know for
        # character collection, so it doesn't matter
        if name == 'v':
            self.cur.h = g.u('').join(self.cur.h)
            self.cur = self.cur.parent
            if self.path:
                del self.path[-1]
        if name == 't':
            # t elements have no child elements here, so in_attrs still
            # holds the attributes of the t element being closed.
            nd = self.idx.get(self.in_attrs['tx'])
            if nd is None:
                return  # t element without a matching v element
            # Iterate over a copy: values are replaced inside the loop.
            for k, s in list(nd.u.items()):
                if k.startswith('str_'):
                    continue  # 'str_' attributes are kept as plain text
                try:
                    # SECURITY: pickle.loads() can execute arbitrary
                    # code; only read .leo files from trusted sources.
                    nd.u[k] = loads(unhexlify(s))
                except Exception:
                    # Best effort: a value that is not a hex-encoded
                    # pickle is left as the raw attribute string.
                    pass
    #@+node:ekr.20120519121124.9930: *3* characters
    def characters(self, content):
        """collect body text and headlines

        Text may arrive in several calls, so chunks are appended to
        the ``h`` / ``b`` lists of the target node.
        """
        if self.in_ == 'vh':
            self.cur.h.append(content)
        elif self.in_ == 't':
            nd = self.idx.get(self.in_attrs['tx'])
            if nd is not None:  # ignore text of t elements without a node
                nd.b.append(content)
    #@-others
#@+node:ekr.20120519121124.9931: ** get_leo_data
def get_leo_data(source):
    """Return the root node for the specified .leo data.

    :param source: either the path of an existing .leo file, or the
        XML content of a .leo file itself
    :return: root node of the tree that was read
    :rtype: LeoNode
    """
    handler = LeoReader()
    # An existing file name is replaced by the encoded content of that
    # file; anything else is handed to the XML parser unchanged.
    xml_data = (
        g.readFileIntoEncodedString(source)
        if g.os_path_isfile(source)
        else source
    )
    parseString(xml_data, handler)
    return handler.root
#@-others
#@@language python
#@@tabwidth -4
#@@pagewidth 70
if __name__ == '__main__':
    import sys
    # Dump the .leo file named on the command line if it exists,
    # otherwise fall back to the user's workbook.
    if len(sys.argv) > 1 and g.os_path_isfile(sys.argv[1]):
        wb = sys.argv[1]
    else:
        wb = g.os_path_expanduser(
            g.os_path_join('~', '.leo', 'workbook.leo')
        )
    if not g.os_path_isfile(wb):
        sys.exit("leosax: no such file: %s" % wb)
    # Pass the path, not the file's text: get_leo_data() reads the file
    # itself, so the whole file content is never tested as a file name.
    leo_data = get_leo_data(wb)
    print(leo_data)
#@-leo