# this html module is in the public domain
# Sam Watkins, 2012
from l import *
def split(raw_html):
    """Break an HTML string into a flat list of tag and text chunks.

    Carriage returns are removed and non-breaking spaces (U+00A0) are
    replaced by ordinary spaces before splitting.  The text is then cut
    into "<...>" tags and the runs of text between them, and every chunk
    is stripped of leading and trailing whitespace.  A chunk that held
    only whitespace therefore comes out as an empty string.

    Returns a list of str.
    """
    # Each step must build on the previous result; restarting from
    # raw_html would silently discard the carriage-return removal.
    html = raw_html.replace('\r', '')
    # Written as an escape so the non-breaking space is visible in source.
    html = html.replace('\xa0', ' ')
    # A tag swallows the whitespace that follows it; anything else up to
    # the next '<' is one text chunk.
    chunks = re.findall(r'<.*?>\s*|[^<]+', html)
    return [re.sub(r'^\s+|\s+$', '', chunk) for chunk in chunks]
def parse_lines(lines, do_expand_singles=True):
    """Run parse_line() over every chunk and return the resulting list.

    When do_expand_singles is true (the default) the list is passed
    through expand_singles() before being returned.
    """
    records = [parse_line(chunk) for chunk in lines]
    if not do_expand_singles:
        return records
    return expand_singles(records)
def parse_line(l):
    """Classify one chunk as a tag or as text.

    Returns a 6-tuple (text, tag, open, close, attrs, raw):

    * text  -- the chunk itself for a text chunk, otherwise None
    * tag   -- the tag name for a tag chunk, otherwise None
    * open  -- True for an opening tag ("<td>", "<br/>"), False for a
               closing tag, None for text
    * close -- True for a closing tag ("</td>") and for a self-closing
               tag ("<br/>"), False otherwise, None for text
    * attrs -- whatever follows the tag name, up to but excluding the
               final "/>" or ">"; None for text
    * raw   -- the chunk exactly as passed in

    A chunk that starts with "<" but contains no tag name (for example
    "<>") is returned as text rather than raising.
    """
    text = tag = is_open = is_close = attrs = None
    if l.startswith('<'):
        found = re.search(r'([^<>/\s]+)(.*?)/?>$', l)
        if found is None:
            # No tag name to extract: keep the chunk as plain text.
            text = l
        else:
            tag, attrs = found.groups()
            # Only a "</" prefix marks a closing tag.
            is_close = l.startswith('</')
            is_open = not is_close
            # A self-closing tag counts as both opening and closing.
            if re.search(r'/>$', l):
                is_close = True
    else:
        text = l  # disentify? squeeze spaces? maybe not yet
    return (text, tag, is_open, is_close, attrs, l)
def expand_singles(parsed_lines):
    """Replace each entry flagged both open and close by an open/close pair.

    The opening half keeps the attributes and raw text of the original
    entry; the closing half carries attrs=None and an empty raw string.
    All other entries are copied through as tuples.
    """
    expanded = []
    for text, tag, is_open, is_close, attrs, raw in parsed_lines:
        if not (is_open and is_close):
            expanded.append((text, tag, is_open, is_close, attrs, raw))
            continue
        expanded.extend([
            (text, tag, True, False, attrs, raw),
            (text, tag, False, True, None, ''),  # hmmm...!
        ])
    return expanded
def tidy_html_text(t):
    """Trim *t* and collapse every remaining whitespace run to one space."""
    trimmed = re.sub(r'^\s+|\s+$', '', t)
    return re.sub(r'\s+', ' ', trimmed)
def _store_cell(row, value, value_type, colspan, with_th, tidy_values):
    """Append a finished cell to *row* and return the row.

    A 'th' cell is skipped unless with_th is true.  If no row is open
    (row is None) a new one is started, so a cell that appears outside
    any <tr> is kept instead of raising.
    """
    if value_type == 'td' or (value_type == 'th' and with_th):
        if tidy_values:
            value = tidy_html_text(value)
        if row is None:
            row = []
        row.append(value)
        # A cell spanning N columns is padded with N-1 empty strings.
        row.extend([''] * (colspan - 1))
    return row


def extract_table_data(parsed, with_th=True, tidy_values=True, a=False):
    """Collect the cell text of table rows from a parsed tag/text stream.

    parsed      -- iterable of (text, tag, start, end, attrs, raw) tuples,
                   the shape returned by parse_line()
    with_th     -- when false, the contents of <th> cells are dropped
    tidy_values -- when true, each cell value goes through tidy_html_text()
    a           -- when true, "[url]" is appended to the cell text for the
                   href of every <a> opened inside a cell

    Returns a list of rows, each row a list of strings.  A cell with
    colspan=N is followed by N-1 empty strings.

    This does not preserve tags within table cells.  A cell is finished
    by the next tr/td/th tag (opening or closing) or by the end of the
    input; a row is finished by the next tr tag or by the end of the
    input.  Anchors outside a cell are ignored.
    """
    rows = []
    row = None         # cells of the row being built; None outside a row
    value = None       # text of the cell being built; None outside a cell
    value_type = None  # 'td' or 'th' while inside a cell
    colspan = None
    # this is 'event driven', not ideal
    for (text, tag, start, end, attrs, raw) in parsed:
        tr = tag == 'tr'
        cell = tag == 'td' or tag == 'th'

        # Text inside a cell: accumulate, space separated.
        if text is not None and value_type:
            value = text if value == '' else value + " " + text

        # Any row/cell tag terminates the cell currently being built.
        if (tr or cell) and value is not None:
            row = _store_cell(row, value, value_type, colspan,
                              with_th, tidy_values)
            value = value_type = colspan = None

        if start and cell:
            value = ""
            value_type = tag  # td or th
            colspan = 1
            # not quite right!
            match = re.search(r'colspan=["\']?(\d+)', attrs or '')
            if match:
                colspan = int(match.group(1))

        # Links are only recorded inside a cell (value is None elsewhere).
        if a and start and tag == 'a' and value is not None:
            # attrs never contains the closing '>', so also accept the
            # end of the string as the end of an unquoted href.
            match = re.search(r'href=["\']?(.*?)(?:["\' >]|$)', attrs or '')
            if match:
                url = match.group(1)
                # TODO disentify
                if value:
                    value += " "
                value += "[%s]" % url

        if tr:
            # Both <tr> and </tr> flush the row in progress, if any.
            if row is not None:
                rows.append(row)
            if start:
                row = []
            if end:
                row = None

    # Flush whatever was still open when the input ended.
    if value is not None:
        row = _store_cell(row, value, value_type, colspan,
                          with_th, tidy_values)
    if row is not None:
        rows.append(row)
    return rows
def parse_tables(html, need_th=True, with_th=True, tidy_values=True, a=False):
    """Find the tables in an HTML string and extract their cell text.

    html        -- the HTML source
    need_th     -- when true, keep only tables that contain a <th> tag
    with_th, tidy_values, a -- passed through to extract_table_data()

    Only tables without other tables in them are returned: every opening
    <table> restarts the capture, and a closing </table> with no capture
    in progress is ignored.

    Returns (data, raw_tables).  raw_tables holds, per table, the slice of
    parsed entries from its <table> to its </table> inclusive; data holds
    the matching extract_table_data() result for each of them.
    """
    parsed = parse_lines(split(html))
    raw_tables = []
    # Must start out as None: a <th> or </table> can occur before any
    # <table> has been opened.
    table_start = None
    has_th = None
    # This is kind of bad, being 'event driven', rather than procedural.
    for i, (_text, tag, start, end, _attrs, _raw) in enumerate(parsed):
        if tag == 'table' and start:
            table_start = i
            has_th = False
        if tag == 'th' and table_start is not None:
            has_th = True
        if tag == 'table' and end and table_start is not None:
            if has_th or not need_th:
                raw_tables.append(parsed[table_start:i + 1])
            table_start = None
            has_th = None
    data = [
        extract_table_data(t, with_th=with_th, tidy_values=tidy_values, a=a)
        for t in raw_tables
    ]
    return data, raw_tables