Uriel Web Browser for TempleOS
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

206 lines
8.1 KiB

import bs4
import re, sys
# Placeholder pair that stands in for an image until the final expansion pass.
_IMG_OPEN = '[URIEL_IMG]'
_IMG_CLOSE = '[/URIEL_IMG]'

# One markup tag.  A '<' that is never closed swallows the rest of the text,
# so stripping always terminates.
_TAG_RE = re.compile(r'<[^>]*(?:>|\Z)')

# Number of output lines counted (image line included) before the closing
# IMIE anchor of a left-justified image is emitted.
# NOTE(review): presumably the image height in text rows -- confirm.
_LEFT_IMAGE_LINES = 11

# Literal substitutions applied to the raw page before it is parsed.
_PRE_PARSE_REPLACEMENTS = (
    ('<blockquote>', ' '),
    ('<BLOCKQUOTE>', ' '),
    ('<br>', '\n'),
    ('<br/>', '\n'),
    ('<br />', '\n'),
    ('<BR>', '\n'),
    ('<BR/>', '\n'),
    ('<BR />', '\n'),
    ('<li>', ' * '),
    ('<LI>', ' * '),
    ('</img>', ''),
    ('</IMG>', ''),
)

# Inline formatting tags mapped to their DolDoc equivalents.
_INLINE_REPLACEMENTS = (
    ('<h1>', '$PURPLE$'),
    ('<H1>', '$PURPLE$'),
    ('</h1>', '$BLACK$'),
    ('</H1>', '$BLACK$'),
    ('<u>', '$UL,1$'),
    ('<U>', '$UL,1$'),
    ('</u>', '$UL,0$'),
    ('</U>', '$UL,0$'),
    ('<b>', '$IV,1$'),
    ('<B>', '$IV,1$'),
    ('</b>', '$IV,0$'),
    ('</B>', '$IV,0$'),
)

# Closing tags of block/table elements mapped to whitespace.
_BLOCK_END_REPLACEMENTS = (
    ('</div>', '\n'),
    ('</DIV>', '\n'),
    ('</td>', ' '),
    ('</TD>', ' '),
    ('</tr>', '\n'),
    ('</TR>', '\n'),
)

# Entity decoding; '&amp;' must come after '&lt;'/'&gt;' so that '&amp;lt;'
# ends up as the literal text '&lt;'.
_ENTITY_REPLACEMENTS = (
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&amp;', '&'),
    ('&apos;', '\''),
    ('&quot;', '"'),
)

_UNWRAP_TAGS = ('html', 'body', 'p', 'b', 'pre', 'span', 'table', 'header')
_REMOVE_TAGS = ('svg', 'embed', 'head', 'noscript', 'object', 'param',
                'script', 'option')


def _find_ci(text, needle, start=0):
    """Return the index of the first ASCII-case-insensitive match, or -1.

    Searching ``text.upper()`` and using the result as an index into ``text``
    is unsafe because upper-casing can change the length of a string
    (e.g. 'ß' becomes 'SS'); matching on the original text keeps offsets valid.
    """
    pattern = re.compile(re.escape(needle), re.IGNORECASE | re.ASCII)
    match = pattern.search(text, start)
    return match.start() if match else -1


def _end_after(text, needle, start):
    """Return the index just past ``needle`` at/after ``start``; len(text) if absent."""
    pos = _find_ci(text, needle, start)
    return len(text) if pos == -1 else pos + len(needle)


def _replace_all(text, pairs):
    """Apply each (old, new) literal replacement to ``text`` in order."""
    for old, new in pairs:
        text = text.replace(old, new)
    return text


def _open_tag(html, tag_pos):
    """Return the opening tag starting at ``tag_pos`` without its closing '>'."""
    end = html.find('>', tag_pos)
    return html[tag_pos:] if end == -1 else html[tag_pos:end]


def _leading_text(html, tag_pos):
    """Return the text between the opening tag's '>' and the next '>'."""
    open_end = html.find('>', tag_pos)
    if open_end == -1:
        return ''
    next_gt = html.find('>', open_end + 1)
    if next_gt == -1:
        return html[open_end + 1:]
    return html[open_end + 1:next_gt]


def _cut_before(text, closer):
    """Return ``text`` up to ``closer`` (case-insensitive); all of it if absent."""
    pos = _find_ci(text, closer)
    return text if pos == -1 else text[:pos]


def _quoted_attr(tag_text, name):
    """Return the quoted value of the first ``name=`` in ``tag_text`` or ''.

    The value is delimited by whichever quote character follows the '=' so
    that apostrophes inside a double-quoted value survive.  The result is
    cut at an embedded double quote because callers place it unescaped
    inside a double-quoted DolDoc string.
    """
    pos = _find_ci(tag_text, name + '=')
    if pos == -1:
        return ''
    value_start = pos + len(name) + 1
    quote = tag_text[value_start:value_start + 1]
    if quote != '"' and quote != "'":
        return ''
    value_end = tag_text.find(quote, value_start + 1)
    if value_end == -1:
        return ''
    return tag_text[value_start + 1:value_end].split('"')[0]


def _toolbar(url_prompt, title_text):
    """Build the [Close]/[Back]/[Fwd]/[Go] header line followed by the title."""
    return ('$WW,1$$BLACK$$MA+LIS,"[Close]",LM="CloseBrowser;"$ '
            '$MA+LIS,"[Back]",LM="Browser(\\"h:back\\");"$ '
            '$MA+LIS,"[Fwd]",LM="Browser(\\"h:fwd\\");"$ '
            '$MA+LIS,"[Go]",LM="Browser(GetStr(\\"' + url_prompt + '\\"));"$ '
            + title_text + '\n\n')


def _download_page(l_url):
    """Build the body shown for content that is not an HTML page."""
    file_name = l_url.split('/')[-1]
    dl_link = ('$AN,"",A="BINARY"$Click $MA+LIS,"[Here]",LM="Get(\\"retry:send\\",\\"~/Downloads/'
               + file_name + '\\");"$ to download the file:\n\n' + l_url)
    # URLs whose last extension is Z (any case) use the retry:sendZ action.
    if l_url.split('.')[-1].upper() == 'Z':
        dl_link = dl_link.replace('retry:send', 'retry:sendZ')
    return dl_link


def _extract_title(markup):
    """Return the text following <title> up to the next '</' or '>', or ''."""
    pos = _find_ci(markup, '<TITLE>')
    if pos == -1:
        return ''
    start = pos + len('<TITLE>')
    end = markup.find('</', start)
    if end == -1:
        end = len(markup)
    return markup[start:end].split('>')[0]


def _simplify_markup(markup):
    """Parse with BeautifulSoup, flatten wrapper tags and drop unwanted ones."""
    soup = bs4.BeautifulSoup(markup, 'lxml')
    for name in _UNWRAP_TAGS:
        for node in soup.find_all(name):
            node.unwrap()
    # Inside links only images are kept; every other nested element is removed.
    for anchor in soup.find_all('a'):
        for node in anchor.find_all(True):
            # Nodes that render as '<None>' have no usable tag name
            # (presumably already destroyed together with their parent).
            if '<None>' not in str(node) and node.name.upper() != 'IMG':
                node.decompose()
    for name in _REMOVE_TAGS:
        for node in soup(name):
            node.extract()
    return str(soup)


def _replace_images(html):
    """Swap every <img ...> tag for a [URIEL_IMG]src[/URIEL_IMG] placeholder."""
    pos = _find_ci(html, '<IMG ')
    while pos != -1:
        marker = _IMG_OPEN + _quoted_attr(_open_tag(html, pos), 'SRC') + _IMG_CLOSE
        html = html[:pos] + marker + html[_end_after(html, '>', pos):]
        pos = _find_ci(html, '<IMG ', pos + len(marker))
    return html


def _replace_buttons(html):
    """Swap every <button ...>label</button> for a DolDoc $BT$ button."""
    pos = _find_ci(html, '<BUTTON ')
    while pos != -1:
        label = _cut_before(_leading_text(html, pos), '</BUTTON')
        button = '$BT,"' + label.replace('"', '\\"') + '"$'
        html = html[:pos] + button + html[_end_after(html, '</BUTTON>', pos):]
        pos = _find_ci(html, '<BUTTON ', pos + len(button))
    return html


def _pull_image_markers(text):
    """Split ``text`` into (image placeholders each followed by ' ', remaining text)."""
    markers = ''
    start = text.find(_IMG_OPEN)
    while start != -1:
        close = text.find(_IMG_CLOSE, start)
        end = len(text) if close == -1 else close + len(_IMG_CLOSE)
        markers += text[start:end] + ' '
        text = text[:start] + text[end:]
        start = text.find(_IMG_OPEN)
    return markers, text


def _replace_links(html):
    """Swap every <a ...>text</a> for a numbered DolDoc anchor plus Navigate link."""
    count = 0
    pos = _find_ci(html, '<A ')
    while pos != -1:
        text = _cut_before(_leading_text(html, pos), '</A')
        # Images inside the link are emitted in front of it, not as its label.
        images, text = _pull_image_markers(text)
        text = text.replace('"', '\\"')
        # Only the opening tag is inspected so the word "href" in the link
        # text is never mistaken for the attribute.
        href = _quoted_attr(_open_tag(html, pos), 'HREF')
        link = ('$AN,"",A="A' + str(count) + '"$$MA+LIS,"' + text
                + '",LM="Navigate(\\"A' + str(count) + '\\",\\"' + href + '\\");"$')
        replacement = images + link
        html = html[:pos] + replacement + html[_end_after(html, '</A>', pos):]
        count += 1
        pos = _find_ci(html, '<A ', pos + len(replacement))
    return html


def _replace_centered(html):
    """Swap every <center>text</center> for centred DolDoc text."""
    pos = _find_ci(html, '<CENTER>')
    while pos != -1:
        text = _cut_before(_leading_text(html, pos), '</CENTER')
        text = text.replace('"', '\\"')
        # Image placeholders are passed through untouched instead of centred.
        if _find_ci(text, _IMG_OPEN) == -1:
            text = '$TX+CX,"' + text + '"$'
        html = html[:pos] + text + html[_end_after(html, '</CENTER>', pos):]
        pos = _find_ci(html, '<CENTER>', pos + len(text))
    return html


def _replace_inputs(html):
    """Swap every <input ...> tag for the DolDoc widget matching its type."""
    pos = _find_ci(html, '<INPUT ')
    while pos != -1:
        tag = _open_tag(html, pos)
        value = _quoted_attr(tag, 'VALUE').replace('"', '\\"')
        widget = '[$UL,1$ $UL,0$]'  # default: an underlined blank field
        # The checks are deliberately sequential: a later match overrides
        # an earlier one, exactly as the substring tests are ordered.
        if 'button' in tag:
            widget = '$BT,"' + (value or 'Button') + '"$'
        if 'checkbox' in tag:
            widget = '$CB$'
        if 'hidden' in tag:
            widget = ''
        if 'submit' in tag:
            widget = '$BT,"' + (value or 'Submit') + '"$'
        html = html[:pos] + widget + html[_end_after(html, '>', pos):]
        pos = _find_ci(html, '<INPUT ', pos + len(widget))
    return html


def _expand_image_markers(html):
    """Turn each image placeholder into a numbered [IMG] anchor/InsertImg link."""
    count = 0
    start = html.find(_IMG_OPEN)
    while start != -1:
        close = html.find(_IMG_CLOSE, start)
        if close == -1:
            # Unterminated placeholder: the rest of the text is the URL.
            img_url, tail = html[start + len(_IMG_OPEN):], ''
        else:
            img_url = html[start + len(_IMG_OPEN):close]
            tail = html[close + len(_IMG_CLOSE):]
        num = str(count)
        anchor = ('$AN,"",A="IMG' + num + '"$$MA+LIS,"[IMG]",LM="InsertImg(\\"IMG' + num
                  + '\\",\\"$IMIS$\\",\\"$IMIE$\\",\\"' + img_url
                  + '.uriel_img\\");"$$AN,"",A="IMIS' + num + '"$')
        html = html[:start] + anchor + tail
        count += 1
        start = html.find(_IMG_OPEN, start + len(anchor))
    return html


def _anchor_left_images(html):
    """Resolve the $IMIS$/$IMIE$ placeholders line by line.

    A line that begins with an image anchor is treated as a left-justified
    image: its placeholders become IMIS<n>/IMIE<n> and the matching IMIE<n>
    anchor is prefixed to the line reached once more than
    ``_LEFT_IMAGE_LINES`` lines have been counted.  Placeholders of all
    other images are blanked.
    NOTE(review): if the text ends before that line is reached no IMIE
    anchor is emitted -- confirm the consumer tolerates this.
    """
    out = []
    image_id = ''
    counted = 0
    in_block = False
    for line in html.split('\n'):
        if not in_block and line.startswith('$AN,"",A="IMG'):
            # Left-justified image detected.
            image_id = line.split('IMG')[1].split('"')[0]
            line = line.replace('$IMIS$', 'IMIS' + image_id)
            line = line.replace('$IMIE$', 'IMIE' + image_id)
            in_block = True
        if in_block:
            counted += 1
            if counted > _LEFT_IMAGE_LINES:
                line = '$AN,"",A="IMIE' + image_id + '"$' + line
                image_id = ''
                counted = 0
                in_block = False
        line = line.replace('$IMIS$', '').replace('$IMIE$', '')
        out.append(line + '\n')
    return ''.join(out)


def preprocess(htm1, l_url):
    """Convert a fetched document into DolDoc markup for the browser.

    Args:
        htm1: The fetched content as text.
        l_url: The URL the content was fetched from.

    Returns:
        A string starting with the navigation toolbar.  When ``htm1``
        contains no ``<html`` tag (any case) the body is a download prompt
        for ``l_url``; otherwise it is the page converted tag by tag into
        DolDoc commands, with all remaining markup stripped and the basic
        character entities decoded.

    Unterminated tags or elements are treated as running to the end of the
    document, so malformed markup cannot stall the conversion.
    """
    html_pos = _find_ci(htm1, '<HTML')
    if html_pos == -1:
        # NOTE(review): this toolbar's URL prompt starts with a newline while
        # the HTML-page toolbar's does not; kept as found.
        return _toolbar('\nURL> ', '') + _download_page(l_url)

    # '$' introduces DolDoc commands, so literal ones in the page are doubled.
    htm1 = htm1[html_pos:].replace('$', '$$')
    htm1 = _replace_all(htm1, _PRE_PARSE_REPLACEMENTS)
    title_text = _extract_title(htm1)

    html = _simplify_markup(htm1)
    html = _replace_all(html, _INLINE_REPLACEMENTS)
    # Order matters: images become placeholders first so that the link and
    # <center> passes can recognise them inside their text.
    html = _replace_images(html)
    html = _replace_buttons(html)
    html = _replace_links(html)
    html = _replace_centered(html)
    html = _replace_all(html, _BLOCK_END_REPLACEMENTS)
    html = _replace_inputs(html)

    # Drop whatever markup is left, then decode entities (after stripping, so
    # a decoded '<' is never mistaken for a tag).
    html = _TAG_RE.sub('', html)
    html = _replace_all(html, _ENTITY_REPLACEMENTS)

    html = _expand_image_markers(html)
    return _toolbar('URL> ', title_text) + _anchor_left_images(html)