You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
444 lines
16 KiB
444 lines
16 KiB
import bs4 |
|
import re |
|
|
|
URIEL_GETPAGE = 0x10 |
|
URIEL_NAVBACK = 0x11 |
|
URIEL_NAVFWD = 0x12 |
|
URIEL_THUMB = 0x13 |
|
URIEL_DOWNLOAD = 0x14 |
|
|
|
class Uriel: |
|
download_buffer = '' |
|
user_agent = '' |
|
history = [] |
|
nav_index = -1 |
|
class rel: |
|
scheme = '' |
|
netloc = '' |
|
path = '' |
|
|
|
def uriel(data): |
|
if data == URIEL_GETPAGE: |
|
UrielGetPage() |
|
if data == URIEL_NAVBACK: |
|
UrielNavBack() |
|
if data == URIEL_NAVFWD: |
|
UrielNavFwd() |
|
if data == URIEL_THUMB: |
|
UrielThumb() |
|
if data == URIEL_DOWNLOAD: |
|
UrielDownload() |
|
|
|
def UrielGetPage(): |
|
global Uriel |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
HGBD_PARAM_BUF = os.read(HGBD,BLK_SIZE) |
|
os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) |
|
HGBD_URL_BUF = os.read(HGBD,BLK_SIZE*4) |
|
if Uriel.user_agent == '': |
|
Uriel.user_agent = HGBD_PARAM_BUF[:HGBD_PARAM_BUF.find('\x00')] |
|
url_comp = urlparse.urlparse(HGBD_URL_BUF[:HGBD_URL_BUF.find('\x00')]) |
|
scheme = '' |
|
netloc = '' |
|
path = '' |
|
if url_comp.scheme == '': |
|
scheme = Uriel.rel.scheme |
|
else: |
|
scheme = url_comp.scheme |
|
Uriel.rel.scheme = url_comp.scheme |
|
if url_comp.netloc == '': |
|
netloc = Uriel.rel.netloc |
|
else: |
|
netloc = url_comp.netloc |
|
Uriel.rel.netloc = url_comp.netloc |
|
if url_comp.path != '': |
|
if url_comp.path.find('/') != -1: |
|
path = url_comp.path |
|
Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] |
|
else: |
|
path = Uriel.rel.path + url_comp.path |
|
url = scheme + "://" + netloc + urllib.quote(path) |
|
pagedata = subprocess.Popen('wget -q -O - -U "' + Uriel.user_agent + '" "' + url + '" 2>/dev/null', shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate()[0] |
|
filedata = UrielPreProcess(pagedata, url) |
|
filesize = len(filedata) |
|
if filesize>0: |
|
if filedata.find('$AN,"",A="BINARY"$') != -1: |
|
Uriel.download_buffer = pagedata |
|
if url[-2:].upper()==".Z": |
|
tmp_z_file = "/tmp/" + str(uuid.uuid4()).split('-')[0].upper() + ".Z" |
|
while os.path.exists(tmp_z_file): |
|
tmp_z_file = "/tmp/" + str(uuid.uuid4()).split('-')[0].upper() + ".Z" |
|
open(tmp_z_file,"wb").write(pagedata) |
|
try: |
|
os.system('tosz "' + tmp_z_file + '"') |
|
pagedata = open(tmp_z_file.split('.Z')[0],"rb").read() |
|
os.remove(tmp_z_file.split('.Z')[0]) |
|
Uriel.download_buffer = pagedata |
|
except: |
|
Uriel.download_buffer = '' |
|
Uriel.nav_index += 1 |
|
Uriel.history = Uriel.history[0:Uriel.nav_index] |
|
Uriel.history.append({'url':url, 'filedata':filedata}) |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) |
|
os.write(HGBD,filedata) |
|
logger.info("[Uriel] navigate to " + url) |
|
else: |
|
filesize = -1 |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
logger.error("[Uriel] error reading url " + url) |
|
conn.send(chr(URIEL_GETPAGE)) |
|
|
|
def UrielNavBack(): |
|
global Uriel |
|
if Uriel.nav_index > 0: |
|
Uriel.nav_index -= 1 |
|
url_comp = urlparse.urlparse(Uriel.history[Uriel.nav_index]['url']) |
|
filedata = Uriel.history[Uriel.nav_index]['filedata'] |
|
scheme = '' |
|
netloc = '' |
|
path = '' |
|
if url_comp.scheme == '': |
|
scheme = Uriel.rel.scheme |
|
else: |
|
scheme = url_comp.scheme |
|
if url_comp.netloc == '': |
|
netloc = Uriel.rel.netloc |
|
else: |
|
netloc = url_comp.netloc |
|
if url_comp.path != '': |
|
if url_comp.path.find('/') != -1: |
|
path = url_comp.path |
|
else: |
|
path = Uriel.rel.path + url_comp.path |
|
url = scheme + "://" + netloc + urllib.quote(path) |
|
filesize = len(filedata) |
|
if filesize>0: |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) |
|
os.write(HGBD,filedata) |
|
logger.info("[Uriel] history navigate back to " + url) |
|
else: |
|
filesize = -1 |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
logger.error("[Uriel] error reading history for url " + url) |
|
conn.send(chr(URIEL_NAVBACK)) |
|
|
|
def UrielNavFwd(): |
|
global Uriel |
|
if Uriel.nav_index < len(Uriel.history)-1: |
|
Uriel.nav_index += 1 |
|
url_comp = urlparse.urlparse(Uriel.history[Uriel.nav_index]['url']) |
|
filedata = Uriel.history[Uriel.nav_index]['filedata'] |
|
scheme = '' |
|
netloc = '' |
|
path = '' |
|
if url_comp.scheme == '': |
|
scheme = Uriel.rel.scheme |
|
else: |
|
scheme = url_comp.scheme |
|
if url_comp.netloc == '': |
|
netloc = Uriel.rel.netloc |
|
else: |
|
netloc = url_comp.netloc |
|
if url_comp.path != '': |
|
if url_comp.path.find('/') != -1: |
|
path = url_comp.path |
|
else: |
|
path = Uriel.rel.path + url_comp.path |
|
url = scheme + "://" + netloc + urllib.quote(path) |
|
filesize = len(filedata) |
|
if filesize>0: |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) |
|
os.write(HGBD,filedata) |
|
logger.info("[Uriel] history navigate fwd to " + url) |
|
else: |
|
filesize = -1 |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
logger.error("[Uriel] error reading history for url " + url) |
|
conn.send(chr(URIEL_NAVFWD)) |
|
|
|
def UrielThumb(): |
|
global Uriel |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
HGBD_PARAM_BUF = os.read(HGBD,BLK_SIZE) |
|
os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) |
|
HGBD_URL_BUF = os.read(HGBD,BLK_SIZE*4) |
|
if Uriel.user_agent == '': |
|
Uriel.user_agent = HGBD_PARAM_BUF[:HGBD_PARAM_BUF.find('\x00')] |
|
url_comp = urlparse.urlparse(HGBD_URL_BUF[:HGBD_URL_BUF.find('\x00')]) |
|
scheme = '' |
|
netloc = '' |
|
path = '' |
|
if url_comp.scheme == '': |
|
scheme = Uriel.rel.scheme |
|
else: |
|
scheme = url_comp.scheme |
|
if url_comp.netloc == '': |
|
netloc = Uriel.rel.netloc |
|
else: |
|
netloc = url_comp.netloc |
|
if url_comp.path != '': |
|
if url_comp.path.find('/') != -1: |
|
path = url_comp.path |
|
else: |
|
path = Uriel.rel.path + url_comp.path |
|
url = scheme + "://" + netloc + urllib.quote(path) |
|
tmp_thumb = '/tmp/' + str(uuid.uuid4()) + '.bmp' |
|
while os.path.exists(tmp_thumb): |
|
tmp_thumb = '/tmp/' + str(uuid.uuid4()) + '.bmp' |
|
pagedata = subprocess.Popen('wget -q -O - -U "' + Uriel.user_agent + '" "' + url + '" 2>/dev/null | gm convert -resize 100x100 - -colors 16 "' + tmp_thumb + '"', shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate()[0] |
|
filedata = open(tmp_thumb,"rb").read() |
|
try: |
|
os.remove(tmp_thumb) |
|
except: |
|
pass |
|
filesize = len(filedata) |
|
if filesize>0: |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) |
|
os.write(HGBD,filedata) |
|
logger.info("[Uriel] get image thumbnail " + url) |
|
else: |
|
filesize = -1 |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
logger.error("[Uriel] error reading url " + url) |
|
conn.send(chr(URIEL_THUMB)) |
|
|
|
def UrielDownload(): |
|
global Uriel |
|
filedata = Uriel.download_buffer |
|
filesize = len(filedata) |
|
if filesize>0: |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) |
|
os.write(HGBD,filedata) |
|
logger.info("[Uriel] binary, download file") |
|
else: |
|
filesize = -1 |
|
ZeroParamBuf() |
|
os.lseek(HGBD,0,os.SEEK_SET) |
|
os.write(HGBD,str(filesize)) |
|
logger.error("[Uriel] error downloading file") |
|
conn.send(chr(URIEL_DOWNLOAD)) |
|
|
|
def UrielPreProcess(htm1, l_url): |
|
title_text = '' |
|
hb_header = '$WW,1$$BLACK$$MA+LIS,"[Close]",LM="U_CloseBrowser;"$ $MA+LIS,"[Back]",LM="U_HistNav(0);"$ $MA+LIS,"[Fwd]",LM="U_HistNav(1);"$ $MA+LIS,"[Go]",LM="U_Browser(GetStr(\\"\nURL> \\"));"$ ' + title_text + '\n\n' |
|
|
|
if htm1.upper().find('<HTML') == -1: |
|
dl_link = '$AN,"",A="BINARY"$Click $MA+LIS,"[Here]",LM="U_Download(\\"' + l_url + '\\");"$ to download the file:\n\n' + l_url |
|
return hb_header + dl_link |
|
|
|
htm1 = htm1[htm1.upper().find('<HTML'):] |
|
htm1 = htm1.replace('$', '$$') |
|
|
|
htm1 = htm1.replace('<blockquote>',' ') |
|
htm1 = htm1.replace('<BLOCKQUOTE>',' ') |
|
|
|
htm1 = htm1.replace('<br>','\n') |
|
htm1 = htm1.replace('<br/>','\n') |
|
htm1 = htm1.replace('<br />','\n') |
|
htm1 = htm1.replace('<BR>','\n') |
|
htm1 = htm1.replace('<BR/>','\n') |
|
htm1 = htm1.replace('<BR />','\n') |
|
|
|
htm1 = htm1.replace('<li>',' * ') |
|
htm1 = htm1.replace('<LI>',' * ') |
|
|
|
htm1 = htm1.replace('</img>','') |
|
htm1 = htm1.replace('</IMG>','') |
|
|
|
title_text = '' |
|
a_pos = htm1.upper().find('<TITLE>') |
|
if a_pos != -1: |
|
title_text = htm1[a_pos:htm1.find('</', a_pos)].split('>')[1] |
|
|
|
soup1 = bs4.BeautifulSoup(htm1, 'lxml') |
|
|
|
unwrap_tags = [ 'html', 'body', 'p', 'b', 'pre', 'span', 'table', 'header' ] |
|
for tag in unwrap_tags: |
|
for match in soup1.findAll(tag): |
|
match.unwrap() |
|
|
|
for f in soup1.findAll('a'): |
|
for tag in f.findAll(True): |
|
if str(tag).find('<None>') == -1: |
|
if tag.name.upper() != 'IMG': |
|
tag.decompose() |
|
|
|
remove_tags = [ 'svg', 'embed', 'head', 'noscript', 'object', 'param', 'script', 'option' ] |
|
for tag in remove_tags: |
|
[s.extract() for s in soup1(tag)] |
|
|
|
html = str(soup1) |
|
|
|
html = html.replace('<h1>','$PURPLE$') |
|
html = html.replace('<H1>','$PURPLE$') |
|
html = html.replace('</h1>','$BLACK$') |
|
html = html.replace('</H1>','$BLACK$') |
|
|
|
html = html.replace('<u>','$UL,1$') |
|
html = html.replace('<U>','$UL,1$') |
|
html = html.replace('</u>','$UL,0$') |
|
html = html.replace('</U>','$UL,0$') |
|
|
|
html = html.replace('<b>','$IV,1$') |
|
html = html.replace('<B>','$IV,1$') |
|
html = html.replace('</b>','$IV,0$') |
|
html = html.replace('</B>','$IV,0$') |
|
|
|
a_pos = html.upper().find('<IMG ') |
|
while a_pos != -1: |
|
img_text = html[a_pos:].split('>')[0] |
|
img_text.replace("'",'"') |
|
img_src = '' |
|
img_pos = img_text.upper().find('SRC') |
|
if img_pos > 0: |
|
img_src = img_text[img_text.upper().find('SRC'):].split('"')[1] |
|
img_el = '[URIEL_IMG]' + img_src + '[/URIEL_IMG]' |
|
html = html[:a_pos] + img_el + html[1+html.upper().find('>', a_pos):] |
|
a_pos = html.upper().find('<IMG ') |
|
|
|
a_pos = html.upper().find('<BUTTON ') |
|
while a_pos != -1: |
|
button_text = html[a_pos:].split('>')[1] |
|
button_text = button_text[:button_text.upper().find('</BUTTON')] |
|
button_text = button_text.replace('"','\\"') |
|
button_doctext = '$BT,"' + button_text + '"$' |
|
html = html[:a_pos] + button_doctext + html[9+html.upper().find('</BUTTON>', a_pos):] |
|
a_pos = html.upper().find('<BUTTON ') |
|
|
|
a_ctr = 0 |
|
a_pos = html.upper().find('<A ') |
|
while a_pos != -1: |
|
link_pre = '' |
|
link_text = html[a_pos:].split('>')[1] |
|
link_text = link_text[:link_text.upper().find('</A')] |
|
while link_text.find('[URIEL_IMG]') != -1: |
|
link_pre += link_text[link_text.find('[URIEL_IMG]'):12+link_text.find('[/URIEL_IMG]')] + ' ' |
|
link_text = link_text[:link_text.find('[URIEL_IMG]')] + link_text[12+link_text.find('[/URIEL_IMG]'):] |
|
link_text = link_text.replace('"','\\"') |
|
link_href = '' |
|
link_pos = html[a_pos:html.upper().find('</A>', a_pos)].upper().find('HREF') |
|
if link_pos > 0: |
|
link_href = html[a_pos:html.upper().find('</A>', a_pos)][link_pos:].replace('\'','"').split('"')[1] |
|
doldoc_link = '$AN,"",A="A' + str(a_ctr) + '"$$MA+LIS,"' + link_text + '",LM="U_Navigate(\\"A' + str(a_ctr) + '\\",\\"' + link_href + '\\");"$' |
|
html = html[:a_pos] + link_pre + doldoc_link + html[4+html.upper().find('</A>', a_pos):] |
|
a_ctr += 1 |
|
a_pos = html.upper().find('<A ') |
|
|
|
a_pos = html.upper().find('<CENTER>') |
|
while a_pos != -1: |
|
center_text = html[a_pos:].split('>')[1] |
|
center_text = center_text[:center_text.upper().find('</CENTER')] |
|
center_text = center_text.replace('"','\\"') |
|
if center_text.upper().find('[URIEL_IMG]') != -1: |
|
center_doctext = center_text |
|
else: |
|
center_doctext = '$TX+CX,"' + center_text + '"$' |
|
html = html[:a_pos] + center_doctext + html[9+html.upper().find('</CENTER>', a_pos):] |
|
a_pos = html.upper().find('<CENTER>') |
|
|
|
html = html.replace('</div>','\n') |
|
html = html.replace('</DIV>','\n') |
|
|
|
html = html.replace('</td>', ' ') |
|
html = html.replace('</TD>', ' ') |
|
|
|
html = html.replace('</tr>', '\n') |
|
html = html.replace('</TR>', '\n') |
|
|
|
a_pos = html.upper().find('<INPUT ') |
|
while a_pos != -1: |
|
input_text = html[a_pos:].split('>')[0] |
|
input_text = input_text.replace("'", '"') |
|
input_doctext = '[$UL,1$ $UL,0$]' |
|
|
|
t_text = '' |
|
if input_text.upper().find('VALUE='): |
|
t_t = input_text[input_text.upper().find('VALUE='):].split('"') |
|
if len(t_t) > 2: |
|
t_text = t_t[1] |
|
|
|
bt_text = t_text if t_text != '' else 'Button' |
|
st_text = t_text if t_text != '' else 'Submit' |
|
|
|
if input_text.find('button') != -1: |
|
input_doctext = '$BT,"' + bt_text + '"$' |
|
if input_text.find('checkbox') != -1: |
|
input_doctext = '$CB$' |
|
if input_text.find('hidden') != -1: |
|
input_doctext = '' |
|
if input_text.find('submit') != -1: |
|
input_doctext = '$BT,"' + st_text + '"$' |
|
|
|
html = html[:a_pos] + input_doctext + html[1+html.upper().find('>', a_pos):] |
|
a_pos = html.upper().find('<INPUT ') |
|
|
|
a_pos = html.upper().find('<') |
|
while a_pos != -1: |
|
html = html[:a_pos] + html[1+html.upper().find('>', a_pos):] |
|
a_pos = html.upper().find('<') |
|
|
|
html = html.replace('<','<') |
|
html = html.replace('>','>') |
|
html = html.replace('&','&') |
|
html = html.replace(''','\'') |
|
html = html.replace('"','"') |
|
|
|
img_a_ctr = 0 |
|
while html.find('[URIEL_IMG]') != -1: |
|
img_url = html[11+html.find('[URIEL_IMG]'):html.find('[/URIEL_IMG]')] |
|
img_ma = '$AN,"",A="IMG' + str(img_a_ctr) + '"$$MA+LIS,"[IMG]",LM="U_InsertThumb(\\"IMG' + str(img_a_ctr) + '\\",\\"$IMIS$\\",\\"$IMIE$\\",\\"' + img_url + '\\");"$$AN,"",A="IMIS' + str(img_a_ctr) + '"$' |
|
html = html[:html.find('[URIEL_IMG]')] + img_ma + html[12+html.find('[/URIEL_IMG]'):] |
|
img_a_ctr += 1 |
|
|
|
hb_header = '$WW,1$$BLACK$$MA+LIS,"[Close]",LM="U_CloseBrowser;"$ $MA+LIS,"[Back]",LM="U_HistNav(0);"$ $MA+LIS,"[Fwd]",LM="U_HistNav(1);"$ $MA+LIS,"[Go]",LM="U_Browser(GetStr(\\"URL> \\"));"$ ' + title_text + '\n\n' |
|
|
|
ind_id = '' |
|
o_html = '' |
|
o_lj_ct = 0 |
|
o_lj_indent = False |
|
|
|
for line in html.split('\n'): |
|
if not o_lj_indent: |
|
if line[0:13] == '$AN,"",A="IMG': |
|
# Left Justified image detected. |
|
ind_id = line.split('IMG')[1].split('"')[0] |
|
line = line.replace('$IMIS$','IMIS' + ind_id) |
|
line = line.replace('$IMIE$','IMIE' + ind_id) |
|
o_lj_indent = True |
|
|
|
if o_lj_indent: |
|
o_lj_ct += 1 |
|
if o_lj_ct > 11: |
|
line = '$AN,"",A="IMIE' + ind_id + '"$' + line |
|
ind_id = '' |
|
o_lj_ct = 0 |
|
o_lj_indent = False |
|
|
|
line = line.replace('$IMIS$','') |
|
line = line.replace('$IMIE$','') |
|
o_html += line + '\n' |
|
|
|
return hb_header + o_html
|
|
|