import bs4 import re URIEL_GETPAGE = 0x10 URIEL_NAVBACK = 0x11 URIEL_NAVFWD = 0x12 URIEL_STR_SIZE = 144 URIEL_THUMB = 0x13 URIEL_DOWNLOAD = 0x14 class Uriel: download_buffer = '' user_agent = '' history = [] nav_index = -1 class rel: scheme = '' netloc = '' path = '' def uriel(data): if data == URIEL_GETPAGE: UrielGetPage() if data == URIEL_NAVBACK: UrielNavBack() if data == URIEL_NAVFWD: UrielNavFwd() if data == URIEL_THUMB: UrielThumb() if data == URIEL_DOWNLOAD: UrielDownload() def UrielGetPage(): global Uriel os.lseek(HGBD,0,os.SEEK_SET) HGBD_PARAM_BUF = os.read(HGBD,BLK_SIZE) os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) HGBD_URL_BUF = os.read(HGBD,BLK_SIZE*4) if Uriel.user_agent == '': Uriel.user_agent = HGBD_PARAM_BUF[:HGBD_PARAM_BUF.find('\x00')] url_comp = urlparse.urlparse(HGBD_URL_BUF[:HGBD_URL_BUF.find('\x00')]) scheme = '' netloc = '' path = '' if url_comp.scheme == '': scheme = Uriel.rel.scheme else: scheme = url_comp.scheme Uriel.rel.scheme = url_comp.scheme if url_comp.netloc == '': netloc = Uriel.rel.netloc else: netloc = url_comp.netloc Uriel.rel.netloc = url_comp.netloc if url_comp.path != '': if url_comp.path.find('/') != -1: if url_comp.scheme == '' or url_comp.netloc == '': if url_comp.path[:1] != '/': path = Uriel.rel.path + url_comp.path Uriel.rel.path += url_comp.path[:url_comp.path.rfind('/')+1] else: path = url_comp.path Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] else: path = url_comp.path Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] else: path = Uriel.rel.path + url_comp.path post_scheme = netloc + "/" + urllib.quote(path) post_scheme = post_scheme.replace('//','/') url = scheme + "://" + post_scheme pagedata = subprocess.Popen('wget -q -O - -U "' + Uriel.user_agent + '" "' + url + '" 2>/dev/null', shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate()[0] filedata = UrielPreProcess(pagedata, url) filesize = len(filedata) if filesize>0: if filedata.find('$AN,"",A="BINARY"$') != -1: Uriel.download_buffer = pagedata if url[-2:].upper()==".Z": tmp_z_file = "/tmp/" + str(uuid.uuid4()).split('-')[0].upper() + ".Z" while os.path.exists(tmp_z_file): tmp_z_file = "/tmp/" + str(uuid.uuid4()).split('-')[0].upper() + ".Z" open(tmp_z_file,"wb").write(pagedata) try: os.system('tosz "' + tmp_z_file + '"') pagedata = open(tmp_z_file.split('.Z')[0],"rb").read() os.remove(tmp_z_file.split('.Z')[0]) Uriel.download_buffer = pagedata except: Uriel.download_buffer = '' Uriel.nav_index += 1 Uriel.history = Uriel.history[0:Uriel.nav_index] Uriel.history.append({'url':url, 'filedata':filedata}) ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) os.lseek(HGBD,128,os.SEEK_SET) os.write(HGBD,str(url)[:URIEL_STR_SIZE]) os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) os.write(HGBD,filedata) logger.info("[Uriel] navigate to " + url) else: filesize = -1 ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) logger.error("[Uriel] error reading url " + url) conn.send(chr(URIEL_GETPAGE)) def UrielNavBack(): global Uriel if Uriel.nav_index > 0: Uriel.nav_index -= 1 url_comp = urlparse.urlparse(Uriel.history[Uriel.nav_index]['url']) filedata = Uriel.history[Uriel.nav_index]['filedata'] scheme = '' netloc = '' path = '' if url_comp.scheme == '': scheme = Uriel.rel.scheme else: scheme = url_comp.scheme if url_comp.netloc == '': netloc = Uriel.rel.netloc else: netloc = url_comp.netloc if url_comp.path != '': if url_comp.path.find('/') != -1: if url_comp.scheme == '' or url_comp.netloc == '': if url_comp.path[:1] != '/': path = Uriel.rel.path + url_comp.path Uriel.rel.path += url_comp.path[:url_comp.path.rfind('/')+1] else: path = url_comp.path Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] else: path = url_comp.path Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] else: path = Uriel.rel.path + url_comp.path post_scheme = netloc + "/" + urllib.quote(path) post_scheme = post_scheme.replace('//','/') url = scheme + "://" + post_scheme filesize = len(filedata) if filesize>0: ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) os.lseek(HGBD,128,os.SEEK_SET) os.write(HGBD,str(url)[:URIEL_STR_SIZE]) os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) os.write(HGBD,filedata) logger.info("[Uriel] history navigate back to " + url) else: filesize = -1 ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) logger.error("[Uriel] error reading history for url " + url) conn.send(chr(URIEL_NAVBACK)) def UrielNavFwd(): global Uriel if Uriel.nav_index < len(Uriel.history)-1: Uriel.nav_index += 1 url_comp = urlparse.urlparse(Uriel.history[Uriel.nav_index]['url']) filedata = Uriel.history[Uriel.nav_index]['filedata'] scheme = '' netloc = '' path = '' if url_comp.scheme == '': scheme = Uriel.rel.scheme else: scheme = url_comp.scheme if url_comp.netloc == '': netloc = Uriel.rel.netloc else: netloc = url_comp.netloc if url_comp.path != '': if url_comp.path.find('/') != -1: if url_comp.scheme == '' or url_comp.netloc == '': if url_comp.path[:1] != '/': path = Uriel.rel.path + url_comp.path Uriel.rel.path += url_comp.path[:url_comp.path.rfind('/')+1] else: path = url_comp.path Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] else: path = url_comp.path Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] else: path = Uriel.rel.path + url_comp.path post_scheme = netloc + "/" + urllib.quote(path) post_scheme = post_scheme.replace('//','/') url = scheme + "://" + post_scheme filesize = len(filedata) if filesize>0: ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) os.lseek(HGBD,128,os.SEEK_SET) os.write(HGBD,str(url)[:URIEL_STR_SIZE]) os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) os.write(HGBD,filedata) logger.info("[Uriel] history navigate fwd to " + url) else: filesize = -1 ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) logger.error("[Uriel] error reading history for url " + url) conn.send(chr(URIEL_NAVFWD)) def UrielThumb(): global Uriel os.lseek(HGBD,0,os.SEEK_SET) HGBD_PARAM_BUF = os.read(HGBD,BLK_SIZE) os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) HGBD_URL_BUF = os.read(HGBD,BLK_SIZE*4) if Uriel.user_agent == '': Uriel.user_agent = HGBD_PARAM_BUF[:HGBD_PARAM_BUF.find('\x00')] url_comp = urlparse.urlparse(HGBD_URL_BUF[:HGBD_URL_BUF.find('\x00')]) scheme = '' netloc = '' path = '' if url_comp.scheme == '': scheme = Uriel.rel.scheme else: scheme = url_comp.scheme if url_comp.netloc == '': netloc = Uriel.rel.netloc else: netloc = url_comp.netloc if url_comp.path != '': if url_comp.path.find('/') != -1: if url_comp.scheme == '' or url_comp.netloc == '': if url_comp.path[:1] != '/': path = Uriel.rel.path + url_comp.path Uriel.rel.path += url_comp.path[:url_comp.path.rfind('/')+1] else: path = url_comp.path Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] else: path = url_comp.path Uriel.rel.path = url_comp.path[:url_comp.path.rfind('/')+1] else: path = Uriel.rel.path + url_comp.path post_scheme = netloc + "/" + urllib.quote(path) post_scheme = post_scheme.replace('//','/') url = scheme + "://" + post_scheme tmp_thumb = '/tmp/' + str(uuid.uuid4()) + '.bmp' while os.path.exists(tmp_thumb): tmp_thumb = '/tmp/' + str(uuid.uuid4()) + '.bmp' pagedata = subprocess.Popen('wget -q -O - -U "' + Uriel.user_agent + '" "' + url + '" 2>/dev/null | gm convert -resize 100x100 - -colors 16 "' + tmp_thumb + '"', shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE).communicate()[0] filedata = open(tmp_thumb,"rb").read() try: os.remove(tmp_thumb) except: pass filesize = len(filedata) if filesize>0: ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) os.write(HGBD,filedata) logger.info("[Uriel] get image thumbnail " + url) else: filesize = -1 ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) logger.error("[Uriel] error reading url " + url) conn.send(chr(URIEL_THUMB)) def UrielDownload(): global Uriel filedata = Uriel.download_buffer filesize = len(filedata) if filesize>0: ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) os.lseek(HGBD,BLK_SIZE,os.SEEK_SET) os.write(HGBD,filedata) logger.info("[Uriel] binary, download file") else: filesize = -1 ZeroParamBuf() os.lseek(HGBD,0,os.SEEK_SET) os.write(HGBD,str(filesize)) logger.error("[Uriel] error downloading file") conn.send(chr(URIEL_DOWNLOAD)) def UrielPreProcess(htm1, l_url): title_text = '' hb_header = '$WW,1$$BLACK$$MA+LIS,"[Close]",LM="U_CloseBrowser;"$ $MA+LIS,"[Back]",LM="U_HistNav(0);"$ $MA+LIS,"[Fwd]",LM="U_HistNav(1);"$ $MA+LIS,"[Go]",LM="U_Browser(GetStr(\\"\nURL> \\"));"$ ' + title_text + '\n\n' if htm1.upper().find('',' ') htm1 = htm1.replace('
',' ') htm1 = htm1.replace('
','\n') htm1 = htm1.replace('
','\n') htm1 = htm1.replace('
','\n') htm1 = htm1.replace('
','\n') htm1 = htm1.replace('
','\n') htm1 = htm1.replace('
','\n') htm1 = htm1.replace('
  • ',' * ') htm1 = htm1.replace('
  • ',' * ') htm1 = htm1.replace('','') htm1 = htm1.replace('','') title_text = '' a_pos = htm1.upper().find('') if a_pos != -1: title_text = htm1[a_pos:htm1.find('</', a_pos)].split('>')[1] soup1 = bs4.BeautifulSoup(htm1, 'lxml') unwrap_tags = [ 'html', 'body', 'p', 'b', 'pre', 'span', 'table', 'header' ] for tag in unwrap_tags: for match in soup1.findAll(tag): match.unwrap() for f in soup1.findAll('a'): for tag in f.findAll(True): if str(tag).find('<None>') == -1: if tag.name.upper() != 'IMG': tag.decompose() remove_tags = [ 'svg', 'embed', 'head', 'noscript', 'object', 'param', 'script', 'option' ] for tag in remove_tags: [s.extract() for s in soup1(tag)] html = str(soup1) html = html.replace('<h1>','$PURPLE$') html = html.replace('<H1>','$PURPLE$') html = html.replace('</h1>','$BLACK$') html = html.replace('</H1>','$BLACK$') html = html.replace('<u>','$UL,1$') html = html.replace('<U>','$UL,1$') html = html.replace('</u>','$UL,0$') html = html.replace('</U>','$UL,0$') html = html.replace('<b>','$IV,1$') html = html.replace('<B>','$IV,1$') html = html.replace('</b>','$IV,0$') html = html.replace('</B>','$IV,0$') a_pos = html.upper().find('<IMG ') while a_pos != -1: img_text = html[a_pos:].split('>')[0] img_text.replace("'",'"') img_src = '' img_pos = img_text.upper().find('SRC') if img_pos > 0: img_src = img_text[img_text.upper().find('SRC'):].split('"')[1] img_el = '[URIEL_IMG]' + img_src + '[/URIEL_IMG]' html = html[:a_pos] + img_el + html[1+html.upper().find('>', a_pos):] a_pos = html.upper().find('<IMG ') a_pos = html.upper().find('<BUTTON ') while a_pos != -1: button_text = html[a_pos:].split('>')[1] button_text = button_text[:button_text.upper().find('</BUTTON')] button_text = button_text.replace('"','\\"') button_doctext = '$BT,"' + button_text + '"$' html = html[:a_pos] + button_doctext + html[9+html.upper().find('</BUTTON>', a_pos):] a_pos = html.upper().find('<BUTTON ') a_ctr = 0 a_pos = html.upper().find('<A ') while a_pos != -1: link_pre = '' link_text = html[a_pos:].split('>')[1] link_text = link_text[:link_text.upper().find('</A')] while link_text.find('[URIEL_IMG]') != -1: link_pre += link_text[link_text.find('[URIEL_IMG]'):12+link_text.find('[/URIEL_IMG]')] + ' ' link_text = link_text[:link_text.find('[URIEL_IMG]')] + link_text[12+link_text.find('[/URIEL_IMG]'):] link_text = link_text.replace('"','\\"') link_href = '' link_pos = html[a_pos:html.upper().find('</A>', a_pos)].upper().find('HREF') if link_pos > 0: link_href = html[a_pos:html.upper().find('</A>', a_pos)][link_pos:].replace('\'','"').split('"')[1] doldoc_link = '$AN,"",A="A' + str(a_ctr) + '"$$MA+LIS,"' + link_text + '",LM="U_Navigate(\\"A' + str(a_ctr) + '\\",\\"' + link_href + '\\");"$' html = html[:a_pos] + link_pre + doldoc_link + html[4+html.upper().find('</A>', a_pos):] a_ctr += 1 a_pos = html.upper().find('<A ') a_pos = html.upper().find('<CENTER>') while a_pos != -1: center_text = html[a_pos:].split('>')[1] center_text = center_text[:center_text.upper().find('</CENTER')] center_text = center_text.replace('"','\\"') if center_text.upper().find('[URIEL_IMG]') != -1: center_doctext = center_text else: center_doctext = '$TX+CX,"' + center_text + '"$' html = html[:a_pos] + center_doctext + html[9+html.upper().find('</CENTER>', a_pos):] a_pos = html.upper().find('<CENTER>') html = html.replace('</div>','\n') html = html.replace('</DIV>','\n') html = html.replace('</td>', ' ') html = html.replace('</TD>', ' ') html = html.replace('</tr>', '\n') html = html.replace('</TR>', '\n') a_pos = html.upper().find('<INPUT ') while a_pos != -1: input_text = html[a_pos:].split('>')[0] input_text = input_text.replace("'", '"') input_doctext = '[$UL,1$ $UL,0$]' t_text = '' if input_text.upper().find('VALUE='): t_t = input_text[input_text.upper().find('VALUE='):].split('"') if len(t_t) > 2: t_text = t_t[1] bt_text = t_text if t_text != '' else 'Button' st_text = t_text if t_text != '' else 'Submit' if input_text.find('button') != -1: input_doctext = '$BT,"' + bt_text + '"$' if input_text.find('checkbox') != -1: input_doctext = '$CB$' if input_text.find('hidden') != -1: input_doctext = '' if input_text.find('submit') != -1: input_doctext = '$BT,"' + st_text + '"$' html = html[:a_pos] + input_doctext + html[1+html.upper().find('>', a_pos):] a_pos = html.upper().find('<INPUT ') a_pos = html.upper().find('<') while a_pos != -1: html = html[:a_pos] + html[1+html.upper().find('>', a_pos):] a_pos = html.upper().find('<') html = html.replace('<','<') html = html.replace('>','>') html = html.replace('&','&') html = html.replace(''','\'') html = html.replace('"','"') img_a_ctr = 0 while html.find('[URIEL_IMG]') != -1: img_url = html[11+html.find('[URIEL_IMG]'):html.find('[/URIEL_IMG]')] img_ma = '$AN,"",A="IMG' + str(img_a_ctr) + '"$$MA+LIS,"[IMG]",LM="U_InsertThumb(\\"IMG' + str(img_a_ctr) + '\\",\\"$IMIS$\\",\\"$IMIE$\\",\\"' + img_url + '\\");"$$AN,"",A="IMIS' + str(img_a_ctr) + '"$' html = html[:html.find('[URIEL_IMG]')] + img_ma + html[12+html.find('[/URIEL_IMG]'):] img_a_ctr += 1 hb_header = '$WW,1$$BLACK$$MA+LIS,"[Close]",LM="U_CloseBrowser;"$ $MA+LIS,"[Back]",LM="U_HistNav(0);"$ $MA+LIS,"[Fwd]",LM="U_HistNav(1);"$ $MA+LIS,"[Go]",LM="U_Browser(GetStr(\\"URL> \\"));"$ ' + title_text + '\n\n' ind_id = '' o_html = '' o_lj_ct = 0 o_lj_indent = False for line in html.split('\n'): if not o_lj_indent: if line[0:13] == '$AN,"",A="IMG': # Left Justified image detected. ind_id = line.split('IMG')[1].split('"')[0] line = line.replace('$IMIS$','IMIS' + ind_id) line = line.replace('$IMIE$','IMIE' + ind_id) o_lj_indent = True if o_lj_indent: o_lj_ct += 1 if o_lj_ct > 11: line = '$AN,"",A="IMIE' + ind_id + '"$' + line ind_id = '' o_lj_ct = 0 o_lj_indent = False line = line.replace('$IMIS$','') line = line.replace('$IMIE$','') o_html += line + '\n' return hb_header + o_html