Browse Source

Added Host proxy & DolDoc preprocessor

master
Alec Murphy 5 years ago
parent
commit
9b117eb033
  1. 147
      uriel_preprocessor.py
  2. 153
      uriel_proxy

147
uriel_preprocessor.py

@ -0,0 +1,147 @@
import bs4
import re, sys
def preprocess(htm1):
    """Convert a fetched HTML page into TempleOS DolDoc markup.

    Input without an ``<HTML`` tag is treated as plain text and returned
    with only the navigation header prepended and non-ASCII stripped.
    Real HTML is parsed with BeautifulSoup to drop scripts/SVG/etc., then
    a handful of elements (headings, underline, images, buttons, links,
    <center>, tables, <input>) are rewritten as DolDoc control codes.

    :param htm1: raw response body (HTML or plain text) as a string.
    :returns: ASCII-only DolDoc string: navigation header + converted body.
    """
    title_text = ''
    # Navigation header rendered at the top of every page.  '$' is the
    # DolDoc escape character; literal dollars are doubled elsewhere.
    hb_header = '$WW,1$$BLACK$$MA+LIS,"[Close]",LM="CloseBrowser;"$ $MA+LIS,"[Back]",LM="Browser(\\"h:back\\");"$ $MA+LIS,"[Fwd]",LM="Browser(\\"h:fwd\\");"$ $MA+LIS,"[Go]",LM="Browser(GetStr(\\"\nURL> \\"));"$ ' + title_text + '\n\n'
    if htm1.upper().find('<HTML') == -1:
        # Not HTML: pass through as plain text, stripping non-ASCII bytes.
        return re.sub(r'[^\x00-\x7F]','', hb_header) + re.sub(r'[^\x00-\x7F]','', htm1)
    # Drop anything preceding the <HTML tag (doctype, stray bytes).
    htm1 = htm1[htm1.upper().find('<HTML'):]
    # Escape DolDoc's control character before inserting our own codes.
    htm1 = htm1.replace('$', '$$')
    # Cheap textual rewrites done before parsing.
    htm1 = htm1.replace('<blockquote>',' ')
    htm1 = htm1.replace('<BLOCKQUOTE>',' ')
    htm1 = htm1.replace('<br>','\n')
    htm1 = htm1.replace('<br/>','\n')
    htm1 = htm1.replace('<br />','\n')
    htm1 = htm1.replace('<BR>','\n')
    htm1 = htm1.replace('<BR/>','\n')
    htm1 = htm1.replace('<BR />','\n')
    htm1 = htm1.replace('<li>',' * ')
    htm1 = htm1.replace('<LI>',' * ')
    htm1 = htm1.replace('</img>','')
    htm1 = htm1.replace('</IMG>','')
    # Grab the page title so it can be shown in the final header.
    title_text = ''
    a_pos = htm1.upper().find('<TITLE>')
    if a_pos != -1:
        title_text = htm1[a_pos:htm1.find('</', a_pos)].split('>')[1]
    soup1 = bs4.BeautifulSoup(htm1, 'lxml')
    # Keep the contents of structural tags but drop the tags themselves.
    unwrap_tags = [ 'html', 'body', 'p', 'b', 'pre', 'span', 'table', 'header' ]
    for tag in unwrap_tags:
        for match in soup1.findAll(tag):
            match.unwrap()
    # Anchors must contain plain text only; remove any nested markup.
    for f in soup1.findAll('a'):
        for tag in f.findAll(True):
            tag.decompose()
    # Drop these elements together with their contents.
    remove_tags = [ 'svg', 'embed', 'head', 'noscript', 'object', 'param', 'script', 'option' ]
    for tag in remove_tags:
        [s.extract() for s in soup1(tag)]
    html = str(soup1)
    # <h1> -> purple text; <u> -> underline on/off.
    html = html.replace('<h1>','$PURPLE$')
    html = html.replace('<H1>','$PURPLE$')
    html = html.replace('</h1>','$BLACK$')
    html = html.replace('</H1>','$BLACK$')
    html = html.replace('<u>','$UL,1$')
    html = html.replace('<U>','$UL,1$')
    html = html.replace('</u>','$UL,0$')
    html = html.replace('</U>','$UL,0$')
    # Remove <img ...> tags entirely (closing tags were stripped above).
    a_pos = html.upper().find('<IMG ')
    while a_pos != -1:
        html = html[:a_pos] + html[1+html.upper().find('>', a_pos):]
        a_pos = html.upper().find('<IMG ')
    # <button>text</button> -> DolDoc button widget.
    a_pos = html.upper().find('<BUTTON ')
    while a_pos != -1:
        button_text = html[a_pos:].split('>')[1]
        button_text = button_text[:button_text.upper().find('</BUTTON')]
        button_text = button_text.replace('"','\\"')
        button_doctext = '$BT,"' + button_text + '"$'
        # 9 == len('</BUTTON>')
        html = html[:a_pos] + button_doctext + html[9+html.upper().find('</BUTTON>', a_pos):]
        a_pos = html.upper().find('<BUTTON ')
    # <a href=...>text</a> -> DolDoc menu link that re-invokes the browser.
    a_pos = html.upper().find('<A ')
    while a_pos != -1:
        link_text = html[a_pos:].split('>')[1]
        link_text = link_text[:link_text.upper().find('</A')]
        link_text = link_text.replace('"','\\"')
        link_href = ''
        link_pos = html[a_pos:html.upper().find('</A>', a_pos)].upper().find('HREF')
        if link_pos > 0:
            link_href = html[a_pos:html.upper().find('</A>', a_pos)][link_pos:].replace('\'','"').split('"')[1]
        doldoc_link = '$MA+LIS,"' + link_text + '",LM="Browser(\\"' + link_href + '\\");"$'
        # 4 == len('</A>')
        html = html[:a_pos] + doldoc_link + html[4+html.upper().find('</A>', a_pos):]
        a_pos = html.upper().find('<A ')
    # <center>text</center> -> DolDoc centered text.
    a_pos = html.upper().find('<CENTER>')
    while a_pos != -1:
        center_text = html[a_pos:].split('>')[1]
        center_text = center_text[:center_text.upper().find('</CENTER')]
        center_text = center_text.replace('"','\\"')
        center_doctext = '$TX+CX,"' + center_text + '"$'
        # 9 == len('</CENTER>')
        html = html[:a_pos] + center_doctext + html[9+html.upper().find('</CENTER>', a_pos):]
        a_pos = html.upper().find('<CENTER>')
    # Table/row/div terminators become whitespace.
    html = html.replace('</div>','\n')
    html = html.replace('</DIV>','\n')
    html = html.replace('</td>', ' ')
    html = html.replace('</TD>', ' ')
    html = html.replace('</tr>', '\n')
    html = html.replace('</TR>', '\n')
    # <input ...> -> checkbox / button / blank, depending on its type.
    a_pos = html.upper().find('<INPUT ')
    while a_pos != -1:
        input_text = html[a_pos:].split('>')[0]
        input_text = input_text.replace("'", '"')
        input_doctext = '[$UL,1$ $UL,0$]'  # default: text-entry placeholder
        t_text = ''
        # BUGFIX: str.find() returns -1 (truthy!) when the substring is
        # absent, so the original bare-truthiness test ran this branch
        # precisely when VALUE= was missing.  Compare against -1 explicitly.
        if input_text.upper().find('VALUE=') != -1:
            t_t = input_text[input_text.upper().find('VALUE='):].split('"')
            if len(t_t) > 2:
                t_text = t_t[1]
        bt_text = t_text if t_text != '' else 'Button'
        st_text = t_text if t_text != '' else 'Submit'
        if input_text.find('button') != -1:
            input_doctext = '$BT,"' + bt_text + '"$'
        if input_text.find('checkbox') != -1:
            input_doctext = '$CB$'
        if input_text.find('hidden') != -1:
            input_doctext = ''
        if input_text.find('submit') != -1:
            input_doctext = '$BT,"' + st_text + '"$'
        html = html[:a_pos] + input_doctext + html[1+html.upper().find('>', a_pos):]
        a_pos = html.upper().find('<INPUT ')
    # Strip any remaining tags wholesale.
    a_pos = html.upper().find('<')
    while a_pos != -1:
        html = html[:a_pos] + html[1+html.upper().find('>', a_pos):]
        a_pos = html.upper().find('<')
    # Decode the common HTML entities.
    html = html.replace('&lt;','<')
    html = html.replace('&gt;','>')
    html = html.replace('&amp;','&')
    html = html.replace('&apos;','\'')
    html = html.replace('&quot;','"')
    # Rebuild the header now that title_text is known.
    hb_header = '$WW,1$$BLACK$$MA+LIS,"[Close]",LM="CloseBrowser;"$ $MA+LIS,"[Back]",LM="Browser(\\"h:back\\");"$ $MA+LIS,"[Fwd]",LM="Browser(\\"h:fwd\\");"$ $MA+LIS,"[Go]",LM="Browser(GetStr(\\"URL> \\"));"$ ' + title_text + '\n\n'
    return re.sub(r'[^\x00-\x7F]','', hb_header) + re.sub(r'[^\x00-\x7F]','', html)

153
uriel_proxy

@ -0,0 +1,153 @@
#!/usr/bin/python
from uriel_preprocessor import preprocess
import os, socket, sys, time, urlparse
import requests
# Address/port the proxy listens on for the TempleOS guest connection.
HOST = '127.0.0.1'
PORT = 8000
# 8-byte wire-protocol delimiters shared with the Uriel browser on the
# guest side (parsers below skip them with a hard-coded +8 offset).
DELIM_BIN_EOF = '*[U_EOF]'  # terminates a binary payload
DELIM_BIN_GET = '*[U_GET]'  # guest requests a file/URL (raw bytes back)
DELIM_BIN_PUT = '*[U_PUT]'  # guest uploads a file to the host
DELIM_BIN_SOF = '*[U_SOF]'  # marks start of an uploaded payload
# States of the per-connection command state machine.
MODE_LISTEN = 0
MODE_PUT_START = 1
MODE_GET_START = 2
URIEL_VER_STR = 'Uriel/0.1'  # User-Agent prefix for outgoing HTTP requests
# Throttle: send blk_size bytes, then sleep delay_ms seconds, so the
# guest's receive loop can keep up.
blk_size = 8
delay_ms = .001
# --- socket setup -----------------------------------------------------------
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    s.bind((HOST, PORT))
except socket.error as msg:
    sys.stdout.write('Error: failed to open socket\n')
    sys.stdout.flush()
    sys.exit()
s.listen(0)
sys.stdout.write('[uriel_proxy started]\n')
sys.stdout.flush()
# --- main accept loop --------------------------------------------------------
# Serves one guest connection at a time; browsing state (history, base URL)
# is reset per connection.
while 1:
    conn, addr = s.accept()
    cmd_in = ''      # accumulated bytes of the command being received
    rel_url = ''     # base URL used to resolve relative links
    history = []     # list of {'url':..., 'page':...} for Back/Fwd
    hst_index = -1   # index into history of the page currently shown
    state = MODE_LISTEN
    while 1:
        if 1==1:  # no-op guard; presumably a leftover try/except placeholder
            data = conn.recv(1024)
            # NOTE(review): recv() returning '' (peer closed) is not handled,
            # so a disconnect leaves this loop spinning and appending nothing.
            cmd_in += data
            # --- U_GET: guest asks for a local file or remote URL ----------
            if state == MODE_LISTEN:
                if cmd_in.find(DELIM_BIN_GET) != -1:
                    state = MODE_GET_START
            if state == MODE_GET_START:
                if cmd_in.find('^') != -1:  # '^' terminates the command
                    # +8 skips the 8-byte '*[U_GET]' delimiter; the target
                    # runs up to the '|' field separator.
                    get_file = cmd_in[cmd_in.find(DELIM_BIN_GET)+8:cmd_in.find('|')]
                    blk_ctr = 0
                    if get_file.find('://') != -1:
                        # Remote URL: the field after '|' carries guest info
                        # appended to the User-Agent string.
                        headers = { 'User-Agent': URIEL_VER_STR + ' (' + cmd_in.split('^')[0].rsplit('|')[1] + ')' }
                        r = requests.get(get_file, headers=headers)
                        file = r.text
                    else:
                        # Local path on the host filesystem.
                        file = open(get_file, "rb").read()
                    # Throttled transmit (blk_size bytes + delay_ms sleep).
                    while blk_ctr < len(file):
                        conn.sendall(file[blk_ctr:blk_ctr+blk_size])
                        blk_ctr += blk_size
                        time.sleep(delay_ms)
                    conn.sendall(DELIM_BIN_EOF+'\x00')
                    cmd_in = ""
                    state = MODE_LISTEN
            # --- U_PUT: guest uploads a file; payload sits U_SOF..U_EOF ----
            if state == MODE_LISTEN:
                if cmd_in.find(DELIM_BIN_PUT) != -1:
                    state = MODE_PUT_START
            if state == MODE_PUT_START:
                if cmd_in.find(DELIM_BIN_EOF) != -1:
                    put_files = cmd_in[cmd_in.find(DELIM_BIN_PUT)+8:cmd_in.find(DELIM_BIN_SOF)].split('|')
                    s_filename = put_files[0]  # source name on the guest
                    r_filename = put_files[1]  # requested host-side name
                    if r_filename == "":
                        # Default destination: Xfer/<basename of guest path>.
                        r_filename = 'Xfer/' + s_filename.split('/')[len(s_filename.split('/'))-1]
                    open(r_filename,"wb").write(cmd_in[cmd_in.find(DELIM_BIN_SOF)+8:cmd_in.find(DELIM_BIN_EOF)])
                    cmd_in = ""
                    state = MODE_LISTEN
            # --- plain browse request: 'url|guest-info^' -------------------
            if cmd_in.find('^') != -1 and state == MODE_LISTEN:
                user_agent = URIEL_VER_STR + ' (' + cmd_in.split('^')[0].rsplit('|')[1] + ')'
                url = cmd_in.split('^')[0].rsplit('|')[0]
                url = url.split('#')[0]  # drop any #fragment
                prot_ag_url = False
                # Normalize scheme-relative (//host/...) and bare-host URLs.
                if url.lower()[0:4] != 'http':
                    if url[0:2] == '//':
                        url = 'http:' + url
                        prot_ag_url = True
                if url.find('/') != -1 and not prot_ag_url:
                    if url.split('/')[0].find('.') != -1:
                        url = 'http://' + url
                page_int = 0  # 1 -> page served from history, no HTTP fetch
                # h:back / h:fwd are synthetic URLs emitted by the header's
                # [Back]/[Fwd] links; they replay cached pages.
                if url == 'h:back':
                    if hst_index > 0:
                        hst_index -= 1
                        url = history[hst_index]['url']
                        page = history[hst_index]['page']
                    page_int = 1
                if url == 'h:fwd':
                    if hst_index < len(history)-1:
                        hst_index += 1
                        url = history[hst_index]['url']
                        page = history[hst_index]['page']
                    page_int = 1
                if page_int == 0:
                    if url.find('://') == -1:
                        # Relative link: join with the last page's base URL and
                        # collapse doubled slashes after the scheme.
                        url = rel_url + url
                        url = url[:url.find('//')] + '//' + url[url.find('//')+2:].replace('//','/')
                    headers = { 'User-Agent': user_agent }
                    r = requests.get(url, headers=headers)
                    data = r.text
                    page = preprocess(data)
                    # A fresh navigation truncates any forward history.
                    hst_index += 1
                    history = history[0:hst_index]
                    history.append({'url':url, 'page':page})
                # Strip every non-ASCII character before sending to the guest.
                u_page = page
                page = ''
                u_idx = 0
                while u_idx < len(u_page):
                    if ord(u_page[u_idx:u_idx+1]) < 127:
                        page += u_page[u_idx:u_idx+1]
                    u_idx += 1
                # Throttled page transmit, terminated by 32 0xFF bytes.
                blk_ctr = 0
                while blk_ctr < len(page):
                    conn.sendall(page[blk_ctr:blk_ctr+blk_size])
                    blk_ctr += blk_size
                    time.sleep(delay_ms)
                conn.sendall('\xFF' * 32)
                # Remember scheme://host/dir/ as the base for relative links.
                if url.find('://') != -1:
                    r_url = urlparse.urlparse(url)
                    rel_url = r_url[0] + '://' + r_url[1]  # scheme://netloc
                    r_path = '/'
                    if r_url[2] != '':
                        if r_url[2][r_url[2].rfind('/'):].find('.') != -1:
                            # Path ends in a file name: keep only its directory.
                            r_path = r_url[2][:r_url[2].rfind('/')] + r_path
                        else:
                            r_path = r_url[2] + r_path
                    rel_url = rel_url + r_path
                i_page = ''  # NOTE(review): never read again; leftover variable
                page = ''
                cmd_in = ''
                url = ''
    # NOTE(review): unreachable -- the inner loop above never breaks.
    conn.close()
s.close()
Loading…
Cancel
Save