@ -1,12 +1,15 @@
import bs4
import re , sys
def preprocess ( htm1 ) :
def preprocess ( htm1 , l_url ) :
title_text = ' '
hb_header = ' $WW,1$$BLACK$$MA+LIS, " [Close] " ,LM= " CloseBrowser; " $ $MA+LIS, " [Back] " ,LM= " Browser( \\ " h:back \\ " ); " $ $MA+LIS, " [Fwd] " ,LM= " Browser( \\ " h:fwd \\ " ); " $ $MA+LIS, " [Go] " ,LM= " Browser(GetStr( \\ " \n URL> \\ " )); " $ ' + title_text + ' \n \n '
if htm1 . upper ( ) . find ( ' <HTML ' ) == - 1 :
return hb_header + htm1
dl_link = ' $AN, " " ,A= " BINARY " $Click $MA+LIS, " [Here] " ,LM= " Get( \\ " retry:send \\ " , \\ " ~/Downloads/ ' + l_url . split ( ' / ' ) [ len ( l_url . split ( ' / ' ) ) - 1 ] + ' \\ " ); " $ to download the file: \n \n ' + l_url
if l_url . split ( ' . ' ) [ len ( l_url . split ( ' . ' ) ) - 1 ] . upper ( ) == ' Z ' :
dl_link = dl_link . replace ( ' retry:send ' , ' retry:sendZ ' )
return hb_header + dl_link
htm1 = htm1 [ htm1 . upper ( ) . find ( ' <HTML ' ) : ]
htm1 = htm1 . replace ( ' $ ' , ' $$ ' )
@ -41,7 +44,9 @@ def preprocess(htm1):
for f in soup1 . findAll ( ' a ' ) :
for tag in f . findAll ( True ) :
tag . decompose ( )
if str ( tag ) . find ( ' <None> ' ) == - 1 :
if tag . name . upper ( ) != ' IMG ' :
tag . decompose ( )
remove_tags = [ ' svg ' , ' embed ' , ' head ' , ' noscript ' , ' object ' , ' param ' , ' script ' , ' option ' ]
for tag in remove_tags :
@ -59,10 +64,22 @@ def preprocess(htm1):
html = html . replace ( ' </u> ' , ' $UL,0$ ' )
html = html . replace ( ' </U> ' , ' $UL,0$ ' )
html = html . replace ( ' <b> ' , ' $IV,1$ ' )
html = html . replace ( ' <B> ' , ' $IV,1$ ' )
html = html . replace ( ' </b> ' , ' $IV,0$ ' )
html = html . replace ( ' </B> ' , ' $IV,0$ ' )
a_pos = html . upper ( ) . find ( ' <IMG ' )
while a_pos != - 1 :
html = html [ : a_pos ] + html [ 1 + html . upper ( ) . find ( ' > ' , a_pos ) : ]
a_pos = html . upper ( ) . find ( ' <IMG ' )
img_text = html [ a_pos : ] . split ( ' > ' ) [ 0 ]
img_text . replace ( " ' " , ' " ' )
img_src = ' '
img_pos = img_text . upper ( ) . find ( ' SRC ' )
if img_pos > 0 :
img_src = img_text [ img_text . upper ( ) . find ( ' SRC ' ) : ] . split ( ' " ' ) [ 1 ]
img_el = ' [URIEL_IMG] ' + img_src + ' [/URIEL_IMG] '
html = html [ : a_pos ] + img_el + html [ 1 + html . upper ( ) . find ( ' > ' , a_pos ) : ]
a_pos = html . upper ( ) . find ( ' <IMG ' )
a_pos = html . upper ( ) . find ( ' <BUTTON ' )
while a_pos != - 1 :
@ -73,17 +90,23 @@ def preprocess(htm1):
html = html [ : a_pos ] + button_doctext + html [ 9 + html . upper ( ) . find ( ' </BUTTON> ' , a_pos ) : ]
a_pos = html . upper ( ) . find ( ' <BUTTON ' )
a_ctr = 0
a_pos = html . upper ( ) . find ( ' <A ' )
while a_pos != - 1 :
link_pre = ' '
link_text = html [ a_pos : ] . split ( ' > ' ) [ 1 ]
link_text = link_text [ : link_text . upper ( ) . find ( ' </A ' ) ]
while link_text . find ( ' [URIEL_IMG] ' ) != - 1 :
link_pre + = link_text [ link_text . find ( ' [URIEL_IMG] ' ) : 12 + link_text . find ( ' [/URIEL_IMG] ' ) ] + ' '
link_text = link_text [ : link_text . find ( ' [URIEL_IMG] ' ) ] + link_text [ 12 + link_text . find ( ' [/URIEL_IMG] ' ) : ]
link_text = link_text . replace ( ' " ' , ' \\ " ' )
link_href = ' '
link_pos = html [ a_pos : html . upper ( ) . find ( ' </A> ' , a_pos ) ] . upper ( ) . find ( ' HREF ' )
if link_pos > 0 :
link_href = html [ a_pos : html . upper ( ) . find ( ' </A> ' , a_pos ) ] [ link_pos : ] . replace ( ' \' ' , ' " ' ) . split ( ' " ' ) [ 1 ]
doldoc_link = ' $MA+LIS, " ' + link_text + ' " ,LM= " Browser( \\ " ' + link_href + ' \\ " ); " $ '
html = html [ : a_pos ] + doldoc_link + html [ 4 + html . upper ( ) . find ( ' </A> ' , a_pos ) : ]
doldoc_link = ' $AN, " " ,A= " A ' + str ( a_ctr ) + ' " $$MA+LIS, " ' + link_text + ' " ,LM= " Navigate( \\ " A ' + str ( a_ctr ) + ' \\ " , \\ " ' + link_href + ' \\ " ); " $ '
html = html [ : a_pos ] + link_pre + doldoc_link + html [ 4 + html . upper ( ) . find ( ' </A> ' , a_pos ) : ]
a_ctr + = 1
a_pos = html . upper ( ) . find ( ' <A ' )
a_pos = html . upper ( ) . find ( ' <CENTER> ' )
@ -91,7 +114,10 @@ def preprocess(htm1):
center_text = html [ a_pos : ] . split ( ' > ' ) [ 1 ]
center_text = center_text [ : center_text . upper ( ) . find ( ' </CENTER ' ) ]
center_text = center_text . replace ( ' " ' , ' \\ " ' )
center_doctext = ' $TX+CX, " ' + center_text + ' " $ '
if center_text . upper ( ) . find ( ' [URIEL_IMG] ' ) != - 1 :
center_doctext = center_text
else :
center_doctext = ' $TX+CX, " ' + center_text + ' " $ '
html = html [ : a_pos ] + center_doctext + html [ 9 + html . upper ( ) . find ( ' </CENTER> ' , a_pos ) : ]
a_pos = html . upper ( ) . find ( ' <CENTER> ' )
@ -142,6 +168,39 @@ def preprocess(htm1):
html = html . replace ( ' ' ' , ' \' ' )
html = html . replace ( ' " ' , ' " ' )
img_a_ctr = 0
while html . find ( ' [URIEL_IMG] ' ) != - 1 :
img_url = html [ 11 + html . find ( ' [URIEL_IMG] ' ) : html . find ( ' [/URIEL_IMG] ' ) ]
img_ma = ' $AN, " " ,A= " IMG ' + str ( img_a_ctr ) + ' " $$MA+LIS, " [IMG] " ,LM= " InsertImg( \\ " IMG ' + str ( img_a_ctr ) + ' \\ " , \\ " $IMIS$ \\ " , \\ " $IMIE$ \\ " , \\ " ' + img_url + ' .uriel_img \\ " ); " $$AN, " " ,A= " IMIS ' + str ( img_a_ctr ) + ' " $ '
html = html [ : html . find ( ' [URIEL_IMG] ' ) ] + img_ma + html [ 12 + html . find ( ' [/URIEL_IMG] ' ) : ]
img_a_ctr + = 1
hb_header = ' $WW,1$$BLACK$$MA+LIS, " [Close] " ,LM= " CloseBrowser; " $ $MA+LIS, " [Back] " ,LM= " Browser( \\ " h:back \\ " ); " $ $MA+LIS, " [Fwd] " ,LM= " Browser( \\ " h:fwd \\ " ); " $ $MA+LIS, " [Go] " ,LM= " Browser(GetStr( \\ " URL> \\ " )); " $ ' + title_text + ' \n \n '
return hb_header + html
ind_id = ' '
o_html = ' '
o_lj_ct = 0
o_lj_indent = False
for line in html . split ( ' \n ' ) :
if not o_lj_indent :
if line [ 0 : 13 ] == ' $AN, " " ,A= " IMG ' :
# Left Justified image detected.
ind_id = line . split ( ' IMG ' ) [ 1 ] . split ( ' " ' ) [ 0 ]
line = line . replace ( ' $IMIS$ ' , ' IMIS ' + ind_id )
line = line . replace ( ' $IMIE$ ' , ' IMIE ' + ind_id )
o_lj_indent = True
if o_lj_indent :
o_lj_ct + = 1
if o_lj_ct > 11 :
line = ' $AN, " " ,A= " IMIE ' + ind_id + ' " $ ' + line
ind_id = ' '
o_lj_ct = 0
o_lj_indent = False
line = line . replace ( ' $IMIS$ ' , ' ' )
line = line . replace ( ' $IMIE$ ' , ' ' )
o_html + = line + ' \n '
return hb_header + o_html