SearchKeywordsRetrieveEtexts.py
--D. Thiebaut 16:02, 24 January 2010 (UTC)
<source lang="python">
#! /usr/bin/python
# D. Thiebaut
# searchKeywordsRetrieveEtext.py
#
# The user feeds a keyword; the program gets the list of matching documents,
# then downloads each one, processes it, and displays the passage of text
# containing the keyword.

import urllib, urllib2
import textwrap

MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"

VERSION = """Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)
Gecko/20071127 Firefox/2.0.0.11"""

HEADERS = {"User-Agent" : VERSION }

def getListOfDocs( keyword, debug=False ):
    """Posts the keyword to the swish-e search page and returns the raw response."""
    url  = MAINURL
    args = { "search" : keyword }
    if debug: print "args = ", str( args )
    req = urllib2.Request( url, urllib.urlencode( args ), HEADERS )
    f   = urllib2.urlopen( req )
    return f.read()

def parse( text, debug=False ):
    """Parses the search results into a list of dictionaries, one per document."""
    lines = text.split( "\n" )
    list  = []
    doc   = {}
    for line in lines:
        if line.find( "<br>url:" )!=-1:
            doc["url"] = line.split( "url:" )[1].strip()
        if line.find( "<br>score:" )!=-1:
            doc["score"] = int( line.split( "score:" )[1].strip() )
        if line.find( "<br>file:" )!=-1:
            doc["file"] = line.split( "file:" )[1].strip()
        if line.find( "<br>offset:" )!=-1:
            doc["offset"] = int( line.split( "offset:" )[1].strip() )
            list.append( doc )
            doc = {}
    return list
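
# For illustration only (the exact swish-e output format is an assumption based
# on the markers handled above): a response containing the four lines
#     <br>url: http://some.host/etext123.txt
#     <br>score: 1000
#     <br>file: etext123.txt
#     <br>offset: 0
# would be parsed into one dictionary,
#     { "url": "http://some.host/etext123.txt", "score": 1000,
#       "file": "etext123.txt", "offset": 0 }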

def getOneDoc( url, keywords, debug=False ):
    """Downloads one document and returns a short excerpt around the keywords."""
    args = { }
    req  = urllib2.Request( url, urllib.urlencode( args ), HEADERS )
    f    = urllib2.urlopen( req )
    text = f.read()
    offset = text.find( keywords )
    if offset != -1:
        excerpt = text[ max(offset-60, 0):min( offset+60, len(text) )]
    else:
        excerpt = text

    #--- remove duplicate white spaces, dropping partial words at both ends ---
    excerpt = ' '.join( excerpt.split()[1:-1] )

    #--- wrap text at 40 characters per line ---
    excerpt = '\n'.join( textwrap.wrap( excerpt, 40 ) )
    return excerpt
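
# Worked example (hypothetical numbers): if the keyword is found at offset 100 of
# a long document, the slice text[40:160] is kept; its leading and trailing
# partial words are dropped, runs of whitespace collapse to single spaces, and
# textwrap.wrap() re-folds the result into 40-character lines.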

def getAllDocs( listOfDocs, keywords, debug=False ):
    """Fetches every document in the list and prints the excerpt found in each."""
    for doc in listOfDocs:
        url      = doc[ "url" ]
        fileName = doc[ "file" ]
        if debug:
            print "--------------------------------------------"
            print "fetching url:", url
        text = getOneDoc( url, keywords, debug )
        if debug: print text

def main( debug=False ):
    """Searches for a hard-coded keyword and shows an excerpt of every matching e-text."""
    keywords = "love"
    text = getListOfDocs( keywords, debug )
    #if debug: print "text = ", text
    listOfDocs = parse( text, False )
    #if debug: print listOfDocs
    getAllDocs( listOfDocs, keywords, debug )

main( True )
</source>
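
For reference, the sketch below shows one way to exercise the functions individually, for example from an interactive Python 2 session into which the script above has been pasted. It assumes that the swish-e front end at MAINURL is reachable and that it reports each hit on lines tagged url:, score:, file: and offset:, which is what parse() looks for; the keyword "love" is just the example used in main().

<source lang="python">
# Exercise the pieces one at a time (assumes the functions above are already
# defined in the current session and the swish-e server at MAINURL is up).

rawResults = getListOfDocs( "love", debug=True )   # raw text returned by swish-e
hits       = parse( rawResults )                   # list of {url, score, file, offset} dicts

for hit in hits[:3]:                               # look at the first three hits only
    print hit["score"], hit["url"]
    print getOneDoc( hit["url"], "love", debug=False )
    print
</source>

Limiting the loop to the first few hits keeps the test quick, since getOneDoc() downloads each e-text in full before extracting the excerpt.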