SearchKeywordsRetrieveEtexts.py

#! /usr/bin/python
# D. Thiebaut
# searchKeywordsRetrieveEtext.py
#
# The user supplies a search keyword; the program gets the list of matching
# documents, then downloads each document, processes it, and displays an
# excerpt containing the keyword.

import urllib, urllib2
import textwrap

MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"
VERSION = """Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)
 Gecko/20071127 Firefox/2.0.0.11"""
HEADERS       = {"User-Agent"  : VERSION }


def getListOfDocs( keyword, debug=False ):
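    """Posts the search keyword to the swish-e search page (MAINURL)
    and returns the raw HTML of the results page."""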
    url = MAINURL
    args = { "search" : keyword }
    if debug: print "args = ", str( args )
    req = urllib2.Request( url,  urllib.urlencode( args ), HEADERS )
    f = urllib2.urlopen( req )
    return f.read()

def parse( text, debug=False ):
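    """Parses the results page returned by getListOfDocs() and returns a
    list of dictionaries, one per document, with the keys "url", "score",
    "file", and "offset"."""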
    lines = text.split( "\n" )
    docs = []
    doc  = {}
    for line in lines:
        if line.find( "<br>url:" )!=-1: 
            doc["url"] = line.split( "url:" )[1].strip() 
        if line.find( "<br>score:" )!=-1: 
            doc["score"] = int( line.split( "score:" )[1].strip() ) 
        if line.find( "<br>file:" )!=-1: 
            doc["file"] = line.split( "file:" )[1].strip() 
        if line.find( "<br>offset:" )!=-1: 
            doc["offset"] = int( line.split( "offset:" )[1].strip() ) 
            docs.append( doc )
            doc = {}
        
    return docs

def getOneDoc( url, keywords, debug=False ):
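    """Downloads the e-text at the given url and returns a short excerpt
    (roughly 120 characters, wrapped at 40 columns) centered on the first
    occurrence of the keywords; if the keywords are not found, the whole
    text is cleaned up and returned instead."""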
    args = { }
    req = urllib2.Request( url,  urllib.urlencode( args ), HEADERS )
    f = urllib2.urlopen( req )
    text = f.read()
    #--- keep roughly 60 characters on either side of the keywords ---
    offset = text.find( keywords )
    if offset != -1:
        excerpt = text[ max(offset-60, 0) : min(offset+60, len(text)) ]
    else:
        excerpt = text

    #--- collapse white space and drop the first and last (possibly partial) words ---
    excerpt = ' '.join( excerpt.split()[1:-1] )

    #--- wrap the excerpt at 40 characters per line ---
    excerpt = '\n'.join( textwrap.wrap( excerpt, 40 ) )
    return excerpt

def getAllDocs( listOfDocs, keywords, debug=False ):
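    """Fetches every document in listOfDocs and, when debug is True,
    prints its url followed by the excerpt containing the keywords."""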
    for doc in listOfDocs:
        url = doc[ "url" ]
        fileName = doc[ "file" ]
        if debug:
            print "--------------------------------------------"
            print "fetching url:", url
        text = getOneDoc( url, keywords, debug )
        if debug: print text        
        
def main( debug=False ):
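    """Searches for a hard-coded keyword ("love"), retrieves the list of
    matching documents, and fetches an excerpt from each one."""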
    keywords = "love"
    text = getListOfDocs( keywords, debug )
    #if debug: print "text = ", text
    listOfDocs = parse( text, False )
    #if debug: print listOfDocs
    getAllDocs( listOfDocs, keywords, debug )
    
if __name__ == "__main__":
    main( True )