--D. Thiebaut 14:24, 11 March 2010 (UTC)
The solutions I picked were submitted by Yang and by Diana.
Analysis

Two reports: pdf1 (CSC352Project1Yang.pdf) and pdf2 (CSC352Project1Diana.pdf).
Source Files
Multiprocessing
#! /usr/bin/python
"""
proj1MultiProc.py
CSC352 Spring 2010
Yang Li
This is a multiprocessing program that retrieves 20 files
containing the searched term and analyze the word frequency
of each retrieved file.
It scores each file by the frequency ranking of the search term
among other words in that file. The search results are ordered in decreasing scores.
To run:
chmod +x proj1MultiProc.py
./proj1MultiProc.py <keyword>
"""
import urllib, urllib2
import multiprocessing
import textwrap,re,sys
MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"
VERSION = """Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)
Gecko/20071127 Firefox/2.0.0.11"""
HEADERS = {"User-Agent" : VERSION }
#list of stopwords
STOPWORDS = ['a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'but', 'by', 'can',
'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for',
'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how',
'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may',
'me', 'might', 'most', 'must', 'my', 'neither', 'no', 'nor', 'not', 'of', 'off', 'often',
'on', 'only', 'or', 'other', 'our', 'own', 'rather', 'said', 'say', 'says', 'she', 'should',
'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these',
'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was', 'we', 'were', 'what', 'when',
'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your']
def contentSearch(f_url, keywd, queue):
    """
    Fetch the file at f_url, extract the context around the keyword,
    score the document, and store the result in the queue.
    f_url: file url
    keywd: search keyword
    queue: multiprocessing queue object that stores the output
    """
    output = "fetching url: " + f_url + "\n"
    # read the file
    f_req = urllib2.Request(f_url, None, HEADERS)
    try:
        f = urllib2.urlopen(f_req)
    except urllib2.URLError:
        return
    text = f.read()

    # find the keyword
    i = text.find(keywd)
    if i != -1:
        # keep up to 50 characters of context on each side of the keyword
        start = 0
        end = -1
        if i >= 50:
            start = i - 50
        if len(text) > i + 50:
            end = i + 50
        output += "\n".join(textwrap.wrap(" ".join(text[start:end].split()[1:-1]), 40))

    # score the document and store the result
    relevance = score(text, keywd)
    queue.put([relevance, output])
    f.close()
def score(text, keyword):
    """
    Compute word frequencies and the document score.
    text: string of text to be analyzed
    keyword: search keyword
    """
    freqDict = dict()                  # dictionary of word-frequency pairs
    wordlist = re.split(r'\W+', text)  # split text into words, ignoring punctuation
    for word in wordlist:
        if word in STOPWORDS:
            # ignore stop words
            continue
        if word not in freqDict:
            freqDict[word] = 1
        else:
            freqDict[word] = freqDict[word] + 1

    # build a list of [frequency, word] pairs and sort it in decreasing order
    freqList = []
    for word, freq in freqDict.iteritems():
        freqList.append([freq, word])
    freqList.sort()
    freqList.reverse()

    # the page score is 1/(1+rank), where rank is the position of the
    # keyword in the frequency-sorted list; -1 if the keyword is absent
    sc = -1
    for i in range(len(freqList)):
        if freqList[i][1] == keyword:
            sc = 1.0 / (1 + i)
            break
    return sc
""" Main class """
def main():
if len( sys.argv ) > 1:
keyword = sys.argv[1]
else:
print "Usage: ./multiprocessRetrieveEText.py <keyword1,keyword2,...>"
quit()
url = MAINURL
args = { "search" : keyword }
req = urllib2.Request( url, urllib.urlencode( args ), HEADERS )
result = urllib2.urlopen( req )
lines = result.readlines()
result.close()
# list of processes
list = [];
queue = multiprocessing.Queue(20);
# read search result and create a process for each url
for line in lines:
if line.find( "<br>url:" ) != -1:
f_url = line.split( "url:" )[1].strip()
p = multiprocessing.Process(target=contentSearch,args=(f_url,keyword,queue))
p.start()
list.append(p)
# wait for all processes to finish
for proc in list:
proc.join()
# print output
result = []
while not queue.empty():
result.append(queue.get())
result.sort()
result.reverse()
for i in range(len(result)):
print "\n---------------------------"
print "Rank: %d Score: %f"% (i+1,result[i][0])
print result[i][1]
if i<20:
print "\n==========================="
print str(20-i)+" url(s) can not be opened.\n"
print "\nDone"
return 0
if __name__=="__main__":
main()
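The scoring rule described in the docstring (rank the search term among the document's non-stopword frequencies and score the page as 1/(1+rank)) can be tried in isolation. Below is a minimal, standalone sketch of that idea in Python 2, like the programs on this page; the function name toy_score, the sample sentence, and the tiny stop-word list are made up for illustration and are not part of the submitted solutions.

#! /usr/bin/python
# toyScore.py -- illustration only; sample text and stop words are made up
import re

def toy_score(text, keyword, stopwords):
    # count how often each non-stopword appears
    freq = {}
    for word in re.split(r'\W+', text.lower()):
        if word and word not in stopwords:
            freq[word] = freq.get(word, 0) + 1
    # rank words by decreasing frequency; score = 1/(1+rank) of the keyword
    ranked = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
    for rank, (word, count) in enumerate(ranked):
        if word == keyword:
            return 1.0 / (1 + rank)
    return -1

if __name__ == "__main__":
    text = "the whale hunted the whale while the ship followed the whale"
    print toy_score(text, "whale", ["the", "while"])   # prints 1.0: 'whale' is the most frequent word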
Threading
#! /usr/bin/python
"""
proj1Thread.py
CSC352 Spring 2010
Yang Li
This is a multi-threaded program that retrieves 20 files
containing the searched term and analyze the word frequency
of each retrieved file.
It scores each file by the frequency ranking of the search term
among other words in that file. The search results are ordered in decreasing scores.
To run:
chmod +x proj1Thread.py
./proj1Thread.py <keyword>
"""
import urllib, urllib2
import textwrap
import sys, re
from threading import Thread
from Queue import Queue # use "queue" lowercase in Python 3
MAINURL = "http://xgridmac.dyndns.org/~thiebaut/swish-e/swishe.php"
VERSION = """Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)
Gecko/20071127 Firefox/2.0.0.11"""
HEADERS = {"User-Agent" : VERSION }
#list of stopwords
STOPWORDS = ['a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'but', 'by',
'can', 'cannot', 'could', 'dear', 'did', 'do', 'does', 'either', 'else', 'ever', 'every',
'for', 'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how',
'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may',
'me', 'might', 'most', 'must', 'my', 'neither',
'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own', 'rather',
'said', 'say', 'says', 'she', 'should', 'since', 'so', 'some', 'than', 'that', 'the', 'their',
'them', 'then', 'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas', 'us', 'wants', 'was',
'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will',
'with', 'would', 'yet', 'you', 'your']
"""extended thread class"""
class urlRetrieverThread( Thread ):
def __init__(self,f_url,keyword,queue):
Thread.__init__(self)
self.f_url = f_url; #file url
self.keyword = keyword; #search keyword
self.queue=queue; #multiprocessing queue that stores output
def run(self):
"""
Fetch file from url, find context of keyword and compute scores
"""
output="fetching url: "+self.f_url+"\n"
#read each file
f_req = urllib2.Request(self.f_url,None,HEADERS)
try:
f = urllib2.urlopen( f_req )
except urllib2.URLError:
return
text = f.read()
#find keyword
i = text.find(self.keyword)
if i != -1:
#find context
start=0
end=-1
if i>=50:
start=i-50
if len(text)>i+50:
end=i+50
output+="\n".join(textwrap.wrap(" ".join(text[start:end].split()[1:-1]),40))
#find score
relavence = self.score(text)
self.queue.put([relavence,output])
f.close()
    def score(self, text):
        """
        Compute word frequencies and the document score.
        text: string of text to be analyzed
        """
        freqDict = dict()                  # dictionary of word-frequency pairs
        wordlist = re.split(r'\W+', text)  # split text into words, ignoring punctuation
        for word in wordlist:
            if word in STOPWORDS:
                # ignore stop words
                continue
            if word not in freqDict:
                freqDict[word] = 1
            else:
                freqDict[word] = freqDict[word] + 1

        # build a list of [frequency, word] pairs and sort it in decreasing order
        freqList = []
        for word, freq in freqDict.iteritems():
            freqList.append([freq, word])
        freqList.sort()
        freqList.reverse()

        # the page score is 1/(1+rank), where rank is the position of the
        # keyword in the frequency-sorted list; -1 if the keyword is absent
        sc = -1
        for i in range(len(freqList)):
            if freqList[i][1] == self.keyword:
                sc = 1.0 / (1 + i)
                break
        return sc
""" Main class """
def main():
if len( sys.argv ) > 1:
keyword = sys.argv[1]
print "Search for keyword:",keyword,"...\n"
else:
print "Usage: ./multiprocessRetrieveEText.py <keyword1,keyword2,...>"
quit()
url = MAINURL
args = { "search" : keyword }
req = urllib2.Request( url, urllib.urlencode( args ), HEADERS )
result = urllib2.urlopen( req )
lines = result.readlines()
result.close()
# list of threads
list = [];
queue = Queue(20);
# read search result and create a process for each url
for line in lines:
if line.find( "<br>url:" ) != -1:
f_url = line.split( "url:" )[1].strip()
t = urlRetrieverThread(f_url,keyword,queue)
t.start()
list.append(t)
# wait for all processes to finish
for t in list:
t.join()
# print output
result = []
while not queue.empty():
result.append(queue.get())
result.sort()
result.reverse()
for i in range(len(result)):
print "\n---------------------------"
print "Rank: %d Score: %f"% (i+1,result[i][0])
print result[i][1]
if i<20:
print "\n==========================="
print str(20-i)+" url(s) can not be opened.\n"
print "\nDone"
return 0
if __name__=="__main__":
main()
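Both versions trim the context shown for each hit the same way: up to 50 characters on either side of the first occurrence of the keyword, with the first and last (possibly clipped) words dropped, and the remainder re-wrapped to 40 columns. The following is a minimal sketch of just that step; the function name context and the sample sentence are made up for illustration and mirror, but are not part of, the programs above.

#! /usr/bin/python
# contextSketch.py -- illustration of the context-window step; sample text is made up
import textwrap

def context(text, keyword, halfwidth=50, columns=40):
    i = text.find(keyword)
    if i == -1:
        return ""
    # same bounds as the programs above: 50 characters on each side,
    # end stays -1 when the text is too short (as in the originals)
    start = i - halfwidth if i >= halfwidth else 0
    end = i + halfwidth if len(text) > i + halfwidth else -1
    # drop the first and last words, which may be clipped mid-word,
    # then re-wrap the remaining context to the given column width
    words = text[start:end].split()[1:-1]
    return "\n".join(textwrap.wrap(" ".join(words), columns))

if __name__ == "__main__":
    sample = "Call me Ishmael. Some years ago, never mind how long precisely, I went to sea."
    print context(sample, "years")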