CSC111 Homework 11 2018

 
[[User:Thiebaut|D. Thiebaut]] ([[User talk:Thiebaut|talk]]) 20:58, 19 April 2018 (EDT)
----
<onlydft>
 
=Make-Up Homework 11=
<br />
<bluebox>
This homework is due on Thursday, May 3rd, at 11:55 p.m.  It is an optional make-up homework: you can use it to replace the lowest grade you have received on Homework Assignments 1 to 10.
 
<br />
Below is the algorithm explaining how it will count toward your homework average grade.
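As an illustration of the general idea only, here is a minimal sketch, assuming the rule is simply that the make-up grade replaces your lowest grade from Homework Assignments 1 to 10 when it is higher; the function name <tt>applyMakeUp</tt> and the grades shown are made up for this example.

<source lang="python">
def applyMakeUp( grades, makeUpGrade ):
    '''grades is the list of Homework 1 to 10 grades.  Returns a new
    list in which the lowest grade is replaced by the make-up grade,
    but only if the make-up grade is higher.'''
    newGrades = grades[:]                              # copy the list
    lowestIndex = newGrades.index( min( newGrades ) )  # locate the lowest grade
    if makeUpGrade > newGrades[ lowestIndex ]:
        newGrades[ lowestIndex ] = makeUpGrade
    return newGrades

# made-up example: the lowest grade (62) is replaced by the make-up grade (85)
print( applyMakeUp( [ 90, 75, 62, 88, 100, 95, 70, 81, 77, 93 ], 85 ) )
</source>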
=Assignment=
<br />
* Write a program that processes a csv file that has been downloaded from the Department of Education of the U.S. government.  It contains the scorecards for 7,594 colleges and universities in the United States.  Each line of the csv file represents one college/university; this is what is called a ''scorecard''.  Each scorecard contains 123 fields.  The '''city''' where the college/university is located is the field at Index 4, and the '''state''' is in the field at Index 5 (a short illustrative sketch follows this list).
 
* The original file was downloaded from https://catalog.data.gov/dataset?res_format=CSV and is mirrored here: http://cs.smith.edu/~dthiebaut/111/collegeScorecard.csv
* <font color="red">You should use the Smith College URL in your program.</font>
 
* Your program should output the 10 cities that contain the largest number of colleges and universities.

* The output should be formatted as follows:
 
* Submit your program as hw11.py in the Homework 11 section on Moodle.

<br />
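Below is a minimal sketch of how the city and state fields can be pulled out of one line of the csv file, assuming the line is a plain comma-separated record with no quoted commas; the sample line is made up for illustration and is not taken from the real file.

<source lang="python">
# made-up scorecard line, shortened to a few fields: the city is the
# field at Index 4 and the state is the field at Index 5.
line = "100654,0100200,Sample College,www.example.edu,Normal,AL,4.0"

fields = line.split( ',' )      # split the line on commas
city   = fields[ 4 ].strip()    # field at Index 4
state  = fields[ 5 ].strip()    # field at Index 5

print( city + ", " + state )    # Normal, AL
</source>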
==Warning==
<br />
* Downloading the csv file from the URL may take between 3 and 30 seconds, depending on how busy the network and the server are (see the optional caching sketch below).

<br />
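Because the download can be slow, one optional refinement is to fetch the file only once and reuse the local copy on later runs.  This is not required by the assignment; the sketch below is one possible approach, and the helper name <tt>downloadIfNeeded</tt> is made up for this example.

<source lang="python">
import os.path
from urllib.request import urlopen

URL = "http://cs.smith.edu/~dthiebaut/111/collegeScorecard.csv"

def downloadIfNeeded( url, fileName ):
    '''downloads the csv file from the Web only if a local
    copy does not already exist.'''
    if os.path.exists( fileName ):
        return                                    # reuse the local copy
    text = urlopen( url ).read().decode( 'utf-8' )
    open( fileName, "w" ).write( text )

downloadIfNeeded( URL, "collegeScorecard.csv" )
</source>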
</onlydft>
 
<showafterdate after="20180504 12:00" before="20180601 00:00">
<br />


=Solution Program=
<br />


<source lang="python">
# collegeScorecard.py
# https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-Scorecard-Elements.csv

URL = "http://cs.smith.edu/~dthiebaut/111/collegeScorecard.csv"

def getLines( fileName ):
    '''Given a file name, reads the file and returns
    all the lines contained in the file'''
    file = open( fileName, 'r' )
    lines = file.read() 
    file.close()
    lines = lines.split( "\n" )
    return lines

def getURLWriteToFile( URL, outputFileName ):
    '''Given a URL, gets the text stored in the file on the Web
    and saves it into a local file'''
    from urllib.request import urlopen  # library for fetching
                                        # Web pages 
    # open the URL
    response = urlopen(URL)

    # get its contents, and store it in a string called text
    text = response.read().decode('utf-8')

    # save the string to file
    open( outputFileName, "w" ).write( text )

def main():
    # get the file from the URL, and save it locally
    getURLWriteToFile( URL, "collegeScorecard.csv" )

    # get the lines from the local file
    lines = getLines( "collegeScorecard.csv" )

    # get some information about the file
    header = lines[0].split( ',' )
    noFields = len( header )

    # indices of fields of interest
    cityIndex = 4
    stateIndex = 5

    # create a dictionary for counting the number of
    # times each city and state appears in a line.
    # 
    cityStateDico = {}
    for line in lines[1: ]:
        try:
            city = line.split(',')[cityIndex].strip()
            state = line.split(',')[stateIndex].strip()
        except:
            continue

        # skip invalid states
        if len( state ) != 2:
            continue

        # create a string containing both city and state
        cityState = city + ", " + state

        # if it's not in directory, add it, otherwise
        # increment the counter associated with it
        if cityState not in cityStateDico:
            cityStateDico[ cityState ] = 1
        else:
            cityStateDico[ cityState ] += 1

    # create a list of tuples for all the cities, with the
    # counter first, and then the city name.
    listCities = []
    for cityState in cityStateDico.keys():
        listCities.append( (cityStateDico[cityState], cityState ) )

    # sort and reverse the list
    listCities.sort()
    listCities.reverse()

    # display the first 10
    for i in range( 10 ):
        print( listCities[i][0], listCities[i][1] )
        
        
main()

</source>
<br />
==Output==
<br />
<source lang="text">
91 New York, NY
76 Chicago, IL
74 Houston, TX
59 Los Angeles, CA
52 San Antonio, TX
50 Miami, FL
48 Brooklyn, NY
46 Philadelphia, PA
42 Atlanta, GA
40 Dallas, TX
</source>
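One note on the design: the program stores the counter first in each <tt>(count, city)</tt> tuple so that a plain <tt>sort()</tt> orders the list by count, since Python compares tuples element by element.  The small example below, with made-up counts and cities, shows the effect of the sort-and-reverse step.

<source lang="python">
# made-up (count, city) tuples, only to show how the sort behaves
listCities = [ (3, "Amherst, MA"), (10, "Boston, MA"), (7, "Springfield, MA") ]

listCities.sort()       # compares the first element of each tuple first
listCities.reverse()    # largest counts come first

print( listCities )     # [(10, 'Boston, MA'), (7, 'Springfield, MA'), (3, 'Amherst, MA')]
</source>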


</showafterdate>