Tutorial 1: R and Simulation of Population Growth

From dftwiki3
Revision as of 08:37, 17 July 2015 by Thiebaut (talk | contribs)
Jump to: navigation, search

--D. Thiebaut (talk) 09:26, 17 July 2015 (EDT)



This changes the default colors in lattice plots.

knitr settings to control how R chunks work.

require(knitr) opts_chunk$set( tidy=FALSE, # display code as typed size="small" # slightly smaller font for code ) ```

Getting the data

# generate pop growth
#
# generatePop.py
# D. Thiebaut

import random

# define parameters
dataFileName = "pop%04d.dat"
maxT = 2            # how fast we progress in time
T = 3 * 31          # max time frame (3 months)
maxPop = 2400       # max # of students
proportion = 0.50   # how much of the population contributes
                    # to new cases of infection
oneBigFile = True
severalFiles = True

def generateOneInfectionHistory( Id ):
    global dataFileName
    
    # iterate and generate population
    pop = 0             # starting pop
    t = 0
    out = ""
    while t <= T:
        out += "%d, %d, %d\n" % ( Id, t, pop )
        if pop < maxPop / 2:
            pop += 1 + random.randrange( int( pop*proportion) +1 )
        else:
            pop += 1 + random.randrange( int( (maxPop - pop)*proportion) + 1 )
        pop = min( maxPop, pop )    
        t += 1 + random.randrange( maxT )
    return out

def main():
    allOut = "Id, time, pop\n"
    for i in range( 10 ):
        out = generateOneInfectionHistory( i+1 )       
        print( dataFileName % (i+1), "created" )
        allOut += out
        if severalFiles:
            out = "Id, time, pop\n" + out
            open( "pop%04d.dat" % (i+1), 'w' ).write( out )

    if oneBigFile:
            open( "pop%04d_%04d.dat" % (0,(i+1)), 'w' ).write( allOut )
        
        
main()

Reading the Data In

 
pop0000_0200 <- read.csv("~/Desktop/Dropbox/CVC2015_Workshop/pop50_0000_0200.dat")

This generates N different files, and 1 large file containing all the N files.
The format of a file is:

Id, time, pop
1, 0, 0
1, 2, 1
1, 3, 2
1, 5, 3
1, 7, 4
1, 9, 5
1, 11, 8
1, 12, 10
1, 13, 12
1, 15, 16
...
1, 84, 2400
1, 86, 2400
1, 88, 2400
1, 89, 2400
1, 90, 2400
1, 92, 2400
1, 93, 2400

Create Mean of Points

pop0000_0200avgCount <- pop0000_0200 %>% 
          group_by( time = 5*trunc(time/5)  ) %>% 
          summarise( avgPop = mean( pop ), count=n() )

Plot

Option 1: jagged mean

ggplot( data=pop0000_0200, aes( x = time, y = pop, color = Id ) ) + geom_point( ) +
              scale_colour_gradientn(colours=rainbow(4)) + 
              stat_summary(fun.y = mean, geom = 'line', color = 'blue' )

Option 2: smooth means, but data in bins

ggplot( data=pop0000_0200, aes( x = trunc(time/3)*3, y = pop, color = Id ) )  +
        geom_point( ) +  
        scale_colour_gradientn(colours=rainbow(4)) + 
        stat_summary(fun.y = mean, geom = 'smooth', color = 'blue', width=3 )

<h1 id="option-3-plotting-just-the-mean">Option 3: Plotting just the Mean</h1>

<source lang="text">
ggplot( data=pop0000_0200avgCount  ) + 
          geom_line(  aes( x=time, y=avgPop ), color=&#39;blue&#39; ) + 
          xlab( &#39;time&#39; ) + ylab( &#39;Infected Population&#39; )

Step 4: computing discrete slope

Add an additional variable (column) to pop0000_0200avgCount, call it slope, and fill it with the avgPop. Then run a for-loop and fill the new column with the value of the slope.

temp &lt;- mutate( pop0000_0200avgCount, slope=avgPop )
for ( i in 2: nrow( temp) ) { 
       temp[i,4] &lt;- temp[i,2]-temp[i-1,2] 
}
head( temp )

Display the resulting slopes as points:

ggplot(  ) + geom_line( data=temp, aes( x=time, y=avgPop ), color=&#39;blue&#39; ) + 
        geom_point( data=temp, aes(x=time, y=slope), color=&#39;red&#39; )

Compute the max slope:

summarize( temp, maxSlope=max( slope ) )

Shiny Application Ver. 1: Local Files

# app.R
# D. Thiebaut

library( &quot;ggplot2&quot; )
library( &quot;shiny&quot; )

data_sets &lt;- list()
data_sets[[&quot;pop10&quot;]] &lt;- read.csv(&quot;~/Desktop/Dropbox/CVC2015_Workshop/pop10_0000_0200.dat&quot;)
data_sets[[&quot;pop20&quot;]] &lt;- read.csv(&quot;~/Desktop/Dropbox/CVC2015_Workshop/pop20_0000_0200.dat&quot;)
data_sets[[&quot;pop30&quot;]] &lt;- read.csv(&quot;~/Desktop/Dropbox/CVC2015_Workshop/pop30_0000_0200.dat&quot;)
data_sets[[&quot;pop40&quot;]] &lt;- read.csv(&quot;~/Desktop/Dropbox/CVC2015_Workshop/pop40_0000_0200.dat&quot;)
data_sets[[&quot;pop50&quot;]] &lt;- read.csv(&quot;~/Desktop/Dropbox/CVC2015_Workshop/pop50_0000_0200.dat&quot;)
data_sets[[&quot;pop60&quot;]] &lt;- read.csv(&quot;~/Desktop/Dropbox/CVC2015_Workshop/pop60_0000_0200.dat&quot;)
data_sets[[&quot;pop70&quot;]] &lt;- read.csv(&quot;~/Desktop/Dropbox/CVC2015_Workshop/pop70_0000_0200.dat&quot;)
data_sets[[&quot;pop80&quot;]] &lt;- read.csv(&quot;~/Desktop/Dropbox/CVC2015_Workshop/pop80_0000_0200.dat&quot;)
data_sets[[&quot;pop90&quot;]] &lt;- read.csv(&quot;~/Desktop/Dropbox/CVC2015_Workshop/pop90_0000_0200.dat&quot;)


server &lt;- function( input, output ) {
  output$main_plot &lt;- renderPlot({
    
      ggplot( data=data_sets[[paste0(&quot;pop&quot;, input$n_breaks)]], 
              aes( x = time, y = pop, color = Id ) ) + 
              geom_point( ) +  
              scale_colour_gradientn(colours=rainbow(4)) + 
              stat_summary(fun.y = mean, geom = &#39;line&#39;, color = &#39;blue&#39; )
  } )
}

ui &lt;- fluidPage(
  selectInput(inputId = &quot;n_breaks&quot;,
              label = &quot;Population Growth (magic param):&quot;,
              choices = c(10, 20, 30, 40, 50, 60, 70, 80, 90),
              selected = 50),
  
  plotOutput(outputId = &quot;main_plot&quot;, height = &quot;300px&quot;)
)

shinyApp(ui = ui, server = server)

Shiny Application Ver. 2: Load Data File Dynamically

# app.R
# D. Thiebaut

library( &quot;ggplot2&quot; )
library( &quot;shiny&quot; )

server &lt;- function( input, output ) {
  
  dataSet &lt;- reactive( {
      fileName &lt;- paste0( &quot;~/Desktop/Dropbox/CVC2015_Workshop/pop&quot;, input$n_breaks, &quot;_0000_0200.dat&quot;)
      read.csv( fileName )
      } )
  
  output$main_plot &lt;- renderPlot({
      ggplot( data=dataSet(), 
              aes( x = time, y = pop, color = Id ) ) + 
              geom_point( ) +  
              scale_colour_gradientn(colours=rainbow(4)) + 
              stat_summary(fun.y = mean, geom = &#39;line&#39;, color = &#39;blue&#39; )
  } )
}

ui &lt;- fluidPage(
  selectInput(inputId = &quot;n_breaks&quot;,
              label = &quot;Population Growth (magic param):&quot;,
              choices = c(10, 20, 30, 40, 50, 60, 70, 80, 90),
              selected = 50),
  
  plotOutput(outputId = &quot;main_plot&quot;, height = &quot;300px&quot;)
)

shinyApp(ui = ui, server = server)

Shiny Application Ver. 3: Grab file dynamically from Web Page

Simply create the dataset differently. The URL for the data is http://hadoop0.dyndns.org/R/

Index of /R

[ICO]   Name    Last modified   Size    Description
[PARENTDIR] Parent Directory        -    
[TXT]   generatePop.py  2015-07-16 09:21    1.8K     
[   ]   pop10_0000_0200.dat 2015-07-16 11:45    141K     
[   ]   pop20_0000_0200.dat 2015-07-16 09:22    146K     
[   ]   pop30_0000_0200.dat 2015-07-16 09:21    150K     
[   ]   pop40_0000_0200.dat 2015-07-16 09:22    153K     
[   ]   pop50_0000_0200.dat 2015-07-16 09:21    154K     
[   ]   pop60_0000_0200.dat 2015-07-16 09:22    155K     
[   ]   pop70_0000_0200.dat 2015-07-16 09:22    156K     
[   ]   pop80_0000_0200.dat 2015-07-16 09:22    157K     
[   ]   pop90_0000_0200.dat 2015-07-16 09:22    157K     
Apache/2.4.7 (Ubuntu) Server at hadoop0.dyndns.org Port 80

And we just need to change how the data-set is created:

dataSet &lt;- reactive( { fileName &lt;- paste0( &quot;http://hadoop0.dyndns.org/R/pop&quot;, input$n_breaks, &quot;_0000_0200.dat&quot;) read.csv( url( fileName ) ) } )

Shiny Application Ver. 4: Grab file dynamically from cgi-script

Steps

  • add cgi-bin capability to Apache server on Hadoop0
  • create python cgi-bin script
  • slightly modify how to get the data from a file name created using the number generated in the input widget.
  • the new URL is http://hadoop0.dyndns.org/cgi-bin/generatePop.py?param=50
  • change 50 to some int between 1 and 99.
#! /usr/bin/env python3
# D. Thiebaut
# generate pop growth
#

import random, sys
import cgi

#--- cgi setup ---
print( &quot;Content-Type: text/plain&quot; )
print()

#--- define global parameters ---
dataFileName = &quot;pop%02d_%04d.dat&quot;
maxT = 2            # how fast we progress in time
T = 3 * 31          # max time frame (3 months)
maxPop = 2400       # max # of students
proportion = 0.50   # how much of the population contributes
                    # to new cases of infection
noFiles      = 200                    
oneBigFile   = True
severalFiles = False
printOut     = True


def getParams():
    &quot;&quot;&quot; get parameters from URL&quot;&quot;&quot;
    dico = {}
    arguments = cgi.FieldStorage()
    for i in arguments.keys():
        #print( i, &quot;--&gt;&quot;, arguments[i].value )
        dico[i] = arguments[i].value
    return dico

def getProportion():
    &quot;&quot;&quot; get proportion parameter from URL&quot;&quot;&quot;
    dico = getParams()
    try:
        return int(dico[&quot;param&quot;])/100.0
    except:
        return 0.50   # default value if nothing is passed
                      # in URL


def generateOneInfectionHistory( Id ):
    &quot;&quot;&quot;generate 1 semester worth of data, showing increase
    of infected students population as a function of days
    (1 semester max)&quot;&quot;&quot;
    
    global dataFileName
    
    # iterate and generate population
    pop = 0             # starting pop
    t = 0
    out = &quot;&quot;
    while t &lt;= T:
        out += &quot;%d, %d, %d\n&quot; % ( Id, t, pop )
        if pop &lt; maxPop / 2:
            pop += 1 + random.randrange( int( pop*proportion) +1 )
        else:
            pop += 1 + random.randrange( int( (maxPop - pop)*proportion) + 1 )
        pop = min( maxPop, pop )    
        t += 1 + random.randrange( maxT )
    return out


def main():
    global noFiles, proportion

    # get proportion parameter from URL
    proportion = getProportion()

    allOut = &quot;Id, time, pop\n&quot;
    if printOut:
        print( allOut, end=&quot;&quot; )

    for i in range( noFiles ):
        out = generateOneInfectionHistory( i+1 )       
        if printOut: 
            print( out, end=&quot;&quot; )

        allOut += out
        if severalFiles:
            out = &quot;Id, time, pop\n&quot; + out
            open( &quot;pop%02d_%04d.dat&quot; % (int(proportion*100),i+1), &#39;w&#39; ).write( out )
            #print( dataFileName % (int(proportion*100), i+1), &quot;created&quot; )

    if oneBigFile:
        open( &quot;pop%02d_%04d_%04d.dat&quot; % (int(proportion*100),0,(i+1)), &#39;w&#39; ).write( allOut )


        
main()

New shiny file.


# app.R
# D. Thiebaut
# reads data from files on a Web server
#
library( &quot;ggplot2&quot; )
library( &quot;shiny&quot; )

server &lt;- function( input, output ) {
  
  dataSet &lt;- reactive( {
    fileName &lt;- paste0( &quot;http://hadoop0.dyndns.org/cgi-bin/generatePop.py?param=&quot;, input$n_breaks )
    read.csv( url( fileName ) )
  } )
  
  output$main_plot &lt;- renderPlot({
    ggplot( data=dataSet(), 
            aes( x = time, y = pop, color = Id ) ) + 
      geom_point( ) +  
      scale_colour_gradientn(colours=rainbow(4)) + 
      stat_summary(fun.y = mean, geom = &#39;line&#39;, color = &#39;blue&#39; )
  } )
}

ui &lt;- fluidPage(
  selectInput(inputId = &quot;n_breaks&quot;,
              label = &quot;Population Growth (magic param):&quot;,
              choices = c(10, 20, 30, 40, 50, 60, 70, 80, 90),
              selected = 50),
  
  plotOutput(outputId = &quot;main_plot&quot;, height = &quot;300px&quot;)
)

shinyApp(ui = ui, server = server)

Shiny Application Ver. 4: Slider, Grab file dynamically from cgi-script

<p>Just change the UI.

# app.R
# D. Thiebaut
# reads data from files on a Web server using cgi-bin
# uses a slider
library( &quot;ggplot2&quot; )
library( &quot;shiny&quot; )

server &lt;- function( input, output ) {
  
  dataSet &lt;- reactive( {
    fileName &lt;- paste0( &quot;http://hadoop0.dyndns.org/cgi-bin/generatePop.py?param=&quot;, input$n_breaks )
    read.csv( url( fileName ) )
  } )
  
  output$main_plot &lt;- renderPlot({
    ggplot( data=dataSet(), 
            aes( x = time, y = pop, color = Id ) ) + 
      geom_point( ) +  
      scale_colour_gradientn(colours=rainbow(4)) + 
      stat_summary(fun.y = mean, geom = &#39;line&#39;, color = &#39;blue&#39; )
  } )
}

ui &lt;- fluidPage(
  titlePanel( &quot;Infected Population&quot;, windowTitle = &quot;Growth of Infected Population&quot; ),
  
  sliderInput( inputId = &quot;n_breaks&quot;, label = &quot;Population Growth (magic param):&quot;,
               min = 1, max = 99, step = 0.5, value = 50 ),
  
  plotOutput(outputId = &quot;main_plot&quot;, height = &quot;300px&quot;)
)

shinyApp(ui = ui, server = server)

Shiny Application Ver. 5: adding an input widget


# app.R
# D. Thiebaut
# reads data from files on a Web server using cgi-bin
# uses a slider
library( &quot;ggplot2&quot; )
library( &quot;shiny&quot; )

server &lt;- function( input, output ) {
  
  dataSet &lt;- reactive( {
    fileName &lt;- paste0( &quot;http://hadoop0.dyndns.org/cgi-bin/generatePop.py?proportion=&quot;, input$n_breaks,
                        &quot;&amp;simulations=&quot;, input$noSimulations )
    read.csv( url( fileName ) )
  } )
  
  output$main_plot &lt;- renderPlot({
    ggplot( data=dataSet(), 
            aes( x = time, y = pop, color = Id ) ) + 
      geom_point( ) +  
      scale_colour_gradientn(colours=rainbow(4)) + 
      stat_summary(fun.y = mean, geom = &#39;line&#39;, color = &#39;blue&#39; )
  } )
}

ui &lt;- fluidPage(

  titlePanel( &quot;Infected Population&quot;, windowTitle = &quot;Growth of Infected Population&quot; ),
  
  mainPanel( h2( &quot;Description&quot;), p( &quot;This graph shows the result of 200 simulations of the growth a population of infected students on a campus, as a function of some &#39;magic&#39; parameter controlled by the slider.&quot;),
             p( &quot;The points show the growth resulting from the 200 simulations, and the line shows the average of the points over bins of 3 time periods.&quot; ),
             p( &quot;The data are read from a URL where a server generates data on the fly.  The value of the slider is sent as a suffix to the URL (e.g. http://hadoop0.dyndns.org/cgi-bin/generatePop.py?param=71) and the server generates 200 different simulations.&quot; )
                
  ),
  selectInput(inputId = &quot;noSimulations&quot;,
              label = &quot;Number of Simulations:&quot;,
              choices = c(20, 100, 250, 500, 1000),
              selected = 250),
  
  sliderInput( inputId = &quot;n_breaks&quot;, label = &quot;Population Growth (magic param):&quot;,
               min = 1, max = 99, step = 0.5, value = 50 ),
  
  plotOutput(outputId = &quot;main_plot&quot;, height = &quot;300px&quot;)
)

shinyApp(ui = ui, server = server)

Publishing to ShinyApps.io