Fixed performance issue in #71

2019-08-30 22:27:43 +00:00 · 2019-08-30 22:27:43 +00:00 · c0c3024eeb
--- a/dbViewR/R/selectFromDB.R
+++ b/dbViewR/R/selectFromDB.R
@ -38,7 +38,7 @@ selectFromDB <- function( queryIn = jsonlite::toJSON(
                              GROUP_BY =list(COLUMN=c('encountered_week','residence_puma','residence_census_tract')),
                              SUMMARIZE=list(COLUMN='pathogen', IN= c('h1n1pdm'))
                            )
-                          ), source = 'simulated_data', 
+                          ), source = 'production', 
                          credentials_path = '/home/rstudio/seattle_flu',
                          na.rm = FALSE
                          ){
@ -71,12 +71,7 @@ selectFromDB <- function( queryIn = jsonlite::toJSON(
    # defined by the environment variables above.
    rawData <- DBI::dbConnect(RPostgres::Postgres(), service = "seattleflu-production")
-    db <- DBI::dbGetQuery(rawData, "select distinct * from shipping.incidence_model_observation_v1;")
+    db <- DBI::dbGetQuery(rawData, "select distinct * from shipping.incidence_model_observation_v2;")
    # fake pathogen field until db is ready
    # if (!('pathogen' %in% names(db))){
    #   db$pathogen <- 'unknown'
    # }
    # db <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.incidence_model_observation_v1 encounter',
    #                                      'left join shipping.presence_absence_result_v1 taq',
@ -86,42 +81,35 @@ selectFromDB <- function( queryIn = jsonlite::toJSON(
    # this logic should be substantially rethought, as I'm mixing sql and dplyr in confusing ways, but it will have to do for now!
    # get all samples and nest
-    db3 <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.presence_absence_result_v1',
+    db2 <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.presence_absence_result_v1',
                                          ';'),sep=' ') 
-    names(db3)[names(db3)=='target'] <- 'pathogen'
+    names(db2)[names(db2)=='target'] <- 'pathogen'
-    db3 <- db3 %>% group_by(sample) %>%
+    # count pathogens found and tests performed
-      mutate(number_pathogens_found = sum(present), number_pathogens_tested = n()) %>% 
+    db2 <- db2 %>% group_by(sample) %>%
-      filter(present == TRUE | number_pathogens_found==0) %>%
+      mutate(number_pathogens_found = sum(present), number_pathogens_tested = n())
      group_by(sample,number_pathogens_found,number_pathogens_tested) %>% 
      tidyr::nest() 
      for (k in which(db3$number_pathogens_found==0)){
        db3$data[[k]] <- tibble(pathogen='undetected',present=TRUE)
      }
-    names(db3)[names(db3) == 'data'] <- 'pathogens_found'    
+    # add in "undetected" pathogen for samples that were tested but had no detections
    db3 <- db2 %>% group_by(sample) %>% filter(all(present == FALSE) &  all(number_pathogens_tested>0)) %>%
      summarize(pathogen = 'undetected') %>% mutate(present=TRUE)
    # join undetecteds with positives
    db4 <- bind_rows(db2 %>% filter(present == TRUE), db3)
    # join with encounter list, using nice formatting
-    db <- db %>% left_join(db3)
+    db <- db %>% left_join(db4) 
    # put in "not_yet_tested" for samples with no test results
    idx<-is.na(db$number_pathogens_tested)
    db$number_pathogens_tested[idx] <- 0
-    for (k in which(idx)){
+    db$pathogen[idx] <- 'not_yet_tested'
-      db$pathogens_found[[k]] <- tibble(pathogen='not_yet_tested',present=TRUE)
+    db$present[idx] <- TRUE
    }
    db <- db %>% tidyr::unnest()  # nice flat file like simulated data, but with repeated encounters for multiple positives
    ## wacky thing in census tract
    db$residence_census_tract <- sub('\\.0','',db$residence_census_tract)
    db$work_census_tract <- sub('\\.0','',db$work_census_tract)
    DBI::dbDisconnect(rawData)
  } else {
     print('unknown source database!')
  }
--- a/dbViewR/man/selectFromDB.Rd
+++ b/dbViewR/man/selectFromDB.Rd
@ -10,7 +10,7 @@ selectFromDB(queryIn = jsonlite::toJSON(list(SELECT = list(COLUMN =
  "residence_census_tract")), GROUP_BY = list(COLUMN =
  c("encountered_week", "residence_puma", "residence_census_tract")),
  SUMMARIZE = list(COLUMN = "pathogen", IN = c("h1n1pdm")))),
-  source = "simulated_data",
+  source = "production",
  credentials_path = "/home/rstudio/seattle_flu", na.rm = FALSE)
 }
 \arguments{