From c0c3024eeb2eaedad1cd5d171f7647a9e82d0e01 Mon Sep 17 00:00:00 2001 From: famulare Date: Fri, 30 Aug 2019 22:27:43 +0000 Subject: [PATCH] Fixed performance issue in #71 --- dbViewR/R/selectFromDB.R | 46 ++++++++++++++----------------------- dbViewR/man/selectFromDB.Rd | 2 +- 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/dbViewR/R/selectFromDB.R b/dbViewR/R/selectFromDB.R index 21d9e0b..2fd4354 100644 --- a/dbViewR/R/selectFromDB.R +++ b/dbViewR/R/selectFromDB.R @@ -38,7 +38,7 @@ selectFromDB <- function( queryIn = jsonlite::toJSON( GROUP_BY =list(COLUMN=c('encountered_week','residence_puma','residence_census_tract')), SUMMARIZE=list(COLUMN='pathogen', IN= c('h1n1pdm')) ) - ), source = 'simulated_data', + ), source = 'production', credentials_path = '/home/rstudio/seattle_flu', na.rm = FALSE ){ @@ -71,12 +71,7 @@ selectFromDB <- function( queryIn = jsonlite::toJSON( # defined by the environment variables above. rawData <- DBI::dbConnect(RPostgres::Postgres(), service = "seattleflu-production") - db <- DBI::dbGetQuery(rawData, "select distinct * from shipping.incidence_model_observation_v1;") - - # fake pathogen field until db is ready - # if (!('pathogen' %in% names(db))){ - # db$pathogen <- 'unknown' - # } + db <- DBI::dbGetQuery(rawData, "select distinct * from shipping.incidence_model_observation_v2;") # db <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.incidence_model_observation_v1 encounter', # 'left join shipping.presence_absence_result_v1 taq', @@ -86,42 +81,35 @@ selectFromDB <- function( queryIn = jsonlite::toJSON( # this logic should be substantially rethought, as I'm mixing sql and dplyr in confusing ways, but it will have to do for now! # get all samples and nest - db3 <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.presence_absence_result_v1', + db2 <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.presence_absence_result_v1', ';'),sep=' ') - names(db3)[names(db3)=='target'] <- 'pathogen' + names(db2)[names(db2)=='target'] <- 'pathogen' - db3 <- db3 %>% group_by(sample) %>% - mutate(number_pathogens_found = sum(present), number_pathogens_tested = n()) %>% - filter(present == TRUE | number_pathogens_found==0) %>% - group_by(sample,number_pathogens_found,number_pathogens_tested) %>% - tidyr::nest() - - for (k in which(db3$number_pathogens_found==0)){ - db3$data[[k]] <- tibble(pathogen='undetected',present=TRUE) - } + # count pathogens found and tests performed + db2 <- db2 %>% group_by(sample) %>% + mutate(number_pathogens_found = sum(present), number_pathogens_tested = n()) - names(db3)[names(db3) == 'data'] <- 'pathogens_found' + # add in "undetected" pathogen for samples that were tested but had no detections + db3 <- db2 %>% group_by(sample) %>% filter(all(present == FALSE) & all(number_pathogens_tested>0)) %>% + summarize(pathogen = 'undetected') %>% mutate(present=TRUE) + # join undetecteds with positives + db4 <- bind_rows(db2 %>% filter(present == TRUE), db3) # join with encounter list, using nice formatting - db <- db %>% left_join(db3) + db <- db %>% left_join(db4) + + # put in "not_yet_tested" for samples with no test results idx<-is.na(db$number_pathogens_tested) db$number_pathogens_tested[idx] <- 0 - for (k in which(idx)){ - db$pathogens_found[[k]] <- tibble(pathogen='not_yet_tested',present=TRUE) - } - - db <- db %>% tidyr::unnest() # nice flat file like simulated data, but with repeated encounters for multiple positives + db$pathogen[idx] <- 'not_yet_tested' + db$present[idx] <- TRUE - ## wacky thing in census tract - db$residence_census_tract <- sub('\\.0','',db$residence_census_tract) - db$work_census_tract <- sub('\\.0','',db$work_census_tract) DBI::dbDisconnect(rawData) - } else { print('unknown source database!') } diff --git a/dbViewR/man/selectFromDB.Rd b/dbViewR/man/selectFromDB.Rd index b999d80..6b3d182 100644 --- a/dbViewR/man/selectFromDB.Rd +++ b/dbViewR/man/selectFromDB.Rd @@ -10,7 +10,7 @@ selectFromDB(queryIn = jsonlite::toJSON(list(SELECT = list(COLUMN = "residence_census_tract")), GROUP_BY = list(COLUMN = c("encountered_week", "residence_puma", "residence_census_tract")), SUMMARIZE = list(COLUMN = "pathogen", IN = c("h1n1pdm")))), - source = "simulated_data", + source = "production", credentials_path = "/home/rstudio/seattle_flu", na.rm = FALSE) } \arguments{