Fixed performance issue in #71
This commit is contained in:
Родитель
b42578ee99
Коммит
c0c3024eeb
|
@ -38,7 +38,7 @@ selectFromDB <- function( queryIn = jsonlite::toJSON(
|
||||||
GROUP_BY =list(COLUMN=c('encountered_week','residence_puma','residence_census_tract')),
|
GROUP_BY =list(COLUMN=c('encountered_week','residence_puma','residence_census_tract')),
|
||||||
SUMMARIZE=list(COLUMN='pathogen', IN= c('h1n1pdm'))
|
SUMMARIZE=list(COLUMN='pathogen', IN= c('h1n1pdm'))
|
||||||
)
|
)
|
||||||
), source = 'simulated_data',
|
), source = 'production',
|
||||||
credentials_path = '/home/rstudio/seattle_flu',
|
credentials_path = '/home/rstudio/seattle_flu',
|
||||||
na.rm = FALSE
|
na.rm = FALSE
|
||||||
){
|
){
|
||||||
|
@ -71,12 +71,7 @@ selectFromDB <- function( queryIn = jsonlite::toJSON(
|
||||||
# defined by the environment variables above.
|
# defined by the environment variables above.
|
||||||
rawData <- DBI::dbConnect(RPostgres::Postgres(), service = "seattleflu-production")
|
rawData <- DBI::dbConnect(RPostgres::Postgres(), service = "seattleflu-production")
|
||||||
|
|
||||||
db <- DBI::dbGetQuery(rawData, "select distinct * from shipping.incidence_model_observation_v1;")
|
db <- DBI::dbGetQuery(rawData, "select distinct * from shipping.incidence_model_observation_v2;")
|
||||||
|
|
||||||
# fake pathogen field until db is ready
|
|
||||||
# if (!('pathogen' %in% names(db))){
|
|
||||||
# db$pathogen <- 'unknown'
|
|
||||||
# }
|
|
||||||
|
|
||||||
# db <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.incidence_model_observation_v1 encounter',
|
# db <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.incidence_model_observation_v1 encounter',
|
||||||
# 'left join shipping.presence_absence_result_v1 taq',
|
# 'left join shipping.presence_absence_result_v1 taq',
|
||||||
|
@ -86,42 +81,35 @@ selectFromDB <- function( queryIn = jsonlite::toJSON(
|
||||||
# this logic should be substantially rethought, as I'm mixing sql and dplyr in confusing ways, but it will have to do for now!
|
# this logic should be substantially rethought, as I'm mixing sql and dplyr in confusing ways, but it will have to do for now!
|
||||||
|
|
||||||
# get all samples and nest
|
# get all samples and nest
|
||||||
db3 <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.presence_absence_result_v1',
|
db2 <- DBI::dbGetQuery(rawData, paste('select distinct * from shipping.presence_absence_result_v1',
|
||||||
';'),sep=' ')
|
';'),sep=' ')
|
||||||
|
|
||||||
names(db3)[names(db3)=='target'] <- 'pathogen'
|
names(db2)[names(db2)=='target'] <- 'pathogen'
|
||||||
|
|
||||||
db3 <- db3 %>% group_by(sample) %>%
|
# count pathogens found and tests performed
|
||||||
mutate(number_pathogens_found = sum(present), number_pathogens_tested = n()) %>%
|
db2 <- db2 %>% group_by(sample) %>%
|
||||||
filter(present == TRUE | number_pathogens_found==0) %>%
|
mutate(number_pathogens_found = sum(present), number_pathogens_tested = n())
|
||||||
group_by(sample,number_pathogens_found,number_pathogens_tested) %>%
|
|
||||||
tidyr::nest()
|
|
||||||
|
|
||||||
for (k in which(db3$number_pathogens_found==0)){
|
|
||||||
db3$data[[k]] <- tibble(pathogen='undetected',present=TRUE)
|
|
||||||
}
|
|
||||||
|
|
||||||
names(db3)[names(db3) == 'data'] <- 'pathogens_found'
|
# add in "undetected" pathogen for samples that were tested but had no detections
|
||||||
|
db3 <- db2 %>% group_by(sample) %>% filter(all(present == FALSE) & all(number_pathogens_tested>0)) %>%
|
||||||
|
summarize(pathogen = 'undetected') %>% mutate(present=TRUE)
|
||||||
|
|
||||||
|
# stack the "undetected" rows under the positive results (row-bind, not a join)
|
||||||
|
db4 <- bind_rows(db2 %>% filter(present == TRUE), db3)
|
||||||
|
|
||||||
# join with encounter list, using nice formatting
|
# join with encounter list, using nice formatting
|
||||||
db <- db %>% left_join(db3)
|
db <- db %>% left_join(db4)
|
||||||
|
|
||||||
|
# put in "not_yet_tested" for samples with no test results
|
||||||
idx<-is.na(db$number_pathogens_tested)
|
idx<-is.na(db$number_pathogens_tested)
|
||||||
db$number_pathogens_tested[idx] <- 0
|
db$number_pathogens_tested[idx] <- 0
|
||||||
for (k in which(idx)){
|
db$pathogen[idx] <- 'not_yet_tested'
|
||||||
db$pathogens_found[[k]] <- tibble(pathogen='not_yet_tested',present=TRUE)
|
db$present[idx] <- TRUE
|
||||||
}
|
|
||||||
|
|
||||||
db <- db %>% tidyr::unnest() # nice flat file like simulated data, but with repeated encounters for multiple positives
|
|
||||||
|
|
||||||
## wacky thing in census tract
|
|
||||||
db$residence_census_tract <- sub('\\.0','',db$residence_census_tract)
|
|
||||||
db$work_census_tract <- sub('\\.0','',db$work_census_tract)
|
|
||||||
|
|
||||||
DBI::dbDisconnect(rawData)
|
DBI::dbDisconnect(rawData)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
print('unknown source database!')
|
print('unknown source database!')
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,7 +10,7 @@ selectFromDB(queryIn = jsonlite::toJSON(list(SELECT = list(COLUMN =
|
||||||
"residence_census_tract")), GROUP_BY = list(COLUMN =
|
"residence_census_tract")), GROUP_BY = list(COLUMN =
|
||||||
c("encountered_week", "residence_puma", "residence_census_tract")),
|
c("encountered_week", "residence_puma", "residence_census_tract")),
|
||||||
SUMMARIZE = list(COLUMN = "pathogen", IN = c("h1n1pdm")))),
|
SUMMARIZE = list(COLUMN = "pathogen", IN = c("h1n1pdm")))),
|
||||||
source = "simulated_data",
|
source = "production",
|
||||||
credentials_path = "/home/rstudio/seattle_flu", na.rm = FALSE)
|
credentials_path = "/home/rstudio/seattle_flu", na.rm = FALSE)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
|
|
Загрузка…
Ссылка в новой задаче