Merge pull request #55 from seattleflu/connect-to-real-data

Connect to real data
This commit is contained in:
Mike Famulare 2019-05-10 10:27:03 -07:00 коммит произвёл GitHub
Родитель 0c42821847 d3d035de20
Коммит 0dd76ac7da
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
13 изменённых файлов: 187 добавлений и 629 удалений

Просмотреть файл

@ -8,23 +8,74 @@ library(modelTestR)
library(dplyr)
library(ggplot2)
SRC <- 'production'
# SRC <- 'simulated_data'
## factors and levels to be modeled ##
#####################
### REAL DATA #######
#####################
db <- selectFromDB(queryIn= list(SELECT =c("*")), source = 'production')
db <- selectFromDB(queryIn= list(SELECT =c("*")), source = SRC)
pathogens <- c('all', unique(db$observedData$pathogen))
factors <- c('site_type','sex','flu_shot')
geoLevels <- list( seattle_geojson = c('residence_puma','residence_neighborhood_district_name','residence_cra_name','residence_census_tract'),
geoLevels <- list( seattle_geojson = c('residence_neighborhood_district_name','residence_cra_name'),
wa_geojson = c('residence_puma'), # census tract impossible due to memory limits
king_county_geojson = c('residence_puma','residence_census_tract')
king_county_geojson = c('residence_census_tract')
)
#####################################
###### timeseries models ############
#####################################
# number of subjects with pathogen and factor at residence location
for (SOURCE in names(geoLevels)){
for (PATHOGEN in pathogens){
for (GEO in geoLevels[[SOURCE]]){
queryIn <- list(
SELECT =list(COLUMN=c('pathogen', factors, GEO,'encountered_week')),
WHERE =list(COLUMN='pathogen', IN=PATHOGEN),
GROUP_BY =list(COLUMN=c(factors,GEO,"encountered_week")),
SUMMARIZE=list(COLUMN='pathogen', IN= PATHOGEN)
)
shp <- masterSpatialDB(shape_level = gsub('residence_','',GEO), source = SOURCE)
db <- expandDB( selectFromDB( queryIn, source=SRC, na.rm=TRUE ), shp=shp )
# training occassionaly segfaults on but it does not appear to be deterministic...
tryCatch(
{
db <- appendCatchmentModel(db,shp=shp, source=SRC, na.rm=TRUE )
modelDefinition <- latentFieldModel(db=db, shp=shp)
model <- modelTrainR(modelDefinition)
print(summary(model$inla))
saveModel(model)
dir.create('/home/rstudio/seattle_flu/plots/', showWarnings = FALSE)
fname <- paste('/home/rstudio/seattle_flu/plots/',paste(PATHOGEN,SOURCE,paste(factors,collapse='-'),GEO,'encountered_week',sep='-'),'.png',sep='')
png(filename = fname,width = 6, height = 5, units = "in", res = 300)
print(ggplot(model$latentField) +
geom_line(aes_string(x='encountered_week',y="modeled_intensity_mode", color=GEO,group =GEO)) +
# geom_ribbon(aes_string(x='encountered_week',ymin="modeled_intensity_lower_95_CI", ymax="modeled_intensity_upper_95_CI", fill=GEO,group =GEO),alpha=0.1) +
guides(color=FALSE) )
dev.off()
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")}
)
}
}
}
##############################
## time-independent maps #####
##############################
@ -44,7 +95,7 @@ for (SOURCE in names(geoLevels)){
shp <- masterSpatialDB(shape_level = gsub('residence_','',GEO), source = SOURCE)
db <- expandDB( selectFromDB( queryIn, source='production', na.rm=TRUE ), shp=shp )
db <- expandDB( selectFromDB( queryIn, source=SRC, na.rm=TRUE ), shp=shp )
modelDefinition <- smoothModel(db=db, shp=shp)
@ -74,28 +125,27 @@ for (SOURCE in names(geoLevels)){
}
}
# age-distributions by pathogen and factor
###########################################
# age-distributions by pathogen and factor#
###########################################
# eventually this should be multinomial models, but indepdendent binomial for now
### BROKEN BECAUSE NEED TO PROPOGATE AGE_RANGE from DATABASE THROUGH SYSTEM
for (PATHOGEN in pathogens){
for (FACTOR in factors){
queryIn <- list(
SELECT =list(COLUMN=c('pathogen',FACTOR,'age_range_fine_lower')),
GROUP_BY =list(COLUMN=c('pathogen',FACTOR,'age_range_fine_lower')),
SUMMARIZE=list(COLUMN='pathogen', IN= PATHOGEN)
WHERE =list(COLUMN='pathogen', IN= PATHOGEN),
GROUP_BY =list(COLUMN=c(FACTOR,'age_range_fine_lower')),
SUMMARIZE=list(COLUMN='pathogen', IN= 'all')
)
# queryIn <- list(
# SELECT =list(COLUMN=c(FACTOR,'age_range_fine_lower')),
# GROUP_BY =list(COLUMN=c(FACTOR,'age_range_fine_lower')),
# SUMMARIZE=list(COLUMN=FACTOR, IN= 'all')
# )
db<- selectFromDB( queryIn, source='production', na.rm=TRUE )
db<- selectFromDB( queryIn, source=SRC, na.rm=TRUE )
# get all ages denominator. I'm not sure how to implement this as single query..
tmp<-db$observedData %>% group_by_(.dots=c(FACTOR,'age_range_fine_lower')) %>% summarize(n=sum(n))
tmp <- selectFromDB( queryIn[c('SELECT','GROUP_BY','SUMMARIZE')], source=SRC, na.rm=TRUE )$observedData %>%
group_by_(.dots=c(FACTOR,'age_range_fine_lower')) %>% summarize(n=sum(n))
db$observedData <- db$observedData %>% select(-n) %>% left_join(tmp)
db <- expandDB(db)
@ -106,61 +156,25 @@ for (PATHOGEN in pathogens){
saveModel(model)
p1<-ggplot(model$modeledData) + geom_line(aes(x=age_range_fine_lower ,y=modeled_count_mode, group=pathogen)) +
geom_point(aes(x=age_range_fine_lower, y=positive, group=pathogen)) +
geom_ribbon(aes(x=age_range_fine_lower ,ymin=modeled_count_0_025quant, ymax=modeled_count_0_975quant , group=pathogen)) +
facet_wrap(FACTOR) +
ylim(c(0,2*max(model$modeledData$modeled_count_mode)))
if (model$modelDefinition$family[1]=='poisson'){
p1<-ggplot(model$modeledData) + geom_line(aes(x=age_range_fine_lower ,y=modeled_count_mode, group=pathogen)) +
geom_point(aes(x=age_range_fine_lower, y=positive, group=pathogen)) +
geom_ribbon(aes(x=age_range_fine_lower ,ymin=modeled_count_lower_95_CI, ymax=modeled_count_upper_95_CI , group=pathogen)) +
facet_wrap(FACTOR) +
ylim(c(0,2*max(model$modeledData$modeled_count_mode)))
} else if (model$modelDefinition$family[1]=='binomial'){
p1<-ggplot(model$modeledData) + geom_line(aes(x=age_range_fine_lower ,y=modeled_fraction_mode, group=pathogen)) +
geom_point(aes(x=age_range_fine_lower, y=positive/n, group=pathogen)) +
geom_ribbon(aes(x=age_range_fine_lower ,ymin=modeled_fraction_lower_95_CI, ymax=modeled_fraction_upper_95_CI , group=pathogen)) +
facet_wrap(FACTOR) +
ylim(c(0,1))
}
fname <- paste('/home/rstudio/seattle_flu/data/plots/',paste(PATHOGEN,FACTOR,'age_range_fine',sep='-'),'.png',sep='')
dir.create('/home/rstudio/seattle_flu/plots/', showWarnings = FALSE)
fname <- paste('/home/rstudio/seattle_flu/plots/',paste(PATHOGEN,FACTOR,'age_range_fine',sep='-'),'.png',sep='')
png(filename = fname,width = 6, height = 5, units = "in", res = 300)
print(p1)
dev.off()
}
}
#####################################
###### timeseries models ############
#####################################
# number of subjects with pathogen and factor at residence location
for (SOURCE in names(geoLevels)){
for (PATHOGEN in pathogens){
for (GEO in geoLevels[[SOURCE]]){
queryIn <- list(
SELECT =list(COLUMN=c('pathogen', factors, GEO,'encountered_week')),
WHERE =list(COLUMN='pathogen', IN=PATHOGEN),
GROUP_BY =list(COLUMN=c('pathogen',factors,GEO,"encountered_week")),
SUMMARIZE=list(COLUMN='pathogen', IN= PATHOGEN)
)
shp <- masterSpatialDB(shape_level = gsub('residence_','',GEO), source = SOURCE)
db <- expandDB( selectFromDB( queryIn, source='production', na.rm=TRUE ), shp=shp )
# training occassionaly segfaults on but it does not appear to be deterministic...
tryCatch(
{
model <- modelTrainR(modelDefinition)
db <- appendCatchmentModel(db,shp=shp, source='production', na.rm=TRUE )
modelDefinition <- latentFieldModel(db=db, shp=shp)
model <- modelTrainR(modelDefinition)
print(summary(model$inla))
saveModel(model)
fname <- paste('/home/rstudio/seattle_flu/data/plots/',paste(PATHOGEN,SOURCE,paste(factors,collapse='-'),GEO,'encountered_week',sep='-'),'.png',sep='')
png(filename = fname,width = 6, height = 5, units = "in", res = 300)
print(ggplot(model$latentField) + geom_line(aes_string(x='encountered_week',y="modeled_intensity_mode", color=GEO,group =GEO)) )
dev.off()
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")}
)
}
}
}

Просмотреть файл

@ -39,6 +39,11 @@ expandDB <- function( db = dbViewR::selectFromDB(),
# age
validColumnData$age = seq(0,90,by=1)
validColumnData$age_bin_fine_lower = c(0,2,seq(5,85,by=5))
validColumnData$age_bin_fine_upper = c(2,seq(5,90,by=5))
validColumnData$age_bin_coarse_lower = c(0,2,5,18,65,90)
validColumnData$age_bin_coarse_upper = c(2,5,18,65,90)
# age bin
if(any(grepl('age',names(db$observedData)))) {
@ -67,8 +72,7 @@ expandDB <- function( db = dbViewR::selectFromDB(),
validColumnData$work_city = validColumnData$work_city[validColumnData$work_city!='NA']
# factors (these don't get interpolated by the models, so we only want the valid levels for the dataset at hand)
factorNames <- names(db$observedData)[ !( (names(db$observedData) %in% c('n','positive')) |
grepl('age',names(db$observedData)) |
factorNames <- names(db$observedData)[ !( (names(db$observedData) %in% c('age','n','positive')) |
grepl('residence_',names(db$observedData)) |
grepl('work_',names(db$observedData)) |
grepl('encounter',names(db$observedData)) )]
@ -110,9 +114,18 @@ expandDB <- function( db = dbViewR::selectFromDB(),
db$observedData$time_row <- validColumnData$time_row[match(db$observedData$encountered_week,validColumnData$encountered_week)]
}
if(any(grepl('age_bin',names(db$observedData)))){
db$observedData$age_row <- validColumnData$age_row[match(db$observedData$age_bin,validColumnData$age_bin)]
}
# if(any(grepl('age_bin',names(db$observedData)))){
# db$observedData$age_row <- validColumnData$age_row[match(db$observedData$age_bin,validColumnData$age_bin)]
# }
if(any(grepl('age_range_fine_lower',names(db$observedData)))){
db$observedData$age_row <- validColumnData$age_range_fine_lower[match(db$observedData$age_range_fine_lower,validColumnData$age_range_fine_lower)]
} else if (any(grepl('age_range_fine_upper',names(db$observedData)))){
db$observedData$age_row <- validColumnData$age_range_fine_upper[match(db$observedData$age_range_fine_upper,validColumnData$age_range_fine_upper)]
} else if (any(grepl('age_range_coarse_lower',names(db$observedData)))){
db$observedData$age_row <- validColumnData$age_range_coarse_lower[match(db$observedData$age_range_coarse_lower,validColumnData$age_range_coarse_lower)]
} else if (any(grepl('age_range_coarse_upper',names(db$observedData)))){
db$observedData$age_row <- validColumnData$age_range_coarse_upper[match(db$observedData$age_range_coarse_upper,validColumnData$age_range_coarse_upper)]
}
return(db)
}

Просмотреть файл

@ -134,32 +134,34 @@ selectFromDB <- function( queryIn = jsonlite::toJSON(
}
}
}
# age bin mutations
# this is nasty! I need to transform to age bins for modeling, but I don't want to carry the MUTATE query around!
if(any(grepl('age',names(db)))){
db$age_bin <- floor(pmin(db$age,90))
queryList$GROUP_BY$COLUMN[queryList$GROUP_BY$COLUMN %in% 'age'] <- 'age_bin'
}
if('GROUP_BY' %in% names(queryList)){
db<- db %>% dplyr::group_by_(.dots=queryList$GROUP_BY$COLUMN)
}
if('SUMMARIZE' %in% names(queryList)){
if (queryList$SUMMARIZE$IN != 'all'){
summary_criteria <- lazyeval::interp(~sum(y %in% x), .values=list(y = as.name(queryList$SUMMARIZE$COLUMN), x = queryList$SUMMARIZE$IN))
} else {
summary_criteria <- lazyeval::interp(~n()) # must always output n and positive for downstream interpretation!
if('GROUP_BY' %in% names(queryList)){
db<- db %>% dplyr::group_by_(.dots=queryList$GROUP_BY$COLUMN)
}
db <- db %>% dplyr::summarise_(n = lazyeval::interp(~n()), positive = summary_criteria)
}
if('SUMMARIZE' %in% names(queryList)){
if (queryList$SUMMARIZE$IN != 'all'){
summary_criteria <- lazyeval::interp(~sum(y %in% x), .values=list(y = as.name(queryList$SUMMARIZE$COLUMN), x = queryList$SUMMARIZE$IN))
} else {
summary_criteria <- lazyeval::interp(~n()) # must always output n and positive for downstream interpretation!
}
db <- db %>% dplyr::summarise_(n = lazyeval::interp(~n()), positive = summary_criteria)
}
# "pathogen" column is required for incidenceMapR model definitions
if(!('pathogn' %in% queryList$GROUP_BY$COLUMN)){
if( 'pathogen' %in% queryList$WHERE$COLUMN){
db$pathogen <- queryList$WHERE$IN['pathogen' %in% queryList$WHERE$COLUMN]
} else {
db$pathogen<-'all'
}
}
}
# type harmonization
for( COLUMN in names(db)[names(db) %in% c('residence_census_tract','residence_cra_name','residence_puma','residence_neighborhood_district_name','residence_city',
'work_census_tract','work_cra_name','work_puma','work_neighborhood_district_name','work_city')]){
@ -167,10 +169,10 @@ selectFromDB <- function( queryIn = jsonlite::toJSON(
}
# drop rows with NA since incidenceMapR (INLA) will ignore them anyway
if(na.rm){
db <- db %>% replace(.=='NA', NA) %>% tidyr::drop_na()
}
if(na.rm){
db <- db %>% replace(.=='NA', NA) %>% tidyr::drop_na()
}
summarizedData <- list(observedData = db,queryList = c(queryList))
return(summarizedData)

Просмотреть файл

@ -21,10 +21,10 @@ library(dplyr)
## return subset
queryJSON <- jsonlite::toJSON(
list(
SELECT =list(COLUMN=c('individual','pathogen','encountered_date','site_type','sex','flu_shot','age','has_fever','has_cough','has_myalgia')),
SELECT =list(COLUMN=c('individual','pathogen','encountered_week','site_type','sex','flu_shot','age','has_fever','has_cough','has_myalgia')),
WHERE =list(COLUMN='pathogen', IN = c('h1n1pdm', 'h3n2')),
WHERE =list(COLUMN='encountered_date', BETWEEN = c(2019,2019.2)),
WHERE =list(COLUMN='sampling_location', IN='hospital')
WHERE =list(COLUMN='encountered_week', IN= c('2018-W52','2019-W01')),
WHERE =list(COLUMN='site_type', IN='hospital')
)
)
db <- selectFromDB( queryJSON )
@ -37,8 +37,8 @@ library(dplyr)
## return h1n1pdm summary by time and location
queryIn <- list(
SELECT =list(COLUMN=c('pathogen','encountered_date','residence_puma','residence_census_tract')),
GROUP_BY =list(COLUMN=c('encountered_date','residence_puma','residence_census_tract')),
SELECT =list(COLUMN=c('pathogen','encountered_week','residence_puma','residence_census_tract')),
GROUP_BY =list(COLUMN=c('encountered_week','residence_puma','residence_census_tract')),
SUMMARIZE=list(COLUMN='pathogen', IN= c('h1n1pdm'))
)
db <- selectFromDB( queryIn )
@ -128,8 +128,8 @@ names(db$observedData)
## return subset
queryJSON <- jsonlite::toJSON(
list(
SELECT =list(COLUMN=c('individual','encountered_date','site_type','sex','flu_shot','age')),
WHERE =list(COLUMN='encountered_date', BETWEEN = c('2019-01-01','2019-02-28')),
SELECT =list(COLUMN=c('individual','encountered_week','site_type','sex','flu_shot','age')),
WHERE =list(COLUMN='encountered_week', IN = c('2019-W04','2019-W05','2019-W06')),
WHERE =list(COLUMN='site_type', IN='childrensHospital')
)
)

Просмотреть файл

@ -1,259 +0,0 @@
### Example Requests
Incidence request
##### Curl
```
curl -X POST \
http://40.112.165.255/flu \
-H 'cache-control: no-cache' \
-H 'content-type: application/json' \
-H 'postman-token: aa46a1fe-9a61-225e-2416-94b321baa238' \
-d '{
"output":["incidence"],
"attributes":["Census_Tract","vax_status"],
"basis":["year"],
"values":["incidence_median","incidence_sd"]
}'
```
##### Node Request
```javascript
var request = require("request");
var options = { method: 'POST',
url: 'http://40.112.165.255/flu',
headers:
{ 'postman-token': 'ed0b5dd6-9eae-30de-a8eb-d6869dc0bf76',
'cache-control': 'no-cache',
'content-type': 'application/json' },
body:
{ output: [ 'incidence' ],
attributes: [ 'Census_Tract', 'vax_status' ],
basis: [ 'year' ],
values: [ 'incidence_median', 'incidence_sd' ] },
json: true };
request(options, function (error, response, body) {
if (error) throw new Error(error);
console.log(body);
});
```
##### Node Native
```javascript
var options = {
"method": "POST",
"hostname": "40.112.165.255",
"port": null,
"path": "/flu",
"headers": {
"content-type": "application/json",
"cache-control": "no-cache",
"postman-token": "619097ad-0469-3593-8f6c-938f1a8c6d7c"
}
};
var req = http.request(options, function (res) {
var chunks = [];
res.on("data", function (chunk) {
chunks.push(chunk);
});
res.on("end", function () {
var body = Buffer.concat(chunks);
console.log(body.toString());
});
});
req.write(JSON.stringify({ output: [ 'incidence' ],
attributes: [ 'Census_Tract', 'vax_status' ],
basis: [ 'year' ],
values: [ 'incidence_median', 'incidence_sd' ] }));
req.end();
```
##### javscript
```javascript
var data = JSON.stringify({
"output": [
"incidence"
],
"attributes": [
"Census_Tract",
"vax_status"
],
"basis": [
"year"
],
"values": [
"incidence_median",
"incidence_sd"
]
});
var xhr = new XMLHttpRequest();
xhr.withCredentials = true;
xhr.addEventListener("readystatechange", function () {
if (this.readyState === 4) {
console.log(this.responseText);
}
});
xhr.open("POST", "http://40.112.165.255/flu");
xhr.setRequestHeader("content-type", "application/json");
xhr.setRequestHeader("cache-control", "no-cache");
xhr.setRequestHeader("postman-token", "43420113-8008-3c53-9cc3-e6c8de9d8d55");
xhr.send(data);
```
##### Python
```python
import requests
url = "http://40.112.165.255/flu"
payload = "{\n\t\"output\":[\"incidence\"],\n\t\"attributes\":[\"Census_Tract\",\"vax_status\"],\n\t\"basis\":[\"year\"],\n\t\"values\":[\"incidence_median\",\"incidence_sd\"]\n}"
headers = {
'content-type': "application/json",
'cache-control': "no-cache",
'postman-token': "d3083842-39d5-c6c5-9332-59f9eadcb234"
}
response = requests.request("POST", url, data=payload, headers=headers)
print(response.text)
```
### Observed Request
##### Curl
```
curl -X POST \
http://40.112.165.255/flu \
-H 'cache-control: no-cache' \
-H 'content-type: application/json' \
-H 'postman-token: 42b2d09b-fb4a-8135-8993-f7f8e6fa308c' \
-d '{
"output": ["observed"],
"attributes": ["Census_Tract", "vax_status"],
"basis": ["year"],
"values": ["flu_count", "nsamples"]
}
```
##### Node Request
```javascript
var request = require("request");
var options = { method: 'POST',
url: 'http://40.112.165.255/flu',
headers:
{ 'postman-token': '7b6caf7d-a3c6-88cc-1306-249f88d152b3',
'cache-control': 'no-cache',
'content-type': 'application/json' },
body:
{ output: [ 'observed' ],
attributes: [ 'Census_Tract', 'vax_status' ],
basis: [ 'year' ],
values: [ 'flu_count', 'nsamples' ] },
json: true };
request(options, function (error, response, body) {
if (error) throw new Error(error);
console.log(body);
});
```
##### Node Native
```javascript
var http = require("http");
var options = {
"method": "POST",
"hostname": "40.112.165.255",
"port": null,
"path": "/flu",
"headers": {
"content-type": "application/json",
"cache-control": "no-cache",
"postman-token": "18f97827-f7f3-c58c-26f4-5e9a0c04ace2"
}
};
var req = http.request(options, function (res) {
var chunks = [];
res.on("data", function (chunk) {
chunks.push(chunk);
});
res.on("end", function () {
var body = Buffer.concat(chunks);
console.log(body.toString());
});
});
req.write(JSON.stringify({ output: [ 'observed' ],
attributes: [ 'Census_Tract', 'vax_status' ],
basis: [ 'year' ],
values: [ 'flu_count', 'nsamples' ] }));
req.end();
```
##### Javascript
```javascript
var data = JSON.stringify({
"output": [
"observed"
],
"attributes": [
"Census_Tract",
"vax_status"
],
"basis": [
"year"
],
"values": [
"flu_count",
"nsamples"
]
});
var xhr = new XMLHttpRequest();
xhr.withCredentials = true;
xhr.addEventListener("readystatechange", function () {
if (this.readyState === 4) {
console.log(this.responseText);
}
});
xhr.open("POST", "http://40.112.165.255/flu");
xhr.setRequestHeader("content-type", "application/json");
xhr.setRequestHeader("cache-control", "no-cache");
xhr.setRequestHeader("postman-token", "4d81fb6f-f40e-b108-7a42-da98ab8ddd40");
xhr.send(data);
```
##### Python
```python
import requests
url = "http://40.112.165.255/flu"
payload = "{\n\n \"output\": [\"observed\"],\n \"attributes\": [\"Census_Tract\", \"vax_status\"],\n \"basis\": [\"year\"],\n \"values\": [\"flu_count\", \"nsamples\"]\n\n}\n\n "
headers = {
'content-type': "application/json",
'cache-control': "no-cache",
'postman-token': "86c99e05-61c5-204a-3e58-53db81e23856"
}
response = requests.request("POST", url, data=payload, headers=headers)
print(response.text)
```

1
incidenceMapR/.gitignore поставляемый
Просмотреть файл

@ -1,2 +1,3 @@
.Rproj.user
.httr-oauth
plots

Просмотреть файл

@ -52,10 +52,9 @@ appendCatchmentModel <- function(db,shp = NULL, source='simulated_data', na.rm=T
catchmentModel <- modelTrainR(catchmentModelDefinition)
# append catchment as intercept covariate
db$observedData <- db$observedData %>% left_join(catchmentModel$modeledData %>% select(site_type, geo, modeled_count_0_5quant))
names(db$observedData)[names(db$observedData) %in% 'modeled_count_0_5quant'] <- 'catchment'
db$observedData <- db$observedData %>% left_join(catchmentModel$modeledData %>% select(site_type, geo, modeled_count_median))
names(db$observedData)[names(db$observedData) %in% 'modeled_count_median'] <- 'catchment'
db$observedData$catchment <- log(db$observedData$catchment) - mean(log(db$observedData$catchment))
# db$observedData$catchment <- db$observedData$catchment/sd(db$observedData$catchment)
return(db)
}

Просмотреть файл

@ -28,6 +28,9 @@ appendSmoothData <- function(model,modelDefinition){
# snake_case
names(modeledData) <- gsub('\\.','_',names(modeledData))
# clean quantile names
names(modeledData)[grepl('0_',names(modeledData))]<-paste(outputColName,c('lower_95_CI','median','upper_95_CI'),sep='_')
# pretty order
columns <- modelDefinition$queryList$GROUP_BY$COLUMN[modelDefinition$queryList$GROUP_BY$COLUMN %in% names(modeledData)]
modeledData <- modeledData %>% arrange_(.dots=columns)
@ -62,6 +65,47 @@ appendLatentFieldData <- function(model,modelDefinition){
# snake_case
names(latentField) <- gsub('\\.','_',names(latentField))
# filter out unwanted fields
latentField <- latentField
latentField <- latentField[!(names(latentField) %in% c('modeled_intensity_ID','modeled_intensity_kld'))]
# clean quantile names
names(latentField)[grepl('0_',names(latentField))]<-paste('modeled_intensity',c('lower_95_CI','median','upper_95_CI'),sep='_')
# apply link function
if (modelDefinition$family[1] == 'poisson'){
for( COLUMN in names(latentField)[grepl('modeled',names(latentField))]){
if (grepl('sd',COLUMN)){
latentField[[COLUMN]] <- exp(latentField$modeled_intensity_median + latentField$modeled_intensity_sd^2/2) * sqrt(exp(latentField$modeled_intensity_sd^2)-1)
} else if (grepl('mean',COLUMN)){
latentField[[COLUMN]] <- exp(latentField$modeled_intensity_median + latentField$modeled_intensity_sd^2/2)
} else {
latentField[[COLUMN]] <- exp(latentField[[COLUMN]])
}
}
} else if (modelDefinition$family[1] == 'binomial'){
for( COLUMN in names(latentField)[grepl('modeled',names(latentField))]){
if (grepl('sd',COLUMN)){
# TODO: transform marginals
tmp <- exp(latentField$modeled_intensity_mean + rnorm(1e5,sd=latentField$modeled_intensity_sd))
tmp <- tmp/(1+tmp)
latentField$modeled_intensity_sd<-sd(tmp)
} else if (grepl('mean',COLUMN)){
# TODO: transform marginals
tmp <- exp(latentField$modeled_intensity_mean + rnorm(1e5,sd=latentField$modeled_intensity_sd))
tmp <- tmp/(1+tmp)
latentField$modeled_intensity_mean<-mean(tmp)
} else {
latentField[[COLUMN]] <- exp(latentField[[COLUMN]])/(1+latentField[[COLUMN]])
}
}
}
# pretty order
columns <- modelDefinition$queryList$GROUP_BY$COLUMN[modelDefinition$queryList$GROUP_BY$COLUMN %in% names(latentField)]
latentField <- latentField %>% arrange_(.dots=columns)

Просмотреть файл

@ -39,7 +39,7 @@ latentFieldModel <- function(db , shp, family = NULL, neighborGraph = NULL){
# construct priors
hyper=list()
hyper$global <- list(prec = list( prior = "pc.prec", param = 1/10, alpha = 0.01))
hyper$local <- list(prec = list( prior = "pc.prec", param = 1/100, alpha = 0.01))
hyper$local <- list(prec = list( prior = "pc.prec", param = 1/200, alpha = 0.01))
hyper$age <- list(prec = list( prior = "pc.prec", param = 1, alpha = 0.01))
hyper$time <- list(prec = list( prior = "pc.prec", param = 1/50, alpha = 0.01))
@ -268,7 +268,7 @@ latentFieldModel <- function(db , shp, family = NULL, neighborGraph = NULL){
# get original values for linear combination categories
lc.colIdx <- (names(inputData) %in% db$queryList$GROUP_BY$COLUMN) & !(names(inputData) %in% validFactorNames)
lc.colIdx <- (names(inputData) %in% c('pathogen',db$queryList$GROUP_BY$COLUMN)) & !(names(inputData) %in% validFactorNames)
lc.data <- inputData[lc.rowIdx,lc.colIdx]
df <- data.frame(outcome = outcome, inputData, replicateIdx)

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -1,7 +1,4 @@
#' getHumanReadableModelIdFromModel: return human readable verion of model from query
library(logging)
#'
#' @param model INLA model object that will generatie id from
#'

Просмотреть файл

@ -1,81 +0,0 @@
# testModelTrainR
# script to test incidenceMapR package
library(dbViewR)
library(incidenceMapR)
library(modelTestR)
library(modelServR)
library(dplyr)
library(magrittr)
shp <- masterSpatialDB() # census-tract shapefiles
neighborGraph <- constructAdjacencyNetwork(shp)
# ggplot build eventually will be replaced by function ggplotSmoothSequential
library(ggplot2)
plotSettings <- ggplot() + theme_bw() + theme(panel.border = element_blank()) + xlab('')
###################################
#### timeseries latent field ######
###################################
geoLevels <- c('residence_puma5ce','residence_neighborhood_district_name','residence_cra_name','residence_neighborhood_district_name','residence_census_tract')
# geoLevels<-c('residence_puma5ce')
# geoLevels<-c('residence_neighborhood_district_name')
# geoLevels<-c('residence_cra_name')
# geoLevels <- c('residence_census_tract')
# get pathogen list
queryIn <- list(
SELECT =list(COLUMN=c('pathogen')),
GROUP_BY =list(COLUMN=c('pathogen')),
SUMMARIZE=list(COLUMN='pathogen', IN= 'all')
)
pathogens <- unique(selectFromDB( queryIn )$observedData$pathogen)
for(geo in geoLevels){
for( k in 1:length(pathogens)){
# query pathogen, time, levels you want to play with
queryIn <- list(
SELECT =list(COLUMN=c('encountered_week','pathogen','sampling_location','flu_shot',geo)),
WHERE =list(COLUMN=c('pathogen'), IN=pathogens[k]),
GROUP_BY =list(COLUMN=c('pathogen','encountered_week','sampling_location','flu_shot',geo)),
SUMMARIZE=list(COLUMN='pathogen', IN= pathogens[k])
)
db <- expandDB( db<-selectFromDB( queryIn ))
db <- appendCatchmentModel(db,shp=shp)
# build latent field model
modelDefinition <- latentFieldModel(db=db, shp=shp)
model <- modelTrainR(modelDefinition)
# model<-returnModel(db$queryList,format='model',type='latent_field',cloudDir='~/data')
print(summary(model$inla))
idx <- model$modeledData$sampling_location=='hospital' & model$modeledData$flu_shot==1
plot(log(model$modeledData$fitted_values_mode[idx]))
plot(model$latentField$latent_field_mode)
plot(model$latentField$latent_field_mode,
log(model$modeledData$fitted_values_mode[idx]))
saveModel(model, cloudDir = './data')
if (geo =='residence_census_tract'){
for(m in unique(model$modeledData$sampling_location)){
for (n in unique(model$modeledData$time_row)){
tmp<-list(modeledData = model$modeledData[model$modeledData$sampling_location==m & model$modeledData$time_row == n & model$modeledData$flu_shot==0,])
ggplotSmoothMap(tmp,shp,paste(m,n))
}
}
}
}
}