diff --git a/test/worker_classficiation.R b/test/workerClassification.R similarity index 100% rename from test/worker_classficiation.R rename to test/workerClassification.R diff --git a/test/worker_cluster.R b/test/workerCluster.R similarity index 95% rename from test/worker_cluster.R rename to test/workerCluster.R index e6b02b0..851fdbf 100644 --- a/test/worker_cluster.R +++ b/test/workerCluster.R @@ -21,25 +21,25 @@ if (rxGetComputeContext()@description == "dopar") { # make a function to do clustering of given data set. clusterAnalysis <- function(data, numClusters) { - + xdf <- RxXdfData(data) - + # create formula. - + names <- rxGetVarNames(data=xdf) names <- names[!(names %in% c("Class", "Time"))] # the original data set is labelled so remove the label. formula <- as.formula(paste0("~", paste(names, collapse="+"))) - + # to scale data. - + df <- rxImport(xdf, varsToDrop=c("Time", "Class")) df <- as.data.frame(scale(df)) - + clusters <- rxKmeans(formula, df, numClusters=numClusters) - + clusters$cluster } @@ -49,4 +49,4 @@ results <- rxExec(FUN=clusterAnalysis, data="data.xdf", numClusters=rxElemArg(c(2:5))) -save(results, file="./results.RData") +save(results, file="./results.RData") \ No newline at end of file diff --git a/vignettes/60Kmeans.Rmd b/vignettes/60Kmeans.Rmd index e5c3c20..92cf720 100644 --- a/vignettes/60Kmeans.Rmd +++ b/vignettes/60Kmeans.Rmd @@ -167,7 +167,7 @@ data are labelled and so in our clustering analysis the label is removed. The R code for clustering is available from github as -[worker_cluster.R]{...test/worker_cluster.R}. The analysis basically +[workerCluster.R]{...test/workerCluster.R}. The analysis basically normalises the credit transaction data and then performs 10 repeated clustering analyses (targeting 2 clusters) for each using the k-means algorithm. 
The repetition is completed in parallel with the specified @@ -177,7 +177,7 @@ computing context information will be automatically added by the The script can then be saved and later on path to the script is used as reference. For example, in this demo, the script is saved with name -"worker_cluster.R" which is located in the "/test" directory. +"workerCluster.R" which is located in the "/test" directory. The following code is to run the clustering analysis on a specified computing environment. This is achieved by setting computing @@ -212,7 +212,7 @@ AzureDSVM::executeScript(context=context, machines=machines, remote=master, user=USER, - script="./test/worker_cluster.R", + script="./test/workerCluster.R", master=master, slaves=slaves, computeContext="localParallel") @@ -244,7 +244,7 @@ AzureDSVM::executeScript(context=context, machines=machines, remote=master, user=USER, - script="./test/worker_cluster.R", + script="./test/workerCluster.R", master=master, slaves=slaves, computeContext="clusterParallel") diff --git a/vignettes/80ModelSelect.Rmd b/vignettes/80ModelSelect.Rmd index f87a18d..ca148b8 100644 --- a/vignettes/80ModelSelect.Rmd +++ b/vignettes/80ModelSelect.Rmd @@ -179,7 +179,7 @@ directly from [togaware]{https://access.togaware.com/creditcard.xdf} in XDF format. The data consists both normal and fraudulent transactions, which are indicated by the label "Class", and the problem is to detect a potential fraudulent transaction based on patterns "learnt" by the trained model. Codes of solving such a machine learning problem can be obtained from -[worker_classification.R]{...test/worker_classification.R}. The function `mlProcess` takes data, formula, and model specs as inputs. Considering scalability and performance efficiency, data of xdf format is used, which allows parallel computation outside memory. Area-under-curve is used as performance metric to evaluate quality of model. 
The function returns a model object (based on the training results) and evaluation result of the model. +[workerClassification.R]{...test/workerClassification.R}. The function `mlProcess` takes data, a formula, and model specs as inputs. For scalability and performance, data in XDF format is used, which allows parallel computation on data held on disk rather than in memory. Area under the curve (AUC) is used as the performance metric to evaluate the quality of the model. The function returns a model object (based on the training results) and the evaluation result of the model. Following shows snippets of the machine learning process. @@ -263,7 +263,7 @@ executeScript(context, machines=LDSVM, remote=VM_URL, user=USER, - script="./worker_classification.R", + script="./workerClassification.R", master=VM_URL, slaves=VM_URL, computeContext="localParallel") @@ -277,7 +277,7 @@ executeScript(context, machines=LDSVMS, remote=VMS_URL[1], user=USER, - script="./worker_classification.R", + script="./workerClassification.R", master=VMS_URL[1], slaves=VMS_URL[-1], computeContext="clusterParallel")