diff --git a/README.md b/README.md index af68e0d..8de8d2b 100644 --- a/README.md +++ b/README.md @@ -74,29 +74,34 @@ Which would result in the shiny server app running on port 3838. ## Instructions of use 1. Import time series CSV file. Assumed structure: -- date (`"%Y-%m-%d %H:%M:%S"`). TagAnomaly will attempt to infer the date from other patterns as well, using the *parsedate* package -- category (optional) +- date ("%Y-%m-%d %H:%M:%S") +- category - value -2. (Optional) Import raw data time series CSV file. - -If the original time series is an aggreation over time windows, this time series is the raw values themselves. This way we could dive deeper into an anomalous value and see what it is comprised of. +2. (Optional) Import raw data time series CSV file. If the original time series is an aggreation over time windows, this time series is the raw values themselves. This way we could dive deeper into an anomalous value and see what it is comprised of. Assumed structure: -- date (`"%Y-%m-%d %H:%M:%S"`). TagAnomaly will attempt to infer the date from other patterns as well, using the *parsedate* package -- category (optional) -- content +- date ("%Y-%m-%d %H:%M:%S") +- category +- value -2. Select category (optional, if exists) +2. Select category (if exists) 3. Select time range on slider -4. Select points on plot that look anomalous. -Optional (1): click on one time range on the table below the plot to see raw data on this time range -Optional (2): Open the `All Categories` tab to see how other time series behave on the same time range. -5. Once you decide that these are actual anomalies, save the resulting table to csv by clicking on `Download labels set` and continue to the next category. +4. Inspect your time series: +(1): click on one time range on the table below the plot to see raw data on this time range +(2): Open the "All Categories" tab to see how other time series behave on the same time range. + +4.Select points on plot that look anomalous. + +5. 
Click "Add selected points" to add the marked points to the candidate list. + +7. Once you decide that these are actual anomalies, save the resulting table to csv by clicking on "Download labels set" and continue to the next category. #### Current limitations/issues -It is currently impossible to have multiple selections on one plot. A workaround is to select one area, download the csv and select the next area. Each downloaded CSV has a random string so files won't override each other. Once labeling is finished, one option is to run the provided [prep_labels.py](https://github.com/Microsoft/TagAnomaly/blob/master/prep_labels.py) file in order to concatenate all of TagAnomaly's output file to one CSV. +Points added but not saved will be lost in case the date slider or categories are changed, hence it is difficult to save multiple points from a complex time series. Once all segments are labeled, one can run the provided [prep_labels.py](https://github.com/Microsoft/TagAnomaly/blob/master/prep_labels.py) file in order to concatenate all of TagAnomaly's output file to one CSV. + + # Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a diff --git a/assets/selected.png b/assets/selected.png index 1de6e3b..5962816 100644 Binary files a/assets/selected.png and b/assets/selected.png differ diff --git a/server.R b/server.R index 107056e..edccb71 100644 --- a/server.R +++ b/server.R @@ -5,7 +5,6 @@ library(parsedate) # for cases when the provided date wasn't in a specific patte library(DT) library(ggplot2) - ## Extend the max file upload size options(shiny.maxRequestSize=150*1024^2) @@ -13,7 +12,9 @@ options(shiny.maxRequestSize=150*1024^2) ## Shiny server function server <- function(input,output, session) { - ## Reactive values + ####---- Reactive values ---#### + + # A boolean checking if the provided dataset contains multiple categories or not. 
This affects the UI hasCategories <- reactiveVal(value = T,label='hasCategories') @@ -23,10 +24,46 @@ server <- function(input,output, session) { # Whether the first column holds a numeric value (TRUE), or a date value (FALSE) numericTimestamp <- reactiveVal(value = F,label = 'numericTimestamp') - ####---- Time-Series data handling ----#### + selectedPoints <- reactiveVal(value = data.frame(),label='selectedPoints') + brushed <- reactive({ + brushedPoints(getTimeFilteredCategoryDataset(), input$user_brush) + }) - ## Read CSV input file + #### Event observers #### + + # Update selected points when the user clicks 'Add' + observeEvent(input$add, { + selectedPoints(selectedPoints() %>% bind_rows(brushed())) + }) + + # Update selected points when the user clicks 'Remove' + observeEvent(input$delete, { + if (dim(selectedPoints())[1] > 0) { + selectedPoints(selectedPoints()%>% anti_join(brushed())) + } + }) + + ####---- Time-Series data injestion and handling ----#### + + + getDataset <- reactive({ + ## Get time-series dataset from file upload + + if(is.null(input$timeseriesfile)) return(NULL) + dataset <- tryReadFile() + + + validate( + need(nrow(dataset) > 0, "Input file is empty"), + need(('date' %in% names(dataset)),"date column not found. Consider renaming your timestamp column to date"), + need(('value' %in% names(dataset)),"value column not found. 
Consider renaming your value column to value") + + ) + dataset + }) + tryReadFile <- function() { + ## Read CSV input file from the user provided path out <- tryCatch( { read.csv(input$timeseriesfile$datapath,stringsAsFactors = F) @@ -46,6 +83,7 @@ server <- function(input,output, session) { } padMissingDates <- function(dataset,padValue = 0, timeSeriesGapValue){ + ## Interpolate missing time/date values category <- dataset[1,'category'] %>% unlist() pad <- data.frame(date = seq(from = min(dataset$date),to = max(dataset$date),by = timeSeriesGapValue)) @@ -59,24 +97,12 @@ server <- function(input,output, session) { dataset } - - ## Get time-series dataset from file upload - getDataset <- reactive({ - if(is.null(input$timeseriesfile)) return(NULL) - - dataset <- tryReadFile() - - validate( - need(nrow(dataset) > 0, "Input file is empty"), - need(('date' %in% names(dataset)),"date column not found. Consider renaming your timestamp column to date"), - need(('value' %in% names(dataset)),"value column not found. Consider renaming your value column to value") - - ) - dataset - }) + getTimeSeriesDataset <- reactive({ + ### Turn dataset into a time series by transforming the date column into POSIXct. + ### If dataset is numeric, turn numericTimestamp flag to TRUE. 
dataset <- getDataset() if(is.null(dataset)) return(NULL) @@ -122,8 +148,9 @@ server <- function(input,output, session) { dataset }) - ## Get a dataset for a specific category + getCategoryDataset <- reactive({ + ## Get a dataset for a specific category ts <- getTimeSeriesDataset() if(is.null(ts)) return(NULL) @@ -143,8 +170,9 @@ server <- function(input,output, session) { dataset }) - ## Get the entire dataset, filtered by the slider range + getTimeFilteredDataset <- reactive({ + ## Get the entire dataset, filtered by the slider range dataset <- getTimeSeriesDataset() if(is.null(dataset)) return(NULL) if(is.null(input$slider)) return(NULL) @@ -152,20 +180,26 @@ server <- function(input,output, session) { dataset %>% filter(date >= input$slider[1], date <= input$slider[2]) }) - ## Get category dataset, filtered by the slider range + getTimeFilteredCategoryDataset <- reactive({ + ## Get category dataset, filtered by the slider range dataset <- getCategoryDataset() if(is.null(dataset)) return(NULL) if(is.null(input$slider)) return(NULL) + + session$resetBrush("user_brush") + selectedPoints(data.frame()) dataset %>% filter(date >= input$slider[1], date <= input$slider[2]) }) ####---- Raw data handling ----#### - ## Get raw data (an additional dataset for which the time-series dataset is an aggregation) - ## See R/create_sample_data.R for a script that creates demo time-series and raw datasets + getRawData <- reactive({ + ## Get raw data (an additional dataset for which the time-series dataset is an aggregation) + ## See R/create_sample_data.R for a script that creates demo time-series and raw datasets + cate <- input$category if(is.null(input$rawfile)) return(NULL) @@ -192,8 +226,9 @@ server <- function(input,output, session) { raw }) - ## get raw data for a sample selected by the user + getRawDataForSample <- reactive({ + ## get raw data for a sample selected by the user lastclicked <- input$summaryTable_rows_selected if(is.null(lastclicked)) return(NULL) @@ 
-276,6 +311,7 @@ server <- function(input,output, session) { ## Select input for categories, based on the categories found in the time series dataset output$category <- renderUI({ + if(hasCategories()==TRUE){ req(input$timeseriesfile) dataset <- getTimeSeriesDataset() @@ -303,25 +339,16 @@ server <- function(input,output, session) { panel.grid.minor = element_blank(), text = element_text(size = 14)) + + + if (dim(selectedPoints())[1] > 0) { + g <- g + geom_point(aes(date, value), data = selectedPoints(), color = "red") + } + g },message = "Rendering plot...") }) - - - - - ## Capture the selected points on the graph - selectedPoints <- reactive({ - user_brush <- input$user_brush - pts <- brushedPoints(getTimeFilteredCategoryDataset(), user_brush, xvar = "date", yvar = "value") - if(is.null(pts)) return(NULL) - if(hasCategories()){ - pts %>% select(date, category, value) - } else { - pts %>% select(date, value) - } - }) - + ## Plot showing all categories output$allplot <- renderPlot({ @@ -458,6 +485,7 @@ server <- function(input,output, session) { ####---- Data output ----#### ## download selected points + output$mydownload <- downloadHandler( filename = function(){ random_string <- paste0(paste0(sample(LETTERS,2 , TRUE),collapse=''),sample(999, 1, TRUE), paste0(sample(LETTERS,2 , TRUE),collapse=''),collapse = '') @@ -467,9 +495,9 @@ server <- function(input,output, session) { } else{ paste0(gsub(".csv",replacement = "",input$timeseriesfile$name),'-',random_string,'-labels.csv') } - }, - content = function(file) { + }, content = function(file) { write.csv(selectedPoints(),file) } ) + } \ No newline at end of file diff --git a/ui.R b/ui.R index 5d555c9..0e69f8b 100644 --- a/ui.R +++ b/ui.R @@ -23,10 +23,11 @@ header <- dashboardHeader(title = 'Taganomaly - Anomaly detection labeling tool' - value
2. Select category (if exists)
3. Select time range on slider
- 4.Select points on plot that look anomalous.
-
Optional (1): click on one time range on the table below the plot to see raw data on this time range
-
Optional (2): Open the "All Categories" tab to see how other time series behave on the same time range.
-
5. Once you decide that these are actual anomalies, save the resulting table to csv by clicking on "Download labels set" and continue to the next category.
+ 5. Click "Add selected points" to add the marked points to the candidate list.
+
Optional (1): click on one time range on the table below the plot to see raw data on this time range
+
Optional (2): Open the "All Categories" tab to see how other time series behave on the same time range.
+ 7. Once you decide that these are actual anomalies, save the resulting table to csv by clicking on "Download labels set" and continue to the next category.
' ))) ) @@ -63,7 +64,9 @@ body <- dashboardBody( h2('Time Series for labeling:'), h5("Graph might take a few moments to load"), plotOutput("plot", brush = "user_brush"), - h2('Selected points:'), + actionButton("add", "Add selected points"), + actionButton("delete", "Remove selected points"), + h2('Currently marked points:'), dataTableOutput("summaryTable"), h2('Inspect raw data:'), h5('Select a point or more on the graph, then select a record on the \"Selected Points\" table to see raw data'),