/* tsad1.cpp - Time Series Anomaly Detector ver. 1 (C) 2004, Matt Mahoney. This is free software distributed under terms of the GNU General Public License, http://www.gnu.org/licenses/gpl.txt tsad1 assigns anomaly scores to time series data. Usage: dir/b | tsad1 n col sample The input is a list of training files and test files. The first n samples are considered training and the rest are test. The time series is column "col" starting at 0 (default 0). The model consists of points every "sample" (default 1). Larger values will make the program faster. About 1000 modeled points (n/sample = 1000) is reasonably fast. Each file should consist of columns of real numbers separated by other characters (e.g. spaces, tabs, or commas). The output is 2 columns of numbers (suitable for graphing) showing the data and anomaly score. Maximum and total anomaly scores (in that order) are printed to stderr for each file. To compile: g++ tsad1.cpp */ #include #include #include #include #include #include using namespace std; // Read line from f into s until EOF bool getline(FILE* f, string& s) { s=""; if (!f) return false; int c; while((c=getc(f))!=EOF && c!='\n') s+=c; return c!=EOF || s!=""; } // Parse numbers from s, return number in column col >= 0 double split(const string& s, int col) { int b=0, e=0, i=0; // begin, end of number, column while (true) { while (bb) { if (i==col) return atof(s.substr(b, e-b).c_str()); } else return 0; b=e; ++i; } } inline double sqr(double x) {return x*x;} int main(int argc, char **argv) { const int n=argc>1?atoi(argv[1]):1000000000; // number of training points const int col=argc>2?atoi(argv[2]):0; // input column to use const int sample=argc>3?atoi(argv[3]):1; // sample rate // Read files string filename; double v=0, dv1=0, dv=0, ddv=0, fv=0, ffv=0, ffv1=0, ffdv1=0, fdv=0, ffdv=0, fddv=0, ffddv=0; // f=filtered d=derivative 1=previous double vmin=0, vmax=0, dvmin=0, dvmax=0, ddvmin=0, ddvmax=0; // bounds vector x, dx, ddx; // trained data const double T=0.8, DT=0.8, DDT=0.8; // filtering time constants double amax=0, asum=0, fa=0; // max, sum, filtered anomaly score int i=0; // sample count (including skipped samples) while (getline(stdin, filename)) { FILE* f=fopen(filename.c_str(), "r"); if (!f) perror(filename.c_str()); amax=asum=0; string s; while (getline(f, s)) { if (i++ % sample) continue; // compute point (ffv, ffdv, ffddv) v=split(s, col); fv=fv*T+v*(1-T); // filtered input ffv1=ffv; ffv=ffv*T+fv*(1-T); // filtered again dv1=dv; // derivative of twice filtered data dv=ffv-ffv1; fdv=fdv*DT+dv*(1-DT); ffdv1=ffdv; ffdv=ffdv*DT+fdv*(1-DT); // derivative filtered twice again ddv=ffdv-ffdv1; fddv=fddv*DDT+ddv*(1-DDT); ffddv=ffddv*DDT+fddv*(1-DDT); // 2nd derivative filtered twice more if (i<=n) { // train by saving all points in path x.push_back(ffv); dx.push_back(ffdv); ddx.push_back(ffddv); } if (i>=n && i0) { // find bounds for scaling vmin=vmax=x[0]; dvmin=dvmax=dx[0]; ddvmin=ddvmax=ddx[0]; for (int i=1; ivmax) vmax=x[i]; if (dx[i]dvmax) dvmax=dx[i]; if (ddx[i]ddvmax) ddvmax=ddx[i]; } } if (i>1) { double dist=0; for (int j=0; jvmin) d+=sqr(ffv-x[j])/(vmax-vmin); if (dvmax>dvmin) d+=sqr(ffdv-dx[j])/(dvmax-dvmin); if (ddvmax>ddvmin) d+=sqr(ffddv-ddx[j])/(ddvmax-ddvmin); if (j==0 || damax) amax=dist; } } if (f) { // file anomaly score (max, sum) fclose(f); fprintf(stderr, "%-50s %12.6f %12.6f\n", filename.c_str(), amax, asum); } } fprintf(stderr, "%d samples\n", i); return 0; }