/* tsad3.cpp - Time Series Anomaly Detector ver. 3
(C) 2004, Matt Mahoney.
This is free software distributed under terms of the GNU General Public
License, http://www.gnu.org/licenses/gpl.txt

tsad3 assigns anomaly scores to time series data.  Usage:

  dir/b | tsad3 n col sample T1 T2 T3

The input is a list of training files and test files.  The first n samples
are considered training.  The time series is column "col" starting at 0
(default 0).  The sample rate defaults to 1 (use higher numbers to skip
every "sample" values for speed).  Time constants for 3 filters are
T1=5 (first 2 filters), T2=20, T3=100.

Each file should consist of columns of real numbers separated by
other characters (e.g. spaces, tabs, or commas).

The output is 4 columns of numbers (suitable for graphing by lgraph.cpp)
showing the first filtered input, anomaly score, and the last 2 filtered
values of the input.  Max and total anomaly scores per file are printed
to stderr.

To compile: g++ tsad3.cpp
*/

// NOTE(review): this file was recovered from a copy in which every span
// between '<' and '>' had been stripped (header names, template arguments,
// comparisons).  Places where the lost text had to be guessed are marked
// with NOTE(review) below; confirm them against a pristine tsad3.cpp.

// NOTE(review): the six original header names were lost; these cover every
// name the program uses.
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>
using namespace std;

// Read one line from f into s (without the trailing '\n').
// Returns false when f is null, or at EOF with nothing read; a final line
// that lacks a newline is still returned.
bool getline(FILE* f, string& s) {
  s = "";
  if (!f) return false;
  int c;
  while ((c = getc(f)) != EOF && c != '\n')
    s += static_cast<char>(c);
  return c != EOF || s != "";
}

// True if c can begin a numeric token.
// NOTE(review): the original character set was lost; digits, sign and
// decimal point are assumed.
static bool isNumberStart(char c) {
  return isdigit(static_cast<unsigned char>(c)) || c == '-' || c == '+' ||
         c == '.';
}

// True if c can continue a numeric token (adds exponent markers).
// NOTE(review): assumed, see isNumberStart.
static bool isNumberChar(char c) {
  return isNumberStart(c) || c == 'e' || c == 'E';
}

// Parse numbers from s, return number in column col >= 0.
// Columns are maximal runs of numeric characters separated by anything else.
// Returns 0 if the line has fewer than col+1 numeric columns.
double split(const string& s, int col) {
  const size_t len = s.size();
  size_t b = 0, e = 0;  // begin, end of the current number
  int i = 0;            // current column
  while (true) {
    while (b < len && !isNumberStart(s[b])) ++b;  // skip separators
    e = b;
    while (e < len && isNumberChar(s[e])) ++e;    // span the token
    if (e > b) {
      if (i == col) return atof(s.substr(b, e - b).c_str());
    }
    else
      return 0;  // ran out of columns
    b = e;
    ++i;
  }
}

inline double sqr(double x) {return x*x;}

// Convert the time-constant argument argv[idx] (in samples) to the filter
// decay factor 1-1/T.  Falls back to the given default when the argument is
// absent or not a positive number, which would otherwise divide by zero.
static double decayArg(int argc, char** argv, int idx, double fallback) {
  if (argc <= idx) return fallback;
  const double t = atof(argv[idx]);
  if (!(t > 0)) {
    fprintf(stderr, "tsad3: ignoring invalid time constant \"%s\"\n",
            argv[idx]);
    return fallback;
  }
  return 1 - 1 / t;
}

// Reads file names from stdin, low-pass filters the chosen column of each
// file, records the first n samples as the training set, and scores every
// sample by its range-normalized squared distance to the nearest training
// point in (ffv, fffv, ffffv) space.
int main(int argc, char **argv) {
  const int n = argc > 1 ? atoi(argv[1]) : 1000000000;  // training set size
  const int col = argc > 2 ? atoi(argv[2]) : 0;  // input column to use
  int sample = argc > 3 ? atoi(argv[3]) : 1;     // sample rate
  if (sample < 1) sample = 1;  // i % sample below must not divide by zero
  const double T1 = decayArg(argc, argv, 4, 0.8);  // filter time constants
  const double T2 = decayArg(argc, argv, 5, 0.95);
  const double T3 = decayArg(argc, argv, 6, 0.99);

  // Read files
  string filename;
  double v = 0, fv = 0, ffv = 0, fffv = 0, ffffv = 0;  // filtered values
  double vmin = 0, vmax = 0, dvmin = 0, dvmax = 0,
         ddvmin = 0, ddvmax = 0;  // range
  vector<double> x, dx, ddx;      // trained data
  double amax = 0, asum = 0;      // anomaly max, sum (per file)
  // NOTE(review): the original also declared "fa" (anomaly average); its use
  // fell inside a stripped span and is not reconstructed here.
  int i = 0;  // samples read so far, across all files
  while (getline(stdin, filename)) {
    FILE* f = fopen(filename.c_str(), "r");
    if (!f) perror(filename.c_str());  // getline(f, ...) then reads nothing
    amax = asum = 0;
    string s;
    while (getline(f, s)) {
      if (i++ % sample) continue;  // keep every "sample"-th line
      v = split(s, col);
      fv = fv * T1 + v * (1 - T1);
      ffv = ffv * T1 + fv * (1 - T1);
      fffv = fffv * T2 + ffv * (1 - T2);
      ffffv = ffffv * T3 + fffv * (1 - T3);

      if (i <= n) {  // train
        x.push_back(ffv);
        dx.push_back(fffv);
        ddx.push_back(ffffv);
      }

      // Find bounds once, on the first kept sample at or past the end of
      // training.
      // NOTE(review): the middle of this condition was stripped; the
      // "i<n+sample && x.size()>0" part is a reconstruction.
      if (i >= n && i < n + sample && x.size() > 0) {
        vmin = vmax = x[0];
        dvmin = dvmax = dx[0];
        ddvmin = ddvmax = ddx[0];
        for (size_t k = 1; k < x.size(); ++k) {
          if (x[k] < vmin) vmin = x[k];
          if (x[k] > vmax) vmax = x[k];
          if (dx[k] < dvmin) dvmin = dx[k];
          if (dx[k] > dvmax) dvmax = dx[k];
          if (ddx[k] < ddvmin) ddvmin = ddx[k];
          if (ddx[k] > ddvmax) ddvmax = ddx[k];
        }
      }

      if (i > 1) {
        // Anomaly score: smallest normalized squared distance to any
        // training point.  A dimension with an empty range contributes 0.
        double dist = 0;
        for (size_t j = 0; j < x.size(); ++j) {
          double d = 0;
          if (vmax > vmin) d += sqr(ffv - x[j]) / (vmax - vmin);
          if (dvmax > dvmin) d += sqr(fffv - dx[j]) / (dvmax - dvmin);
          if (ddvmax > ddvmin) d += sqr(ffffv - ddx[j]) / (ddvmax - ddvmin);
          if (j == 0 || d < dist) dist = d;
        }
        // NOTE(review): the original output statement was stripped.  The
        // four columns follow the header comment: filtered input, anomaly
        // score, last 2 filtered values.  Exact format/columns unconfirmed.
        printf("%f %f %f %f\n", ffv, dist, fffv, ffffv);
        asum += dist;
        if (dist > amax) amax = dist;
      }
    }
    if (f) {
      fclose(f);
      fprintf(stderr, "%-50s %12.6f %12.6f\n", filename.c_str(), amax, asum);
    }
  }
  fprintf(stderr, "%d samples\n", i);
  return 0;
}