/* tsad4.cpp - Time Series Anomaly Detector ver. 4. (C) 2004, Matt Mahoney. This is free software distributed under terms of the GNU General Public License, http://www.gnu.org/licenses/gpl.txt tsad4 assigns anomaly scores to time series data. It reads N samples training of a time series and records the path formed by the low-pass filtered data and its first and second derivatives in M dimensional space, scaled to fit a unit M-cube. The anomaly score of the remaining test data is the square of the distance to the closest point on the path. The path is represented by a piecewise linear approximation with K segments. The data may be undersampled by reading only every S samples. Usage: dir/b | tsad4 N C T K M S P > result For example: dir/b tek*.csv | tsad4 1000 2 5 20 3 12 1 1 The input is a list of training files and test files containing time series data formatted as columns of numbers separated by other characters such as spaces, tabs, or commas. Parameters are integers with the following meanings: N = number of training samples. All the rest are test samples. C = Input column number. The leftmost column is 1. All other columns are ignored. Default is 1. T = Filtering time constant (in samples). The input data x[i] is smoothed to y[i] by low pass filtering: y[i] = ((T-1) * y[i-1] + x[i]) / T. Default is 5. K = Number of vertices to approximate the training path. Default is N. M = Number of dimensions (data plus M-1 derivatives), range 1-4. R = State transitions to test. The state is the closest path segment. Normally the state advances when the next segment is closer. Setting R=1 allows skipping one or moving back one. R=2 tests random segments and jumps if closer. R=3 does both tests. R=4 tests every segment. R+8 is as above but approximates segments by moving them 1/4 the distance to the removed vertices. S = Read every S samples. Default is 1 (don't skip). N and T refer to the number of samples after skipping S-1 of every S. P = number of paths in the training data. Each path is N samples for a total of N*P training points. Default is 1. If P > 1 then the anomaly score is the square of the distance from the smallest M-dimensional box bounding the P nearest points to the test point in each of the paths. Each path is approximated to K vertices. tsad4 computes a model of N training points in M dimensions by piecewise linear approximation of the path formed by the input and its first M-1 derivatives (scaled to a unit M-cube) to K vertices. The anomaly score of a test point is the square of the distance to this approximated path. The file tsad4.lst is created containing a table of M+2 columns as follows: state a x dx ddx dddx For the first K rows, each (x,dx,ddx,dddx) point is a vertex in the segmented approximation, state is the vertex number (0 to K-1), a is the sample number (0 to N-1) corresponding to this vertex, x is the input (filtered), and dx, ddx, and dddx are the first 3 derivatives of x. For the test data (after N samples), (x,dx,ddx,dddx) is the test point, state is the current state, or closest path segment to the test point, and a is the anonaly score. Each point (x,dx,ddx,dddx) is low pass filtered twice with time constant T samples. The anomaly score is filtered once. This is shown schematically in Fig. 1. x dx ddx input -> LP -> LP --+--> D -> LP -> LP --+--> D -> LP -> LP -+ | | | V (col. 1) V (col. 3) V (col. 5) +-----------------------------------------------+ a | Anomaly Detector |-> LP --> +-----------------------------------------------+ (col. 2) Fig. 1. Anomaly detection configuration and output for M = 3. In Fig. 1, D is a differentiator, such that D(t) = x[t] - x[t-1], and LP is a low pass filter, such that LP(t) = ((T-1)LP(t-1) + x[t]) / T, where x[t] is the t'th input. Standard output prints a table of maximum and total anomaly scores for each test file. The maximum and total are after filtering once as shown in Fig. 1. This filter state is reset to 0 after each file. No data is output for the training portion of the file. To compile: g++ tsad4.cpp */ #include #include #include #include #include #include #include #include #include #include using namespace std; // functoid elapsedTime() returns time in seconds since the previous call class ElapsedTime { clock_t t; // time of last call public: ElapsedTime() { t=clock(); } double operator()() { // seconds since previous call clock_t c=clock(); double r=double(c-t)/CLOCKS_PER_SEC; t=c; return r; } } elapsedTime; // Read line from f into s until EOF bool getline(FILE* f, string& s) { s=""; if (!f) return false; int c; while((c=getc(f))!=EOF && c!='\n') s+=c; return c!=EOF || s!=""; } // Parse numbers from s, return number in column col >= 1 double split(const string& s, int col) { int b=0, e=0, i=1; // begin, end of number, column while (true) { while (bb) { if (i==col) return atof(s.substr(b, e-b).c_str()); } else return 0; b=e; ++i; } } // Square inline double sqr(double x) {return x*x;} // Globals int M = 3; // dimensions int R = 4; // flags: 1=test 4 segments, 2=random, 4=all, 8=adjust const int MMAX = 4; // max M double scale[MMAX] = {1,1,1,1}; // Model scale // Square of the distance from point A to point B in M dimensions double dist2(const double a[], const double b[]) { double d=0; for (int i=0; i0) { double n=0; for (int i=0; i1) t=1; d=0; // Result for (int i=0; ii < i < next->i Vertex(): err(0), i(0), prev(0), next(0) {} void compute_error(double x[][MMAX]) { if (prev && next) err = dist2(x[prev->i], x[next->i], x[i]) * sqrt(dist2(x[prev->i], x[next->i])); } }; Vertex* heap; // [N] with list endpoints at 0 and N-1 int size; // Heap size, N-2 shrinking down to K-2 // swap heap[i] and heap[j] maintaining links void exch(int i, int j) { if (i==j) return; if (heap[j].next == &heap[i]) swap(j, i); if (heap[i].next == &heap[j]) { int h=heap[i].prev - heap; // h - i - j - k becomes h - j - i - k int k=heap[j].next - heap; swap(heap[i], heap[j]); heap[h].next = &heap[j]; heap[j].next = &heap[i]; heap[i].next = &heap[k]; heap[k].prev = &heap[i]; heap[i].prev = &heap[j]; heap[j].prev = &heap[h]; } else { // i and j are not adjacent swap(heap[i], heap[j]); heap[i].prev->next = heap[i].next->prev = &heap[i]; heap[j].prev->next = heap[j].next->prev = &heap[j]; } } static inline int left(int i) {return i+i;} static inline int right(int i) {return i+i+1;} static inline int parent(int i) {return (unsigned int)i>>1;} // Ensure heap[i].err is smaller than children void heapify(int i) { assert(i>0 && i<=size); while (true) { int smallest=i; if (left(i)<=size && heap[left(i)].err < heap[smallest].err) smallest = left(i); if (right(i)<=size && heap[right(i)].err < heap[smallest].err) smallest = right(i); if (i==smallest) break; else { exch(i, smallest); i = smallest; } } } // Ensure heap[i].err is larger than parent void decrease_key(int i) { assert(i>0 && i<=size); while (i>1) { if (heap[i].err < heap[parent(i)].err) { exch(i, parent(i)); i = parent(i); } else break; } } // Endure heap[i].err is at the right level void fix(int i) { heapify(i); decrease_key(i); } public: void shrink(double x[][MMAX], int N, int K); void print(FILE* f, double x[][MMAX], int K) const; }; // Print the model void Shrinker::print(FILE* f, double x[][MMAX], int K) const { Vertex *p=heap; for (int i=0; ii); for (int j=0; jnext; } } void Shrinker::shrink(double x[][MMAX], int N, int K) { // Initialize doubly linked heap heap = new Vertex[N]; size = N-2; for (int i=0; i=1; --i) // make heap at 1..N-2 heapify(i); // Remove vertices while (size>0 && size>K-2) { exch(1, size--); // pop smallest err from heap Vertex& v = heap[size+1]; // Move neighbors toward removed vertex if (R&8) { double p[MMAX]; // closest point on path to removed vertex dist2(x[v.prev->i], x[v.next->i], x[v.i], p); for (int i=0; ii][i]+=d; x[v.next->i][i]+=d; } } // Unlink vertex from list v.prev->next = v.next; v.next->prev = v.prev; heapify(1); // Update err of neighbors of removed or adjusted vertices if (v.prev > heap) { v.prev->compute_error(x); fix(v.prev - heap); if ((R&8) && v.prev->prev > heap) { v.prev->prev->compute_error(x); fix(v.prev->prev - heap); } } if (v.next <= heap+size) { v.next->compute_error(x); fix(v.next - heap); if ((R&8) && v.next->next <= heap+size) { v.next->next->compute_error(x); fix(v.next->next - heap); } } } // Save back to x Vertex *p=heap; for (int i=0; ii; assert(j>=i && ji) memmove(x[i], x[j], M*sizeof(double)); p=p->next; } assert(!p); } int main(int argc, char **argv) { // Print help message if (argc<2) { printf("tsad4 v5 time series anomaly detector\n" "(C) 2004, Matt Mahoney. Free under GPL: http://www.gnu.org/licenses/gpl.txt\n" "\n" "Usage: dir/b | tsad4 N C T K M R S P\n" "\n" "Input: training and test files containing columns of numbers\n" "N = number of training samples per path; the rest is test\n" "C = column number to use (starting at 1); default is 1\n" "T = filter time constant; default is 5 samples\n" "K = number of vertices in model approximation; default is N\n" "M = number of dimensions (M-1 derivatives); range 1-4; default is 3\n" "R = state transitions to test: 1=prev and 2nd, 2=random, 4=all (default)\n" " R+8 as above but approximates segments 1/4 distance to removed points\n" "S = subsample rate; default is 1\n" "P = number of training paths, default is 1\n" "\n" "Output is maximum and total anomaly score per file\n" "tsad4.out is in M+2 columns: state, anomaly score, data, derivatives\n" "tsad4.mod lists the model vertices: time, M coordinates\n"); return 1; } elapsedTime(); // start timer // Input parameters const int N = atoi(argv[1]); // training set size const int C = argc>2 ? atoi(argv[2]) : 1; // column const double T = argc>3 ? atof(argv[3]) : 5.0; // filter time constant const int K = argc>4 ? atoi(argv[4]) : N; // model segments M = argc>5 ? atoi(argv[5]) : 3; // dimensions if (M>MMAX) M=MMAX; if (M<1) M=1; R = argc>6 ? atoi(argv[6]) : 4; // transitions to test const int S = argc>7 ? atoi(argv[7]) : 1; // sample rate const int P = argc>8 ? atoi(argv[8]) : 1; printf("N=%d C=%d T=%1.2f K=%d M=%d R=%d S=%d P=%d\n", N, C, T, K, M, R, S, P); // Open output FILE* out=fopen("tsad4.out", "w"); if (!out) { perror("tsad4.out"); exit(1); } // Read files string filename; double v[MMAX]={0}; // current value and M-1 derivatives double fv[MMAX]={0}; // filtered v double ffv[MMAX]={0}; // filtered fv double (*x)[MMAX] = new double[N*P][MMAX]; // training data, derivatives memset(x, 0, N*MMAX*P*sizeof(double)); double amax=0, asum=0, fa=0; // max, sum, filtered anomaly score const double DT=1-1/T; int i=0; // sample number vector state(P); // closest path segment to test data vector lo(M), hi(M); // bounding box of nearest paths to ffv double* p=new double[M]; // closest point on j'th path while (getline(stdin, filename)) { FILE* f=fopen(filename.c_str(), "r"); if (!f) perror(filename.c_str()); amax=asum=fa=0; fill(state.begin(), state.end(), 0); string s; while (getline(f, s)) { ++i; // Filter and take derivatives v[0]=split(s, C); for (int j=0; jxmax) xmax=x[k][j]; } if (xmax>xmin) scale[j]=1/(xmax-xmin); else scale[j]=0; printf("scale[%d] = %f (%f to %f)\n", j, scale[j], xmin, xmax); } printf("%1.2f sec. to scale %d dimensions to unit cube\n", elapsedTime(), M); // Approximate N*P vertices with K*P vertices FILE* mod=fopen("tsad4.mod", "w"); Shrinker sh; for (int j=0; jN*P) { // Find closest point on each of P paths, update state[P] for (int j=0; j0 // test previous state && (d=dist2((x+j*N)[state[j]-1], (x+j*N)[state[j]], ffv)) < dist) { newstate=state[j]-1; dist=d; } if ((R&2) && K>2) { // test random state int r=rand()%(K-1); if ((d=dist2((x+j*N)[r], (x+j*N)[r+1], ffv)) < dist) { dist=d; newstate=r; } } if (R&4) { // test all states newstate=state[j]; for (int k=0; khi[k]) hi[k]=p[k]; } } // Set p to closest point to ffv in box copy(ffv, ffv+M, p); for (int j=0; jhi[j]) p[j]=hi[j]; } // Anomaly score is distance squared from ffv to p dist=dist2(ffv, p); fa=fa*DT+dist*(1-DT); // Print test results fprintf(out, "%g", fa); for (int j=0; jamax) amax=fa; } // Subsample for (int j=1; j