/* phad.cpp - tcpdump anomaly detection for DARPA IDS evaluation.
Copyright (C) 2002, Matt Mahoney.
This program is distributed without warranty under terms of the GNU
general public license.  See http://www.gnu.org/licenses/gpl.txt

UNIX:  phad 1123200 in3* in4* in5* |sort +0.45 -r >phad.sim
MSDOS: phad 14400 in23 |sort /+46 /r >phad.sim

The first argument is the training time in seconds (e.g. 1123200 = 13 days
or 14400 = 4 hours).  The remaining arguments are tcpdump files in
chronological order.  Output is in unsorted .sim format for eval.cpp,
e.g. ID (0), date, time, victim IP address, and a score from 0 to 1.

 0 04/06/1999 08:59:16 172.016.112.194 0.631169 #

The Ethernet, IP, TCP, UDP, and ICMP packet headers are divided into
fields of 1-4 bytes.  During training, the set of possible values for
each field is recorded in a set of up to K contiguous clusters.  During
testing, an anomaly is detected if a field has a value not in one of the
clusters.  The score for such a field is tn/r, where the field was seen
n times in training, there were r anomalies in training, and it was
t seconds since the last anomaly was seen in this field.  The score
output is -0.6 + 0.1 log10 SUM tn/r, summed over the fields.
*/

// NOTE(review): each #include below has no header name in this copy of the
// file.  Text that sat between a '<' and the following '>' appears to have
// been removed here and at several places in main() (each one is marked with
// a NOTE(review) below).  The file cannot compile until that text is restored
// from a pristine copy; nothing has been guessed or filled in here.
#include
#include
#include
#include
#include
#include

// Convert 2-4 bytes to an unsigned integer, MSB first (p[0] is the most
// significant byte).
// NOTE(review): the shifts are evaluated in int.  In i4 a first byte >= 0x80
// does not fit a 32-bit int after the final shift; confirm the value is still
// what is intended on platforms where unsigned long is wider than int.
inline unsigned int i2(const unsigned char* p) {
  return (p[0]<<8)|p[1];
}
inline unsigned long i3(const unsigned char* p) {
  return (((p[0]<<8)|p[1])<<8)|p[2];
}
inline unsigned long i4(const unsigned char* p) {
  return (((((p[0]<<8)|p[1])<<8)|p[2])<<8)|p[3];
}

// Print the time (seconds since 1970) in a readable format:
// MM/DD/YYYY HH:MM:SS in local time, or "?" if localtime() fails.
void print_time(unsigned long seconds) {
  char s[30];
  time_t t=seconds;
  tm* local=localtime(&t);
  if (local) {
    strftime(s, 30, "%m/%d/%Y %H:%M:%S", local);
    printf("%s", s);
  }
  else
    printf("?");
}

// Print the time right now, prefixed by msg and a space, followed by a newline
void real_time(const char* msg) {
  time_t t=time(NULL);
  printf("%s ", msg);
  print_time(t);
  printf("\n");
}

// Parsed data packet
const int MAX_PACKET=1600;  // Ethernet + tcpdump header max size
static unsigned char data[MAX_PACKET];  // tcpdump header and packet

// Pointers to start of each header.  Field::value() returns 0 for a field
// whose header pointer is null.  `null` is initialised to 0 and is not
// assigned anywhere in the visible code.
const unsigned char *base=data, *ethernet=0, *ip=0, *icmp=0, *udp=0, *tcp=0,
  *tcp_option=0, *appl=0, *null=0;  // Pointers to start of each header

enum Format {DECIMAL, HEX, TIME, IP};  // For printing a field
const int K=32;  // Max number of clusters

// Print x in an appropriate format and length for a network packet field
//   DECIMAL: unsigned decimal
//   HEX:     'x' followed by two hex digits per byte, high byte first
//            (length is the number of bytes)
//   TIME:    via print_time()
//   IP:      four zero-padded 3-digit decimal octets, high byte first
// NOTE(review): the HEX and IP cases pass unsigned long expressions to the
// %02X and %03d conversions; the argument types do not match the
// conversions.
void print(unsigned long x, Format format, int length) {
  switch (format) {
    case DECIMAL:
      printf("%lu", x);
      break;
    case HEX: {
      putchar('x');
      for (int i=length*8-8; i>=0; i-=8)
        printf("%02X", (x>>i)&255);
      break;
    }
    case TIME:
      print_time(x);
      break;
    case IP:
      printf("%03d.%03d.%03d.%03d", (x>>24)&255, (x>>16)&255,
        (x>>8)&255, x&255);
  }
}

// A table of fields
struct Field {
  const char* name;  // Field name
  const unsigned char** p;  // Value is at (*p)[offset]
  int offset;
  int length;  // Number of bytes in field
  Format format;  // How it should be printed
  unsigned long n;  // Number of observations
  unsigned long r;  // Max number of different observed values
                    // (incremented in main() each time a cluster is inserted)
  unsigned long t;  // Time of last anomaly
  unsigned long vmin[K+1], vmax[K+1];  // Cluster bounds
  int k;  // Number of clusters, 0 to K

  // Current value of this field: the `length` bytes at (*p)+offset read MSB
  // first.  Returns 0 when the header pointer *p is null or when length is
  // not 1-4.
  unsigned long value() const {
    if (!*p) return 0;
    const unsigned char* dp = *p+offset;
    switch (length) {
      case 1: return *dp;
      case 2: return i2(dp);
      case 3: return i3(dp);
      case 4: return i4(dp);
    }
    return 0;
  }
};

// The monitored fields: {name, header pointer, offset, length, format}.
// The remaining members (n, r, t, vmin, vmax, k) have no initialiser here.
// NOTE(review): "ðernet" in the five Ethernet rows is presumably "&ethernet"
// with its first four characters mis-decoded; restore before compiling.
// NOTE(review): the label "UCP Src Port" sits among the UDP rows and is
// probably a typo for "UDP Src Port"; it is a runtime string and is left
// unchanged here.
Field field[] = {
  {"Time", &base, 0, 4, TIME},
  {"Ether Size", &base, 8, 4, DECIMAL},
  {"Ether Dest Hi", ðernet, 0, 3, HEX},
  {"Ether Dest Lo", ðernet, 3, 3, HEX},
  {"Ether Src Hi", ðernet, 6, 3, HEX},
  {"Ether Src Lo", ðernet, 9, 3, HEX},
  {"Ether Protocol", ðernet, 12, 2, HEX},
  {"IP Header Len", &ip, 0, 1, HEX},
  {"IP TOS", &ip, 1, 1, HEX},
  {"IP Length", &ip, 2, 2, DECIMAL},
  {"IP Frag ID", &ip, 4, 2, DECIMAL},
  {"IP Frag Ptr", &ip, 6, 2, HEX},
  {"IP TTL", &null, 8, 1, DECIMAL},  // Removed (reads through `null`)
  {"IP Protocol", &ip, 9, 1, DECIMAL},
  {"IP Checksum", &ip, 10, 2, HEX},
  {"IP Src", &ip, 12, 4, IP},
  {"IP Dest", &ip, 16, 4, IP},  // 16 (main() prints field[16] as the address)
  {"TCP Src Port", &tcp, 0, 2, DECIMAL},
  {"TCP Dest Port", &tcp, 2, 2, DECIMAL},
  {"TCP Seq", &tcp, 4, 4, DECIMAL},
  {"TCP Ack", &tcp, 8, 4, DECIMAL},
  {"TCP Header Len", &tcp, 12, 1, HEX},
  {"TCP Flg UAPRSF", &tcp, 13, 1, HEX},
  {"TCP Window Sz", &tcp, 14, 2, DECIMAL},
  {"TCP Checksum", &tcp, 16, 2, HEX},
  {"TCP URG Ptr", &tcp, 18, 2, DECIMAL},
  {"TCP Option", &tcp_option, 0, 4, HEX},
  {"UCP Src Port", &udp, 0, 2, DECIMAL},
  {"UDP Dest Port", &udp, 2, 2, DECIMAL},
  {"UDP Len", &udp, 4, 2, DECIMAL},
  {"UDP Checksum", &udp, 6, 2, HEX},
  {"ICMP Type", &icmp, 0, 1, DECIMAL},
  {"ICMP Code", &icmp, 1, 1, DECIMAL},
  {"ICMP Checksum", &icmp, 2, 2, HEX}
};

// Program entry.  argv[1] is the training time in seconds (read with atol
// when the first packet is seen); the file arguments start at argv[2].
// Prints one line per anomalous packet, then per-field statistics, the
// packet count and a "Done" timestamp.
// NOTE(review): main is declared without a return type.
// NOTE(review): the brace nesting below is a best-effort layout; several
// opening braces were in the text that is missing from this copy.
main(int argc, char** argv) {

  // Check for args
  // NOTE(review): execution continues after the usage text is printed.
  if (argc==1) {
    printf("Usage: ta7 training_time tcpDumpFiles...\n\n");
  }
  real_time("Starting");
  long packets=0;  // count

  // Training and test times, start of window
  unsigned long start_time=0, now=0, prev=0, test_time=0;  // after 1/1/1970

  // Read the tcpdump files
  // NOTE(review): text is missing between "argi" and "length2" on the next
  // line.  `f`, `length` and `length2` are used below but their declarations
  // are not visible; presumably the loop condition, the file open and the
  // packet-header read were here -- restore from the original.
  for (int argi=2; argilength2 || length2>MAX_PACKET-16) {
        fprintf(stderr, "%s: Bad packet length at %ld\n", argv[argi], ftell(f));
        break;
      }

      // Read the rest of the data (the packet bytes follow the 16 bytes
      // already held at the start of data[])
      if (fread(data+16, 1, length, f)!=length) break;  // End of file
      ++packets;

      // Get start time.  The first packet's timestamp fixes start_time, and
      // testing begins argv[1] seconds later.
      prev=now;
      now=i4(data);
      if (start_time==0) {
        start_time=now;
        test_time=now+atol(argv[1]);
      }

      // NOTE(review): text is missing between "prev" and "=0x60" on the next
      // line.  `ipheader` is used in the branches below without a visible
      // declaration in this scope; presumably the header pointers were set
      // up here -- restore from the original.
      if (now>=test_time && prev=0x60) {  // Header size
            tcp_option=tcp+20;
            appl=tcp+24;
          }
        }
        else if (ip[9]==17) {  // ip[9] is the "IP Protocol" byte
          udp=ip+ipheader;
          appl=udp+8;
        }
        else if (ip[9]==1) {
          icmp=ip+ipheader;
          appl=icmp+4;
        }
      }

      // Replace checksums with their computed values.  IP first
      if (ip) {
        int ipheader=(ip[0]&15)*4;  // Low 4 bits of the first byte, times 4
        unsigned long checksum=0;
        // NOTE(review): the loop condition and body are missing between "i"
        // and ">16)" on the next line.
        for (int i=0; i>16);
        // Store the low 16 bits over the "IP Checksum" field (offset 10)
        data[ip-base+11]=checksum;
        data[ip-base+10]=checksum>>8;
      }

      // UDP checksum is optional (0 if not computed)
      if (udp && i2(udp+6)) {
        int udplen=i2(udp+4);  // Length of UDP header and payload
        unsigned long checksum=17+udplen+i2(ip+12)+i2(ip+14)+i2(ip+16)
          +i2(ip+18);  // Pseudo header (protocol, length, source, dest)
        // NOTE(review): loop condition and body missing, as in the IP case.
        for (int i=0; i>16);
        data[udp-base+7]=checksum;
        data[udp-base+6]=checksum>>8;
      }

      // TCP checksum
      if (tcp) {
        int tcplen=i2(ip+2)-4*(ip[0]&15);  // "IP Length" minus IP header size
        unsigned long checksum=6+tcplen+i2(ip+12)+i2(ip+14)+i2(ip+16)
          +i2(ip+18);  // Checksum of pseudo header as in UDP
        // NOTE(review): loop condition and body missing between "i" and
        // ">16;" on the next line.
        for (int i=0; i>16;
        data[tcp-base+17]=checksum;
        data[tcp-base+16]=checksum>>8;
      }

      // ICMP checksum
      if (icmp) {
        int icmplen=i2(ip+2)-4*(ip[0]&15);  // "IP Length" minus IP header size
        unsigned long checksum=0;
        // NOTE(review): loop condition and body missing between "i" and
        // ">16;" on the next line.
        for (int i=0; i>16;
        data[icmp-base+3]=checksum;
        data[icmp-base+2]=checksum>>8;
      }

      // Process the fields
      double score=0;  // Anomaly score
      double bscore=0;  // Highest anomaly score of any field
      int bi=0;  // Field with highest score
      // NOTE(review): text is missing between "i" and "fi.vmax[mid]" on the
      // next line.  `fi`, `v`, `lo`, `hi` and `mid` are used below without
      // visible declarations; the surviving tail looks like the end of a
      // binary search over the clusters -- restore from the original.
      for (int i=0; ifi.vmax[mid]) lo=mid+1;
          else hi=mid;
        }

        // If not found and still training, insert cluster for v at lo
        // NOTE(review): text is missing between "now" and "lo; --j" on the
        // next line (the condition and the start of the shifting loop).
        if (nowlo; --j) {
            fi.vmin[j]=fi.vmin[j-1];
            fi.vmax[j]=fi.vmax[j-1];
          }
          fi.vmin[lo]=fi.vmax[lo]=v;  // New single-value cluster
          ++fi.k;
          ++fi.r;
          fi.t=now;

          // Merge adjacent clusters.  If full, merge the two closest.
          // NOTE(review): text is missing between "(lo" and "1) {" on the
          // next line.
          if (fi.k>=K || (lo>0 && v==fi.vmax[lo-1]+1) || (lo1) {
            int bj=0;  // First of pair to merge
            unsigned long bd=fi.vmin[1]-fi.vmax[0];  // Least dist found
            // NOTE(review): the next two lines are missing text in four
            // places: after "j=1; j", after "j=bj+1; j", after "|| v", and
            // before "0 && fi.r>0".  The search for the closest pair, the
            // merge itself and the head of the test-phase branch are not
            // visible.
            for (int j=1; j1) break;
            for (int j=bj+1; j0 && now>=test_time && (lo==fi.k || v0 && fi.r>0) {
            // Field score t*n/r: seconds since this field's last anomaly,
            // times observations, divided by r
            double sc=double(now-fi.t)*double(fi.n)/double(fi.r);
            score+=sc;
            if (sc>bscore) {
              bscore=sc;
              bi=i;
            }
          }
          fi.t=now;
        }
      }
      }

      // Print anomaly
      // percent is the share of the packet score owed to the top field.
      double percent=0;
      if (score>0) percent=100*bscore/score;
      score*=1e-6;
      if (score>1) {
        // 0.1*log10 of the scaled sum, i.e. -0.6 + 0.1 log10 of the raw sum
        score=0.1*log(score)/log(10);
        bscore=0.1*log(bscore)/log(10);  // Not read again in the visible code
        printf(" 0 ");
        print(now, TIME, 4);
        printf(" ");
        print(field[16].value(), IP, 4);  // field[16] is "IP Dest"
        printf(" %8.6f # %s=", score, field[bi].name);
        print(field[bi].value(), field[bi].format, field[bi].length);
        printf(" %1.0f%%\n", percent);
      }
    }
  }

  // Print stats
  // NOTE(review): text is missing between "i" and "4)" on the next line
  // (the per-field loop header, whatever is printed per field, and the start
  // of the per-cluster loop).  The surviving tail prints "...(k)" once and
  // otherwise the bounds of the first three clusters and the last one.
  for (int i=0; i4) printf("...(%d)", fi.k);
      else if (i<3 || i==fi.k-1) {
        putchar(' ');
        print(fi.vmin[i], fi.format, fi.length);
        if (fi.vmax[i]>fi.vmin[i]) {  // Show a range only if it spans >1 value
          putchar('-');
          print(fi.vmax[i], fi.format, fi.length);
        }
      }
    }
    printf("\n");
  }
  printf("%ld packets\n", packets);
  real_time("Done");
  return 0;
}