/* lerad.cpp - Anomaly rule learning algorithm and detector Copyright (C) 2002, Matt Mahoney. This program is distributed without warranty under terms of the GNU general public license. See http://www.gnu.org/licenses/gpl.txt Usage: lerad train.txt test.txt [seed] This program performs anomaly detection on the 1999 DARPA IDS data set. It reads input from 2 files, train.txt (attack-free data) and test.txt (data containing attacks). The input is free form (words separated by white space), consisting of: a number of attributes (m), m attribute names, and tuples containing m attributes. The first 4 attributes must be a date (in form mm/dd/yyyy), time (in form hh:mm:ss), and the 2 low bytes of the IP address of the target host (as 3 decimal digits with leading 0s). For instance: 23 Date Time DA1 DA0 DP SA3 SA2 SA1 SA0 SP DUR F1 F2 F3 LEN W1 W2 W3 W4 W5 W6 W7 W8 03/15/1999 08:00:18 113 105 25 196 037 037 158 1024 0 .S .AP .AF 1263 .^@EHLO 03/15/1999 08:00:35 112 050 23 172 016 016 204 1024 45 .S .AP .AF 389 .^@ .^C 03/15/1999 08:00:49 113 084 25 197 182 182 233 1026 0 .S .AP .AF 2165 .^@EHLO The output is a list of alarms in the format expected by EVAL3 and EVAL4, with fixed width fields as follows: 0 MM/DD/YYYY HH:MM:SS 172.016.DA1.DA0 S.SSSSSS #Comments where S.SSSSSS is a score from 0 to 9.999999. LERAD learns rules from train.txt. For instance, given the tuples A B C D 1 2 3 4 1 2 3 5 It might learn the following rules (output to rules.txt) 1 n=2 if then B=2 2 n=2 if A=1 then B=2 3 n=2 if A=1 C=3 then B=2 where n is the number of tuples satisfying the anteceedent. Then given the tuple, A B C D 1 3 4 5 then the anteceedent of the first 2 rules are satisfied, and they are updated: 1 n=3 if then B=2 3 2 n=3 if A=1 then B=2 3 3 n=2 if A=1 C=3 then B=2 (unchanged) If a tuple in test.txt satisfies the anteceedent but the consequent is not in the list, then an alarm is generated with an anomaly score proportional to tn/r, where t is the time (in number of tuples) since the last anomaly and r is the number of allowed values in the consequent (r=2 for rules 1 and 2, r=1 for rule 3). For instance, if the tuple below occurs 100 tuples later, A B C D 1 8 3 0 then since it violates rule 3, it generates an anomlay score of 100 * 2 / 1 = 200. The optional seed is for random number generation. If omitted, it defaults to 0 to produce repeatable results. */ #include #include #include #include #include #include #include #include #include #include using namespace std; const double LOG10=log(10); int m=0; // Global number of attributes typedef int Nominal; // Index of attribute value typedef map Val; // Set of allowed values typedef vector Anteceedent; // Conditions: 0=don't care, 1=cons. struct Consequent { // Consequent and counts of a rule int n; // Support int t; // Time of last anomaly Val val; // Allowed values Consequent(): n(0), t(0) {} Consequent(Nominal v): n(0), t(0) {val[v]=1;} int add(Nominal v) {++n; return ++val[v];} }; // A ruleset maps anteceedednts to consequents. // The anteceedent is m elements where 0 = *, 1 = ?, 2 or more is // the constrained value for the attribute. typedef map Ruleset; struct Rule: public Consequent { // Complete rule Anteceedent rule; // 0=* 1=? >1 = constraint int cons; // index of ? in rule vector ante; // Indexes of anteceedents >1 Rule(): cons(0) {} Rule(const Anteceedent& v, const Consequent& c): Consequent(c), rule(v) { cons=rule.size(); for (int i=0; i1) ante.push_back(i); } } bool match(const Anteceedent& a) const { // Rule applies to a? for (int i=0; inr2 || (nr1==nr2 && a.ante.size()3 ? atoi(argv[3]) : 0); // Translate Nominal to string, string to Nominal vector n2s; n2s.push_back("*"); n2s.push_back("?"); map s2n; s2n["*"]=0; s2n["?"]=1; // Read relation header cerr << "Reading train.txt\n"; ifstream in(argv[1]); if (!in) { cerr << argv[1] << " not found\n"; return 1; } in >> m; if (m<2) { cerr << "Only " << m << " attributes\n"; return 1; } vector attribute_names(m); string s; for (int i=0; i> s; attribute_names[i]=s; } // Read relation from train.txt vector relation; // [tuple][attribute] Anteceedent t(m); while (true) { int i; for (i=0; i>s; ++i) { Nominal n=s2n[s]; if (n==0) { n=n2s.size(); n2s.push_back(s); s2n[s]=n; } t[i]=n; } if (i==m) relation.push_back(t); else break; } if (relation.size()<2) { cerr << "Only " << relation.size() << " tuples\n"; return 1; } in.close(); // Select random samples from the relation for preliminary training vector samples; // Saved ascending random indexes of relation int r=min(int(relation.size()), 100); // Number of samples still needed for (int i=0; r>0 && i rules; // Sorted ruleset for (Ruleset::iterator p=ruleset.begin(); p!=ruleset.end(); ++p) { int k; for (k=0; kfirst[k]==1) break; if (k==m) { cerr << "oops, rule with missing ?\n"; return 1; } int i; for (i=0; ifirst[j]>=2 && p->first[j]!=relation[r][j]) break; if (j==m) // Rule applies? p->second.add(relation[r][k]); } rules.push_back(Rule(p->first, p->second)); } // Estimate n/r again so that each tuple attribute is predicted by // only the rule with the highest n/r from before cerr << "Removing duplicate rules from " << rules.size() << " rules\n"; sort(rules.begin(), rules.end()); vector > cover(samples.size()); // cover[i][j] is true // if there is a rule predicting attribute relation[sample[i]][j] for (int i=0; i int(relation.size()*9)) { // late anomaly, remove rule rules[i]=rules.back(); rules.pop_back(); --i; break; } } } } } // Print coverage cerr << "Coverage:"; for (int i=0; i1) out << " " << attribute_names[j] << "=" << n2s[rules[i].rule[j]]; } out << " then " << attribute_names[rules[i].cons] << " ="; for (Val::iterator q=rules[i].val.begin(); q!=rules[i].val.end(); ++q) out << " " << n2s[q->first]; out << "\n"; } } // Statistics int nsum=0, nrsum=0; for (int i=0; i> m1; if (m1!=m) { cerr << "train.txt width=" << m << " test.txt width=" << m1 << endl; return 1; } for (int i=0; i> s; } vector ta(rules.size()); // Time of last anomaly per rule while (in) { // Read tuple t ++now; int i; for (i=0; i> s; Nominal n=s2n[s]; if (n==0) { n=n2s.size(); n2s.push_back(s); s2n[s]=n; } t[i]=n; } if (!in) break; // Evaluate tuple t double score=0; // Total score of tuple double bs=0; // Highest scoring rule int brule=0; // Index of highest scoring rule for (int i=0; i0){ score1=double(now-rules[i].t)*rules[i].n/rules[i].val.size(); if (score1>bs) { brule=i; bs=score1; } score+=score1; rules[i].t=now; } } // Print anomaly double pct=0; if (score>0) { pct=100*bs/score; score=log(score)/LOG10-4.5; } if (score>0) { cout << " 0 " << n2s[t[0]] << " " << n2s[t[1]] << " 172.016." << n2s[t[2]] << "." << n2s[t[3]] << " " << int(score) << "." << setw(6) << setfill('0') << int(score*1000000)-int(score)*1000000 << " # " << setw(3) << brule+1 << " (" << setprecision(4) << pct << ")"; for (int i=0; i=1) { cout << " " << attribute_names[i]; if (rules[brule].rule[i]==1) cout << "?"; cout << "=" << n2s[t[i]]; } } cout << "\n"; } } cerr << s2n.size() << " strings in all data\n"; return 0; }