/* PAQ3 - File archiver and compressor. (C) 2003, Matt Mahoney, mmahoney@cs.fit.edu This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation at http://www.gnu.org/licenses/gpl.txt or (at your option) any later version. This program is distributed without any warranty. USAGE To compress: PAQ3 archive file file... (1 or more file names), or or (MSDOS): dir/b | PAQ3 archive (read file names from input) or (UNIX): ls | PAQ3 archive To decompress: PAQ3 archive To list contents: more < archive Compression: The files listed are compressed and stored in the archive, which is created. The archive must not already exist. File names may specify a path, which is stored. If there are no file names on the command line, then PAQ3 prompts for them, reading until the first blank line or end of file. Decompression: No file names are specified. The archive must exist. If a path is stored, the file is extracted to the appropriate directory, which must exist. PAQ3 does not create directories. If the file to be extracted already exists, it is not replaced; rather it is compared with the archived file, and the offset of the first difference is reported. It is not possible to add, remove, or update files in an existing archive. If you want to do this, extract the files, delete the archive, and create a new archive with just the files you want. PAQ3 is an improved version of PAQ1SSE/PAQ2 by Serge Osnach, who added SSE to my PAQ1 (both available at http://cs.fit.edu/~mmahoney/compression/ ). PAQ3 uses an improved SSE implementation, and adds update exclusion to the NonstationaryPPM and WordModel models. */ #include #include #include #include #include #include #include #include #include using namespace std; const int MEM=6; // 0-8 = 1M, 2M ... 256M memory model. 6 = 64M // 8-32 bit unsigned types, adjust as appropriate typedef unsigned char U8; typedef unsigned short U16; typedef unsigned long U32; class U24 { // 24-bit unsigned int U8 b0, b1, b2; // Low, mid, high byte public: explicit U24(int x=0): b0(x), b1(x>>8), b2(x>>16) {} operator int() const {return (((b2<<8)|b1)<<8)|b0;} }; // 32-bit random number generator based on r(i) = r(i-24) ^ r(i-55) class Random { U32 table[55]; // Last 55 random values int i; // Index of current random value in table public: Random(); U32 operator()() { // Return 32-bit random number if (++i==55) i=0; if (i>=24) return table[i]^=table[i-24]; else return table[i]^=table[i+31]; } } rnd; Random::Random(): i(0) { for (int j=0; j<55; ++j) table[j]=314159265*j; for (int j=0; j<10000; ++j) operator()(); } /* Model interface. A Predictor is made up of a collection of various models, whose outputs are summed to yield a prediction. Methods: Model.predict(int& n0, int& n1) - Adds to counts n0 and n1 such that it predicts the next bit will be a 1 with probability n1/(n0+n1) and confidence n0+n1. Model.update(int y) - Appends bit y (0 or 1) to the model. */ class Model { public: virtual void predict(int& n0, int& n1) const = 0; virtual void update(int y) = 0; virtual ~Model() {} }; /* Hash table element base class. It contains an 8-bit checksum to detect collisions, and a priority() method which is used to control replacement when full by replacing the element with the lowest priority (0 = unused). The derived class should supply the data to be stored and override priority(). */ class HashElement { U8 ch; // Checksum public: HashElement(int c=0): ch(c) {} // Initialize the checksum, 0 = unused int checksum() const {return ch;} // Collision if not matched int priority() const {return ch!=0;} // Override: lowest replaced first }; /* 3 byte counter, shown for reference only. It implements a nonstationary pair of counters of 0s and 1s such that preference is given to recent history by discarding old bits. */ class Counter3: public HashElement { U8 n[2]; // n[y] is the counts of ys (0s or 1s) public: Counter3(int c=0): HashElement(c) {n[0]=n[1]=0;} int get0() const {return n[0];} // Return count of 0s int get1() const {return n[1];} // Return count of 1s int priority() const {return get0()+get1();} // For hash replacement void add(int y) { // Add y (0 or 1) to n[y] and age the opposite count if (n[y]<255) ++n[y]; if (n[1-y]>2) (n[1-y]/=2)+=1; } }; /* Approximately equivalent 2 byte counter implementing the above. The representable counts (n0, n1) are 0-10, 12, 14, 16, 20, 24, 28, 32, 48, 64, 128, 256, 512. Both counts are represented by a single 8-bit state. Counts larger than 10 are incremented probabilistically. Although it uses 1/3 less memory, it is 8% slower and gives 0.05% worse compression than the 3 byte counter. */ class Counter: public HashElement { U8 state; struct E { // State table entry U16 n0, n1; // Counts represented by state U8 s00, s01; // Next state on input 0 without/with probabilistic incr. U8 s10, s11; // Next state on input 1 U32 p0, p1; // Probability of increment x 2^32-1 on inputs 0, 1 }; static E table[244]; // State table public: Counter(int c=0): HashElement(c), state(0) {} int get0() const {return table[state].n0;} int get1() const {return table[state].n1;} int priority() const {return state;} void add(int y) { if (y) { if (state<94 || rnd() is a hash table of 2^N elements of type T (derived from HashElement) with linear search of M elements in case of collision. If all elements collide, then the one with the lowest .priority() is replaced. Hashtable[i] returns a T& indexed by the lower bits of i whose checksum matches the upper bits of i, creating or replacing if needed. */ template class Hashtable { private: T* table; // Array of 2^N+M elements Hashtable(const Hashtable&); // No copy Hashtable& operator=(const Hashtable&); // No assignment public: Hashtable(): table(new Counter[(1< inline T& Hashtable::operator[](U32 i) { const U32 lb=i&((1<>24; int bj=lb; int bget=1000000000; for (U32 j=lb; j counter0; // Counters for context lengths 0 and 1 vector counter1; Hashtable counter2; // for lengths 2 to N-1 Counter *cp[N]; // Pointers to current counters U32 hash[N]; // Hashes of last 0 to N-1 bytes public: inline void predict(int& c0, int& c1) const; // Add to counts of 0s and 1s inline void update(int y); // Append bit y (0 or 1) to model NonstationaryPPM(); int getc0() const {return c0;} }; NonstationaryPPM::NonstationaryPPM(): c0(1), c1(0), cn(1), counter0(256), counter1(65536) { for (int i=0; iget0()*wt; n1+=cp[i]->get1()*wt; } } // Add bit y (0 or 1) to model void NonstationaryPPM::update(int y) { // Count y by context for (int i=N-1; i>=0; --i) { cp[i]->add(y); if ((cp[i]->get0()+cp[i]->get1())*i > 100) { // Update exclusion cp[i]->add(y); break; } } // Store bit y cn+=cn+y; if (cn>=53) cn-=53; c0+=c0+y; if (c0>=256) { // Start new byte for (int i=N-1; i>0; --i) hash[i]=(hash[i-1]+c0)*987660757; c1=c0-256; c0=1; cn=1; } // Set up pointers to next counters cp[0]=&counter0[c0]; cp[1]=&counter1[c0+(c1<<8)]; for (int i=2; i buf; // Input buffer, wraps at end vector ptr; // Hash table of pointers U32 hash; // Hash of current context up to pos-1 int pos; // Element of buf where next bit will be stored int bpos; // Number of bits (0-7) stored at buf[pos] int begin; // Points to first matching byte (does not wrap) int end; // Points to last matching byte + 1, 0 if no match public: MatchModel(): buf(0x10000*(1<1000) wt=3000000; else wt*=wt*3; if ((buf[end]>>(7-bpos))&1) n1+=wt; else n0+=wt; } } // Append bit y to buf and check that it matches at the end. // After a byte is completed, compute a new hash and store it. // If there is no current match, search for one. void update(int y) { (buf[pos]<<=1)+=y; // Store bit ++bpos; if (end && (buf[end]>>(8-bpos))!=buf[pos]) // Does it match? begin=end=0; // no if (bpos==8) { // New byte bpos=0; hash=hash*(16*123456791)+buf[pos]+1; if (++pos==int(buf.size())) pos=0; if (end) ++end; else { // If no match, search for one U32 h=(hash^(hash>>16))&(ptr.size()-1); end=ptr[h]; if (end>0) { begin=end; int p=pos; while (begin>0 && p>0 && begin!=p+1 && buf[begin-1]==buf[p-1]) { --begin; --p; } } if (end==begin) // No match found begin=end=0; ptr[h]=U24(pos); } } } }; /* A WordModel predicts in the context of whole words (a-z, case insensitive) using a trigram model. */ class WordModel { private: U32 word1, word0; // Hash of previous and current word int ww1, ww0; // Word weights (lengths) Hashtable t1; // Model int c1; // Previous char, lower case int c0; // 0-7 bits of current char with leading 1 bit int c0h; // Small hash of c0 Counter *cp0, *cp1; // Points into t1 current context int lettercount; // For detecting English enum {THRESHOLD=5}; // lettercount threshold for English public: WordModel(): word1(0), word0(0), ww1(0), ww0(0), c1(0), c0(1), c0h(1), cp0(&t1[0]), cp1(&t1[0]), lettercount(0) {} void predict(int& n0, int& n1) const { if (lettercount>=THRESHOLD) { const int wt0=(ww0+1)*(ww0+1); n0+=cp0->get0()*wt0; n1+=cp0->get1()*wt0; const int wt1=(ww0+ww1+2)*(ww0+ww1+2); n0+=cp1->get0()*wt1; n1+=cp1->get1()*wt1; } } void update(int y) { if (lettercount>=THRESHOLD) { cp1->add(y); if (cp1->get0()+cp1->get1() < 100) cp0->add(y); } // Update contexts with next bit c0+=c0+y; c0h+=c0h+y; if (c0h>=59) c0h-=59; if (c0>=256) { c0-=256; if (lettercount>0 && (c0>127 || (c0<32 && c0!='\n' && c0!='\r' && c0!='\t'))) --lettercount; if (isalpha(c0)) { word0=(word0+tolower(c0)+1)*234577751*16; if (++ww0>8) ww0=8; if (lettercount=THRESHOLD) { U32 h=word0*123456791+c1*345689647+c0h+(c0<<24); cp0=&t1[h]; cp1=&t1[word1+h]; } } }; /* CyclicModel models data with fixed length records such as tables, databases, images, audio, binary numeric data, etc. It models runs of 0s or 1s in bit columns. A table is detected when an 8-bit pattern occurs 4 or more times in a row spaced by the same interval. The table ends after no repetition detection for the same number of bits as were in the table. */ class CyclicModel: public Model { struct E { int p, n, r; // Position of last match, number of matches, interval E(): p(0), n(0), r(0) {} }; vector cpos; // Table of repeat patterns by char int pos; // Current bit position in input int c0; // Last 8 bits int cycle; // Most likely number of columns int column; // Column number, 0 to cycle-1 int size; // Number of bits before the table expires, 0 to 3*cycle Hashtable t; // Context is last 8 bits in column vector t1; // Context is the column number only Counter *cp, *cp1; // Points to t, t1 public: CyclicModel(): cpos(256), pos(0), c0(1), cycle(0), column(0), size(0), t1(2048), cp(&t[0]), cp1(&t1[0]) {} void predict(int& n0, int& n1) const { if (cycle>0) { int wt=16; n0+=cp->get0()*wt; n1+=cp->get1()*wt; wt=4; n0+=cp1->get0()*wt; n1+=cp1->get1()*wt; } } void update(int y) { if (++column>=cycle) column=0; if (size>0 && --size==0) cycle=0; cp->add(y); cp1->add(y); c0=(c0+c0+y)&0xff; ++pos; E& e=cpos[c0]; if (e.p+e.r==pos) { ++e.n; if (e.n>3 && e.r>8 && e.r*e.n>size) { size=e.r*e.n; if (cycle!=e.r) { cycle=e.r; column=pos%cycle; } } } else { e.n=1; e.r=pos-e.p; } e.p=pos; int h=column*3+c0*876546821; cp=&t[h]; cp1=&t1[column&2047]; } }; // Secondary source encoder struct SSEContext { U8 c1, n; // Count of 1's, count of bits U16 p() const {return U32(65535)*(c1*64+1)/(n*64+2);} void update(int y) { if (y) ++c1; if (++n>254) { c1/=2; n/=2; } } }; const int SSE1=256*3*3*3*3, SSE2=32; // dimensions const int SSESCALE=1024/SSE2; SSEContext sse[SSE1][SSE2+1]; // [context][mapped probability] // Scale probability p (0 to 64K-1) into a context in the range 0 to 1K-1 class SSEMap { enum {N=4096}; U16 table[N]; public: int operator()(int p) const {return table[p/(65536/N)];} SSEMap(); } ssemap; // global functoid SSEMap::SSEMap() { for (int i=0; i1023) p=1023; if (p<0) p=0; table[i]=p; } } /* A Predictor predicts the next bit given the bits so far using a collection of models. Methods: p() returns probability of a 1 being the next bit, P(y = 1) as a 16 bit number (0 to 64K-1). update(y) updates the models with bit y (0 or 1) */ class Predictor { NonstationaryPPM m1; MatchModel m2; WordModel m3; CyclicModel m4; mutable int context, ssep; public: Predictor(); ~Predictor(); U16 p() const { int n0=1, n1=n0; context=m1.getc0(); m4.predict(n0, n1); context=context*3+(n0*2>n1)+(n0>n1*2); m2.predict(n0, n1); context=context*3+(n0*4>n1)+(n0>n1*4); m3.predict(n0, n1); context=context*3+(n0*8>n1)+(n0>n1*8); m1.predict(n0, n1); context=context*3+(n0*16>n1)+(n0>n1*16); int n=n0+n1; while (n>32767) { n1/=16; n/=16; } U16 pr=65535*n1/n; ssep=ssemap(pr); int wt=ssep%SSESCALE; int i=ssep/SSESCALE; return (((sse[context][i].p()*(SSESCALE-wt)+sse[context][i+1].p()*wt) /SSESCALE)*3+pr)/4; } void update(int y) { m1.update(y); m2.update(y); m3.update(y); m4.update(y); sse[context][ssep/SSESCALE].update(y); sse[context][ssep/SSESCALE+1].update(y); } }; Predictor::Predictor() { int N=4096; int oldp=SSE2+1; for (int i=N-1; i>=0; --i) { int p=(ssemap(i*65536/N)+SSESCALE/2)/SSESCALE; int n=1+N*N/((i+1)*(N-i)); if (n>254) n=254; int c1=(i*n+N/2)/N; for (int j=oldp-1; j>=p; --j) { for (int k=0; k=0x10000000) xmid+=(xdiff>>16)*p; else if (xdiff>=0x1000000) xmid+=((xdiff>>12)*p)>>4; else if (xdiff>=0x100000) xmid+=((xdiff>>8)*p)>>8; else if (xdiff>=0x10000) xmid+=((xdiff>>4)*p)>>12; else xmid+=(xdiff*p)>>16; // Update the range if (y) x2=xmid; else x1=xmid+1; predictor.update(y); // Shift equal MSB's out while (((x1^x2)&0xff000000)==0) { putc(x2>>24, archive); x1<<=8; x2=(x2<<8)+255; } } /* Decode one bit from the archive, splitting [x1, x2] as in the encoder and returning 1 or 0 depending on which subrange the archive point x is in. */ inline int Encoder::decode() { // Split the range const U32 p=predictor.p(); // Probability P(1) * 64K rounded down const U32 xdiff=x2-x1; U32 xmid=x1; // = x1+p*(x2-x1) multiply without overflow, round down if (xdiff>=0x10000000) xmid+=(xdiff>>16)*p; else if (xdiff>=0x1000000) xmid+=((xdiff>>12)*p)>>4; else if (xdiff>=0x100000) xmid+=((xdiff>>8)*p)>>8; else if (xdiff>=0x10000) xmid+=((xdiff>>4)*p)>>12; else xmid+=(xdiff*p)>>16; // Update the range int y=0; if (x<=xmid) { y=1; x2=xmid; } else x1=xmid+1; predictor.update(y); // Shift equal MSB's out while ((x1>>24)==(x2>>24)) { x1<<=8; x2=(x2<<8)+255; x=(x<<8)+getc(archive); } return y; } // Should be called when there is no more to compress void Encoder::flush() { // In COMPRESS mode, write out the remaining bytes of x, x1 < x < x2 if (mode==COMPRESS) { while (((x1^x2)&0xff000000)==0) { putc(x2>>24, archive); x1<<=8; x2=(x2<<8)+255; } putc(x2>>24, archive); // First unequal byte } } // Read one byte from encoder e int decompress(Encoder& e) { // Decompress 8 bits, MSB first int c=0; for (int i=0; i<8; ++i) c=c+c+e.decode(); return c; } // Write one byte c to encoder e void compress(Encoder& e, int c) { for (int i=7; i>=0; --i) e.encode((c>>i)&1); } // Fail if out of memory void handler() { printf("Out of memory\n"); exit(1); } // Read and return a line of input from FILE f (default stdin) up to // first control character except tab. Skips CR in CR LF. string getline(FILE* f=stdin) { int c; string result=""; while ((c=getc(f))!=EOF && (c>=32 || c=='\t')) result+=char(c); if (c=='\r') (void) getc(f); return result; } // User interface int main(int argc, char** argv) { set_new_handler(handler); // Check arguments if (argc<2) { printf( "PAQ3 file compressor/archiver, (C) 2003, Matt Mahoney, mmahoney@cs.fit.edu\n" "This program is free software distributed without warranty under the terms\n" "of the GNU General Public License, see http://www.gnu.org/licenses/gpl.txt\n" "\n" "To compress: PAQ3 archive filenames... (archive will be created)\n" " or (MSDOS): dir/b | PAQ3 archive (reads file names from input)\n" "To extract/compare: PAQ3 archive (does not clobber existing files)\n" "To view contents: more < archive\n"); return 1; } // File names and sizes from input or archive vector filename; // List of names vector filesize; // Size or -1 if error int start_time=clock(); int uncompressed_bytes=0, compressed_bytes=0; // Input, output sizes // Extract files FILE* archive=fopen(argv[1], "rb"); if (archive) { if (argc>2) { printf("File %s already exists\n", argv[1]); return 1; } printf("Extracting archive %s ...\n", argv[1]); // Read "PAQ3\r\n" at start of archive if (getline(archive) != "PAQ3") { printf("Archive file %s not in PAQ3 format\n", argv[1]); return 1; } // Read "size filename" in "%d\t%s\r\n" format while (true) { string s=getline(archive); if (s.size()>1) { filesize.push_back(atol(s.c_str())); string::iterator tab=find(s.begin(), s.end(), '\t'); if (tab!=s.end()) filename.push_back(string(tab+1, s.end())); else filename.push_back(""); } else break; } // Test end of header for "\f\0" { int c1=0, c2=0; if ((c1=getc(archive))!='\f' || (c2=getc(archive))!=0) { printf("%s: Bad PAQ3 header format %d %d\n", argv[1], c1, c2); return 1; } } // Extract files from archive data Encoder e(DECOMPRESS, archive); for (int i=0; i2) for (int i=2; i=0) fprintf(archive, "%ld\t%s\r\n", filesize[i], filename[i].c_str()); } putc(032, archive); // MSDOS EOF putc('\f', archive); putc(0, archive); // Write data Encoder e(COMPRESS, archive); long file_start=ftell(archive); for (int i=0; i=0) { uncompressed_bytes+=size; printf("%-23s %10ld -> ", filename[i].c_str(), size); FILE* f=fopen(filename[i].c_str(), "rb"); int c; for (long j=0; j0 && elapsed_time>0) { printf(" (%1.4f bpc, %1.2f%% at %1.0f KB/s)", compressed_bytes*8.0/uncompressed_bytes, compressed_bytes*100.0/uncompressed_bytes, uncompressed_bytes/(elapsed_time*1000.0)); } printf("\n"); return 0; }