/* PAQC - Compression program for Calgary challenge (C) 2004, Matt Mahoney, mmahoney@cs.fit.edu Distributed free under GPL, http://www.gnu.org/licenses/gpl.txt This program was used to create a winning archive in the Calgary challenge (http://mailcom.com/challenge/) of 645,667 bytes on Jan. 10, 2004. Create the archive as follows: paqc -1 v news bib book1 book2 paper1 paper2 progc progl progp trans paqc -2 w pic paqc -3 x geo paqc -3 y obj1 paqc -3 z obj2 The decompressor, d.cpp, is a stripped down version of this program, removing compresion code, some error checking, comments, and excess spaces. Either this program or d.cpp could be used to decompres the above files. The winning archive was produced by packing d.cpp and the above 5 files into a RAR archive, choosing best compression for d.cpp and storing the other files. PAQC is derived from PAQ6 by tuning it to the Calgary corpus. It differs mainly in the addition of a model for pic, selected by the -2 option, and the removal of models not important to the Calgary corpus. It can be used as a general compresor. The compression option should be -1 for text, -2 for CCITT images (216 bytes per scan line), and -3 for other binary files. Memory usage is fixed at 190 MB. Other minor differences: - CharModel is called AllModel (order 8 context), used for -1 and -3. - CounerMap allows sharing hash table space among multiple contexts. - RecordModel, ExeModel, and AnalogModel are removed. - SparseModel is now ObjModel, used for -3. - Mixer context is changed slightly. There is no need to distinguish text and binary in the mixer context as the option selects this. - There are 4X more pointers in the MatchModel hash table. Compression is about 4K better than PAQ6. Of this, 3K is due to PicModel and 1K due to the removal of extraneous models for text. The other changes result in only very small improvements. The 5 archive files by themselves total 639,567 bytes. The previous challenge was a tuned variant of SLIM 12 submitted in Nov. 2002 by Serge Voskoboynikov, 653,720 bytes as a HA archive containing an executable decompressor (d.exe) and 5 compressed files (named v,w,x,y,z but in a different order) using the same grouping. The file sizes are shown below. Note that SLIM compresses pic better by 6K and geo by 1K. PAQC gets better compression mainly due to the sparse models (ObjModel) for obj1 and obj2, and improved text compression. Also, source code submissions were not allowed at the time of the SLIM submission. I found that RAR compresses d.cpp about 1K smaller than HA. SLIM PAQC ------ ------ Archive HA RAR Decompressor 17920 25671 (compressed) 9402 5871 text (10 files) 512519 507426 pic 20478 26072 geo 44051 45346 obj1 8698 8154 obj2 58426 52569 archive header 146 229 ----- ------ ------ Total 653720 645667 I should note that SLIM has improved greatly in the last year, now giving better text compression than PAQC (506906 on the 10 concatenated files), so I would not be surprised if it soon regains the lead. Thanks to Serge Osnach for introducing me to SSE (in PAQ1SSE/PAQ2) and the sparse models (PAQ3N). Also, credit to Eugene Shelwein, Dmitry Shkarin for suggestions on using multiple character SSE contexts. Credit to Eugene, Serge, and Jason Schmidt for developing faster and smaller executables of previous versions. Credit to Werner Bergmans and Berto Destasio for testing and evaluating them, including modifications that improve compression at the cost of more memory. Credit to Alexander Ratushnyak who found a bug in PAQ4 decompression, and also in PAQ6 decompression for very small files (both fixed). Thanks to Berto for tuning PAQ5, including revised counter state tables, resulting in PAQ6. Thanks to Jason for suggesting increasing PSCALE. Thanks to Fabio Buffoni who pointed out some optimizations in the mixer. */ #define PROGNAME "PAQC" // Please change this if you change the program #define hash ___hash // To avoid Digital MARS name collision #include #include #include #include #include #include #include #include #include #include #undef hash using namespace std; const int PSCALE=4096; // Integer scale for representing probabilities int MEM=1; // 1=text, 2=pic, 3=other binary (doesn't affect memory usage) template inline int size(const T& t) {return t.size();} // 8-32 bit unsigned types, adjust as appropriate typedef unsigned char U8; typedef unsigned short U16; typedef unsigned long U32; // Fail if out of memory void handler() { printf("Out of memory\n"); exit(1); } // A ProgramChecker verifies some environmental assumptions and sets the // out of memory handler. It also gets the program starting time. // The global object programChecker should be initialized before any // other global objects. class ProgramChecker { clock_t start; public: ProgramChecker() { start=clock(); set_new_handler(handler); // Test the compiler for common but not guaranteed assumptions assert(sizeof(U8)==1); assert(sizeof(U16)==2); assert(sizeof(U32)==4); assert(sizeof(int)==4); } clock_t start_time() const {return start;} // When the program started } programChecker; //////////////////////////// rnd //////////////////////////// // 32-bit random number generator based on r(i) = r(i-24) ^ r(i-55) class Random { U32 table[55]; // Last 55 random values int i; // Index of current random value in table public: Random(); U32 operator()() { // Return 32-bit random number if (++i==55) i=0; if (i>=24) return table[i]^=table[i-24]; else return table[i]^=table[i+31]; } } rnd; Random::Random(): i(0) { // Seed the table table[0]=123456789; table[1]=987654321; for (int j=2; j<55; ++j) table[j]=table[j-1]*11+table[j-2]*19/16; } //////////////////////////// hash //////////////////////////// // Hash functoid, returns 32 bit hash of 1-4 chars class Hash { U32 table[8][256]; // Random number table public: Hash() { for (int i=7; i>=0; --i) for (int j=0; j<256; ++j) table[i][j]=rnd(); assert(table[0][255]==3610026313LU); } U32 operator()(U8 i0) const { return table[0][i0]; } U32 operator()(U8 i0, U8 i1) const { return table[0][i0]+table[1][i1]; } U32 operator()(U8 i0, U8 i1, U8 i2) const { return table[0][i0]+table[1][i1]+table[2][i2]; } U32 operator()(U8 i0, U8 i1, U8 i2, U8 i3) const { return table[0][i0]+table[1][i1]+table[2][i2]+table[3][i3]; } } hash; //////////////////////////// Counter //////////////////////////// /* A Counter represents a pair (n0, n1) of counts of 0 and 1 bits in a context. get0() -- returns p(0) with weight n = get0()+get1() get1() -- returns p(1) with weight n add(y) -- increments n_y, where y is 0 or 1 and decreases n_1-y priority() -- Returns a priority (n) for hash replacement such that higher numbers should be favored. */ class Counter { U8 state; struct E { // State table entry U16 n0, n1; // get0(), get1() U8 s00, s01; // Next state on input 0 without/with probabilistic incr. U8 s10, s11; // Next state on input 1 U32 p0, p1; // Probability of increment x 2^32 on inputs 0, 1 }; static E table[]; // State table public: Counter(): state(0) {} int get0() const {return table[state].n0;} int get1() const {return table[state].n1;} int priority() const {return get0()+get1();} void add(int y) { if (y) { if (state<208 || rnd() N ch.bpos() -- The number of bits (0-7) of the current partial byte at (0) ch[i] -- ch(pos()-i) ch.lo() -- Low order nibble so far (1-15 with leading 1) ch.hi() -- Previous nibble, 0-15 (no leading 1 bit) ch.pos(c) -- Position of the last occurrence of byte c (0-255) ch.pos(c, i) -- Position of the i'th to last occurrence, i = 0 to 3 */ class Ch { U32 N; // Buffer size U8 *buf; // [N] last N bytes U32 p; // pos() U32 bp; // bpos() U32 hi_nibble, lo_nibble; // hi(), lo() U32 lpos[256][4]; // pos(c, i) public: Ch(): N(0), buf(0), p(0), bp(0), hi_nibble(0), lo_nibble(1) { memset(lpos, 0, 256*4*sizeof(U32)); } void init() { N = 1 << 22; buf=(U8*)calloc(N, 1); if (!buf) handler(); buf[0]=1; } U32 operator()(int i) const {return buf[(p-i)&(N-1)];} U32 operator()() const {return buf[p&(N-1)];} void update(int y) { U8& r=buf[p&(N-1)]; r+=r+y; if (++bp==8) { lpos[r][3]=lpos[r][2]; lpos[r][2]=lpos[r][1]; lpos[r][1]=lpos[r][0]; lpos[r][0]=p; bp=0; ++p; buf[p&(N-1)]=1; } if ((lo_nibble+=lo_nibble+y)>=16) { hi_nibble=lo_nibble-16; lo_nibble=1; } } U32 pos() const {return p;} U32 pos(U8 c, int i=0) const {return lpos[c][i&3];} U32 bpos() const {return bp;} U32 operator[](int i) const {return buf[i&(N-1)];} U32 hi() const {return hi_nibble;} U32 lo() const {return lo_nibble;} } ch; // Global object //////////////////////////// Hashtable //////////////////////////// /* A Hashtable stores Counters. It is organized to minimize cache misses for 64-byte cache lines. The size is fixed at 2^n bytes. It uses LRU replacement for buckets of size 4, except that the next to oldest element is replaced if it has lower priority than the oldest. Each bucket represents 15 counters for a context on a half-byte boundary. Hashtable ht(n) -- Create hash table of 2^n bytes (15/16 of these are 1-byte Counters). ht.set(h, sel) -- Set major context selected by sel to h, a 32 bit hash of a context ending on a nibble (4-bit) boundary. 0 <= sel < M. ht(c, sel) -- Retrieve a reference to counter associated with partial nibble c (1-15) in context h[sel]. Normally there should be 4 calls to ht(c, sel) after each ht.set(h, sel). */ template class Hashtable { private: const U32 N; // log2 size in 16-byte elements struct HashElement { U8 checksum; // Checksum of context, used to detect collisions T c[15]; // 1-byte counters in minor context c HashElement(): checksum(0) {} }; HashElement *table; // [2^N] U32 cxt[MODELS]; // major contexts public: Hashtable(U32 n); // Uncomment this to print hash table usage statistics /* ~Hashtable() { int c1=0, c2=0; for (int i=0; i<(1<>10, h>>21, sel); // Search 4 elements for h within a 64-byte cache line const U8 checksum=(h>>24)^h; const U32 lo= (h>>(32-N)) & -4; const U32 hi=lo+4; U32 i; for (i=lo; i Hashtable::Hashtable(U32 n): N(n>4?n-4:1), table(0) { assert(sizeof(HashElement)==16); assert(sizeof(char)==1); assert(sizeof(cxt[0])==4); memset(cxt, 0, MODELS*4); // Align the hash table on a 64 byte cache page boundary char *p=(char*)calloc((16<0 && n<=MODELS); assert(c_>=0 && c_2000000000/PSCALE) sum/=4, n1/=4; assert(sum>0); return (PSCALE-1)*n1/sum; } // Adjust the weights by gradient descent to reduce cost of bit y void Mixer::update(int y) { U32 s0=0, s1=0; for (int i=0; i0 && s1>0) { const U32 s=s0+s1; const U32 sy=y?s1:s0; const U32 sy1=0xffffffff/sy+(rnd()&1023) >> 10; const U32 s1 =0xffffffff/s +(rnd()&1023) >> 10; for (int i=0; i> 8; wt[c][i]=min(65535, max(1, int(wt[c][i]+dw))); } } n=0; } Mixer::Mixer(int C_): C(C_), bc0(new U32[MODELS]), bc1(new U32[MODELS]), wt(new U32[C_][MODELS]), n(0), c(0) { for (int i=0; i> 5); if (MIX2) { U32 p2=m2.predict(ch(1)/32%4+ch(2)/32%4*4); return (p1+p2)>>1; } else return p1; } void update(int y) { m1.update(y); if (MIX2) m2.update(y); } U32 getC() const {return 256;} U32 getN() const {return m1.getN();} }; MultiMixer mixer; //////////////////////////// CounterMap //////////////////////////// /* CounterMap maintains a model and one context Countermap cm(N); -- Create, size 2^N bytes cm.update(h); -- Update model, then set next context hash to h cm.write(); -- Predict next bit and write counts to mixer cm.add(); -- Predict and add to previously written counts There should be 8 calls to either write() or add() between each update(h). h is a 32-bit hash of the context which should be set after a whole number of bytes are read. */ // Stores only the most recent byte and its count per context (run length) // in a hash table without collision detection class CounterMap1 { const int N; struct S { U8 c; // char U8 n; // count }; S* t; // cxt -> c repeated last n times U32 cxt[MODELS]; public: CounterMap1(int n): N(n>1?n-1:1) { memset(cxt, 0, MODELS*4); assert(sizeof(S)==2); t=(S*)calloc(1<> 32-N; } void add(U32 sel) { if ((U32)((t[cxt[sel]].c+256) >> 8-ch.bpos())==ch()) { if ((t[cxt[sel]].c >> 7-ch.bpos()) & 1) mixer.add(0, t[cxt[sel]].n); else mixer.add(t[cxt[sel]].n, 0); } } void write(U32 sel) { mixer.write(0, 0); add(sel); } }; // Uses a nibble-oriented hash table of contexts (counter state) class CounterMap2 { const U32 N2; // Size of ht2 in elements U32 cxt[MODELS]; // Major contexts Hashtable ht2; // Secondary hash table Counter* cp[MODELS][8]; // Pointers into ht2 or 0 if not used public: CounterMap2(int n); // Use 2^n bytes memory void add(U32 sel); void update(U32 h, U32 sel); void write(U32 sel) { mixer.write(0, 0); add(sel); } }; CounterMap2::CounterMap2(int n): N2(n), ht2(N2) { for (int i=0; iget0(), cp[sel][bcount]->get1()); } // After 8 predictions, update the models with the last input char, ch(1), // then set the new context hash to h void CounterMap2::update(U32 h, U32 sel) { assert(seladd((c>>(7-i))&1); cp[sel][i]=0; } } cxt[sel]=h; ht2.set(cxt[sel], sel); } // Combines 1 and 2 above. class CounterMap3 { enum {CM1=1}; // Use cm1 CounterMap1 cm1; CounterMap2 cm2; public: CounterMap3(int n): cm1(CM1 ? n-2 : 0), cm2(n) {} void update(U32 h, U32 sel=0) { if (CM1) cm1.update(h, sel); cm2.update(h, sel); } void write(U32 sel=0) { cm2.write(sel); if (CM1) cm1.add(sel); } void add(U32 sel=0) { cm2.add(sel); if (CM1) cm1.add(sel); } }; #define CounterMap CounterMap3 //////////////////////////// Model //////////////////////////// // All models have a function model() which updates the model with the // last bit of input (in ch) then writes probabilities for the following // bit into mixer. class Model { public: virtual void model() = 0; virtual ~Model() {} }; //////////////////////////// allModel //////////////////////////// // Order 8 context model plus one sparse model (1-3) and default (p = .5) class AllModel: public Model { enum {NC=8}; U32 cxt[NC]; // Order 0-7 contexts CounterMap t; public: AllModel(): t(26) { memset(cxt, 0, NC*4); } void model() { if (ch.bpos()==0) { // Update character contexts t.update(0, 0); for (int i=NC-1; i>0; --i) { cxt[i]=cxt[i-1]^hash(ch(1), i); t.update(cxt[i], i); } t.update(hash(ch(1), ch(3), NC), NC); } mixer.write(1, 1); // Default model for (int i=0; i= 8 bytes between the current context and previous input, and predicts the next bit in the previous context with weight n. If the next bit is 1, then the mixer is assigned (0, n), else (n, 0). Matchies are found using an index (a hash table of pointers into ch). */ class MatchModel: public Model { const int N; // 2^N = hash table size enum {M=4}; // Number of strings to match U32 hash[2]; // Hashes of current context up to pos-1 U32 begin[M]; // Points to first matching byte U32 end[M]; // Points to last matching byte + 1, 0 if no match U32 *ptr; // Hash table of pointers [2^(MEM+17)] public: MatchModel(): N(22), ptr(new U32[1 << N]) { memset(ptr, 0, (1 << N)*sizeof(U32)); hash[0]=hash[1]=0; for (int i=0; i> (32-N); if ((hash[0]>>28)==0) h=hash[1] >> (32-N); // 1/16 of 8-contexts are hashed to 32 bytes for (int i=0; i0) { begin[i]=end[i]; U32 p=ch.pos(); while (begin[i]>0 && p>0 && begin[i]!=p+1 && ch[begin[i]-1]==ch[p-1]) { --begin[i]; --p; } } if (end[i]==begin[i]) // No match found begin[i]=end[i]=0; break; } } ptr[h]=ch.pos(); } // Test whether the current context is valid in the last 0-7 bits for (int i=0; i> (8-ch.bpos())) != ch()) begin[i]=end[i]=0; } // Predict the bit found in the matching contexts int n0=0, n1=0; for (int i=0; i511) wt=511; int y=(ch[end[i]]>>(7-ch.bpos()))&1; if (y) n1+=wt; else n0+=wt; } } mixer.write(n0, n1); } //////////////////////////// wordModel //////////////////////////// // A WordModel models words, which are any characters > 32 separated // by whitespace ( <= 32), or any sequence of letters (lower case). // There is a unigram, bigram and sparse // bigram model (skipping 1 word) for each type of word. class WordModel: public Model { enum {N=3}; CounterMap t; U32 cxt[N]; // Hashes of last N words broken on whitespace U32 word[N]; // Hashes of last N words of letters only, lower case public: WordModel(): t(26) { for (int i=0; i32) { cxt[0]^=hash(cxt[0], c); } else if (cxt[0]) { for (int i=N-1; i>0; --i) cxt[i]=cxt[i-1]; cxt[0]=0; } if (isalpha(c) || c>=192) word[0]^=hash(word[0], tolower(c), 1); else { for (int i=N-1; i>0; --i) word[i]=word[i-1]; word[0]=0; } t.update(hash((1), 1)^cxt[0], 0); t.update(hash((1), 2)^cxt[1]+cxt[0], 1); t.update(hash((1), 3)^cxt[2]+cxt[0], 2); t.update(hash(ch(1), 4)^word[0], 3); t.update(hash(ch(1), 5)^word[1]+word[0], 4); t.update(hash(ch(1), 6)^word[2]+word[0], 5); } t.write(0); t.write(1); t.write(2); t.write(3); t.write(4); t.write(5); } } wordModel; //////////////////////////// TextModel //////////////////////////// // Model text (-1 option) class TextModel: public Model { public: void model() { allModel.model(); wordModel.model(); matchModel.model(); } } textModel; //////////////////////////// picModel //////////////////////////// class PicModel: public Model { CounterMap t; public: PicModel(): t(22) {} void model() { if (ch.bpos()==0) { t.update(hash(ch(216), ch(432), 0), 0); t.update(hash(ch(216), ch(1), 1), 1); t.update(hash(ch(216), ch(432), ch(648), 2), 2); t.update(hash(ch(216), ch(432), ch(1), 3), 3); t.update(hash(ch(216), ch(217), ch(1), 4), 4); t.update(hash(ch(216), 5), 5); t.update(hash(ch(1), 6), 6); t.update(hash(ch(1), ch(2), 7), 7); t.update(hash(ch(216), ch(215), ch(1), 8), 8); } for (int i=0; i<9; ++i) { t.write(i); } mixer.write(1, 1); } } picModel; //////////////////////////// ObjModel //////////////////////////// class ObjModel: public Model { CounterMap t; int k; public: ObjModel(): t(22), k(0) {} void model() { allModel.model(); matchModel.model(); if (ch.bpos()==0) { k=0; for (int i=1; i<5; ++i) { for (int j=i+1; j<9; ++j) { t.update(hash(ch(i), ch(j), k), k); ++k; } } } mixer.write(1, 1); for (int i=0; i254) { // Roll over count overflows c1/=2; n/=2; } } SSEContext(): c1(0), n(0) {} }; SSEContext (*sse)[SSE2+1]; // [SSE1][SSE2+1] context, mapped probability U32 nextp; // p() U32 ssep; // Output of sse U32 context; // SSE context public: Predictor(); int p() const {return nextp;} // Returns pr(y = 1) * PSCALE void update(int y); // Update model with bit y = 0 or 1 }; Predictor::SSEMap::SSEMap() { for (int i=0; i1023) p=1023; if (p<0) p=0; table[i]=p; } } Predictor::Predictor(): sse(0), nextp(PSCALE/2), ssep(512), context(0) { ch.init(); // Initialize to sse[context][ssemap(p)] = p sse=(SSEContext(*)[SSE2+1]) new SSEContext[SSE1][SSE2+1]; int N=PSCALE; int oldp=SSE2+1; for (int i=N-1; i>=0; --i) { int p=(ssemap(i*PSCALE/N)+SSESCALE/2)/SSESCALE; int n=1+N*N/((i+1)*(N-i)); if (n>254) n=254; int c1=(i*n+N/2)/N; for (int j=oldp-1; j>=p; --j) { for (int k=0; k=0x4000000) xmid+=(xdiff>>12)*p; else if (xdiff>=0x100000) xmid+=((xdiff>>6)*p)>>6; else xmid+=(xdiff*p)>>12; // Update the range if (y) x2=xmid; else x1=xmid+1; predictor.update(y); // Shift equal MSB's out while (((x1^x2)&0xff000000)==0) { putc(x2>>24, archive); x1<<=8; x2=(x2<<8)+255; } } /* Decode one bit from the archive, splitting [x1, x2] as in the encoder and returning 1 or 0 depending on which subrange the archive point x is in. */ inline int Encoder::decode() { // Split the range const U32 p=predictor.p()*(4096/PSCALE)+2048/PSCALE; // P(1) * 4K assert(p<4096); const U32 xdiff=x2-x1; U32 xmid=x1; // = x1+p*(x2-x1) multiply without overflow, round down if (xdiff>=0x4000000) xmid+=(xdiff>>12)*p; else if (xdiff>=0x100000) xmid+=((xdiff>>6)*p)>>6; else xmid+=(xdiff*p)>>12; // Update the range int y=0; if (x<=xmid) { y=1; x2=xmid; } else x1=xmid+1; predictor.update(y); // Shift equal MSB's out while (((x1^x2)&0xff000000)==0) { x1<<=8; x2=(x2<<8)+255; int c=getc(archive); if (c==EOF) c=0; x=(x<<8)+c; } return y; } // Should be called when there is no more to compress void Encoder::flush() { // In COMPRESS mode, write out the remaining bytes of x, x1 < x < x2 if (mode==COMPRESS) { while (((x1^x2)&0xff000000)==0) { putc(x2>>24, archive); x1<<=8; x2=(x2<<8)+255; } putc(x2>>24, archive); // First unequal byte } } //////////////////////////// Transformer //////////////////////////// /* A transformer compresses 1 byte at a time. It also provides a place to insert transforms or filters in the future. Transformer tf(COMPRESS, f) -- Initialize for compression to archive f which must be open in "wb" mode with the header already written Transformer tf(DECOMPRESS, f) -- Initialize for decompression from f which must be open in "rb" mode past the header tf.encode(c) -- Compress byte c c = tf.decode() -- Decompress byte c tf.flush() -- Should be called when compression is finished */ class Transformer { Encoder e; public: Transformer(Mode mode, FILE* f): e(mode, f) {} void encode(int c) { for (int i=7; i>=0; --i) e.encode((c>>i)&1); } U32 decode() { U32 c=0; for (int i=0; i<8; ++i) c=c+c+e.decode(); return c; } void flush() { e.flush(); } }; //////////////////////////// main //////////////////////////// // Read and return a line of input from FILE f (default stdin) up to // first control character except tab. Skips CR in CR LF. string getline(FILE* f=stdin) { int c; string result=""; while ((c=getc(f))!=EOF && (c>=32 || c=='\t')) result+=char(c); if (c=='\r') (void) getc(f); return result; } // User interface int main(int argc, char** argv) { // Check arguments if (argc<2) { printf( PROGNAME " file compressor/archiver, (C) 2004, Matt Mahoney, mmahoney@cs.fit.edu\n" "This program is free software distributed without warranty under the terms\n" "of the GNU General Public License, see http://www.gnu.org/licenses/gpl.txt\n" "\n" "To compress: " PROGNAME " -1 archive filenames... (archive will be created)\n" " or (MSDOS): dir/b | " PROGNAME " -1 archive (reads file names from input)\n" "To extract/compare: " PROGNAME " archive (does not clobber existing files)\n" "To view contents: more < archive\n" "\n" "Compression option: -1 for text (default), -2 for CCITT images (pic)\n" "or -3 for other binary files (geo, obj1, obj2)\n"); return 1; } // Read and remove -MEM option if (argc>1 && argv[1][0]=='-') { if (isdigit(argv[1][1]) && argv[1][2]==0) { MEM=argv[1][1]-'0'; } else printf("Option %s ignored\n", argv[1]); argc--; argv++; } if (MEM<1 || MEM>3) MEM=1; // File names and sizes from input or archive vector filename; // List of names vector filesize; // Size or -1 if error int uncompressed_bytes=0, compressed_bytes=0; // Input, output sizes // Extract files FILE* archive=fopen(argv[1], "rb"); if (archive) { if (argc>2) { printf("File %s already exists\n", argv[1]); return 1; } // Read PROGNAME " -m\r\n" at start of archive string s=getline(archive); if (s.substr(0, string(PROGNAME).size()) != PROGNAME) { printf("Archive file %s not in " PROGNAME " format\n", argv[1]); return 1; } // Get option -m where m is a digit if (s.size()>2 && s[s.size()-2]=='-') { int c=s[s.size()-1]; if (c>='0' && c<='9') MEM=c-'0'; } printf("Extracting archive " PROGNAME " -%d %s ...\n", MEM, argv[1]); // Read "size filename" in "%d\t%s\r\n" format while (true) { string s=getline(archive); if (s.size()>1) { filesize.push_back(atol(s.c_str())); string::iterator tab=find(s.begin(), s.end(), '\t'); if (tab!=s.end()) filename.push_back(string(tab+1, s.end())); else filename.push_back(""); } else break; } // Test end of header for "\f\0" { int c1=0, c2=0; if ((c1=getc(archive))!='\f' || (c2=getc(archive))!=0) { printf("%s: Bad " PROGNAME " header format %d %d\n", argv[1], c1, c2); return 1; } } // Extract files from archive data Transformer e(DECOMPRESS, archive); for (int i=0; i2) for (int i=2; i=0) fprintf(archive, "%ld\t%s\r\n", filesize[i], filename[i].c_str()); } putc(032, archive); // MSDOS EOF putc('\f', archive); putc(0, archive); // Write data Transformer e(COMPRESS, archive); long file_start=ftell(archive); for (int i=0; i=0) { uncompressed_bytes+=size; printf("%-23s %10ld -> ", filename[i].c_str(), size); FILE* f=fopen(filename[i].c_str(), "rb"); int c; for (long j=0; j0 && elapsed_time>0) { printf(" (%1.4f bpc, %1.2f%% at %1.0f KB/s)", compressed_bytes*8.0/uncompressed_bytes, compressed_bytes*100.0/uncompressed_bytes, uncompressed_bytes/(elapsed_time*1000.0)); } printf("\n"); return 0; }