/* barf.cpp - Better Archiver with Recursive Functionality (C) 2003, Matt Mahoney. This is free software and may be copied, modified, and distributed under the GNU General Public License, http://www.gnu.org/licenses/gpl.txt To compress: barf c files... (appends .x or .x?? extension) To decompress: barf d file.x (removes extension) If file names are omitted, reads names from standard input If compression/decompresion is successful, then the original files are removed (like with gzip). BARF compresses ANY nonempty file by at least one byte, including previously compressed files. Thus, by recursive compression, it is possible to eventually compress any file to 0 bytes. BARF is fast and efficient. In tests on the 14 file Calgary corpus, it compresses all of the files to 1 byte each in a single pass, or 0 bytes each in two passes. Both passes run in under 1 second on a 750 MHz PC. BARF achieves this remarkable result by trying 257 different compression methods on each file, and choosing the best one. The compression format is indicated by the file name extension, either .x or .x?? where ?? is a digit and a letter, e.g. .x3a. To compile (Borland) with the Calgary corpus in a subdirectory: (g++ chokes on the huge barf.h header, over 11 MB). bcc32 mkstring.cpp cd calgary dir/o/b | ..\mkstring > ..\barf.h cd .. bcc32 barf.cpp (barf.exe is about 3 MB) upx barf.exe (optional, shrinks barf.exe to about 1 MB) To test: cd calgary dir/b | ..\barf c (should produce bib.x, book1.x ... 1 byte each) dir/b | ..\barf c (bib.x.x0a, book1.x.x0b, ... 0 bytes each) dir/b | ..\barf c (Can't compress any more, sorry) dir/b | ..\barf d (bib.x, book1.x ... 1 byte each) dir/b | ..\barf d (bib, book1, ... in original form) You can premodel up to 255 files using mkstring, and those files will then be compressed to 1 byte, even if you rename them first. Other files are still compressed a little using a really crude LZ77 code, but of course you can do this over and over again :-) */ #include #include #include #include using namespace std; // Prebuilt models struct Model { long len; // Modeled file length const char* filename; // 0 marks end of array const char* contents; // Modeled file contents } model[] = #include "barf.h" // Generated by mkstring.cpp // Compress file to file.x or file.x?? void compress(const string& file) { // Open file FILE* f1=fopen(file.c_str(), "rb"); if (!f1) { perror(file.c_str()); return; } // Get input length fseek(f1, 0, SEEK_END); long len=ftell(f1); // Empty files cannot be compressed if (len==0) { printf("%s is empty, cannot compress further\n", file.c_str()); return; } // Test whether the file is modeled int i; for (i=0; model[i].filename; ++i) { if (len==model[i].len) { fseek(f1, 0, SEEK_SET); int j; for (j=0; j 0) { int b=0; // Number of chars written for (int i=0; i= 2) { // Output i-b-2 pending literals, symbol at i-2 for (int j=i-2; j >= 2 && i-j < 223; --j) { if (buf[i-1]==buf[j-1] && buf[i-2]==buf[j-2]) { if (i-b > 2) { putc(i-b-2, f2); fwrite(buf+b, 1, i-b-2, f2); b=i-2; } putc(i-j+32, f2); b+=2; break; } } } } if (b %s (%ld bytes)\n", file.c_str(), len, outfile.c_str(), len2); return; } // If the file is not smaller after LZ compression, then try again. // "Compress" by removing the first byte and adding an // extension of .x?? consisting of a digit (0-9) and letter (a-z) // encoding that byte in base 26, e.g. .x0a for 0, .x0b for 1,... // up to .x9v for 255. remove(outfile.c_str()); fseek(f1, 0, SEEK_SET); int c=getc(f1); outfile=file+".x"+char('0'+c/26)+char('a'+c%26); f2=fopen(outfile.c_str(), "wb"); if (!f2) { perror(outfile.c_str()); fclose(f1); return; } while ((c=getc(f1))!=EOF) putc(c, f2); len2=ftell(f2); fclose(f2); fclose(f1); remove(file.c_str()); printf("%s (%ld bytes) -> %s (%ld bytes)\n", file.c_str(), len, outfile.c_str(), len2); } // Decompress file.x or file.x0a to file.x9v to file void decompress(const string& file) { // Open input file FILE *f1=fopen(file.c_str(), "rb"); if (!f1) { perror(file.c_str()); return; } FILE *f2=0; // Test for .x?? extension. Remove it, and prepend byte ?? to file // where ?? is a base 26 [0-9][a-z] representation of the byte. const int n=file.size(); if (n>4 && file[n-4]=='.' && (file[n-3]=='x' || file[n-3]=='X') && isdigit(file[n-2]) && isalpha(file[n-1])) { int c=26*(file[n-2]-'0')+file[n-1]-'a'; if (isupper(file[n-1])) c=c+'a'-'A'; if (c<0 || c>255) { fclose(f1); printf("%s was not compressed by barf\n", file.c_str()); return; } string outfile=file.substr(0, n-4); f2=fopen(outfile.c_str(), "wb"); if (!f2) { fclose(f1); perror(outfile.c_str()); return; } putc(c, f2); while ((c=getc(f1))!=EOF) putc(c, f2); long len=ftell(f1); long len2=ftell(f2); fclose(f2); fclose(f1); remove(file.c_str()); printf("%s (%ld bytes) -> %s (%ld bytes)\n", file.c_str(), len, outfile.c_str(), len2); return; } // Test for .x extension else if (n<3 || file[n-2]!='.' || (file[n-1]!='x' && file[n-1]!='X')) { fclose(f1); printf("%s was not compressed by barf\n", file.c_str()); return; } // Chop .x to get outfile string outfile=file.substr(0, n-2); f2=fopen(outfile.c_str(), "wb"); if (!f2) { fclose(f1); perror(outfile.c_str()); return; } // Decompress a model file int c=getc(f1); // Index into model if (c!=EOF && c<255) { for (int i=0; i<=c; ++i) { // Check model array bounds if (!model[i].filename) { printf("%s corrupted\n", file.c_str()); fclose(f2); fclose(f1); remove(outfile.c_str()); return; } } for (long i=0; i0) { buf[pos]=c; if (++pos==256) pos=0; --lit; putc(c, f2); } else if (c<32) lit=c; else { buf[pos]=buf[(pos-c+32)&255]; putc(buf[pos], f2); if (++pos==256) pos=0; buf[pos]=buf[(pos-c+32)&255]; putc(buf[pos], f2); if (++pos==256) pos=0; } } } long len=ftell(f1); long len2=ftell(f2); fclose(f2); fclose(f1); remove(file.c_str()); printf("%s (%ld bytes) -> %s (%ld bytes)\n", file.c_str(), len, outfile.c_str(), len2); } // Read a line of input from f into s, return true if successful bool getline(FILE* f, string& s) { s=""; int c; while ((c=getc(f))!=EOF && c!='\n') s+=char(c); return !s.empty() || c!=EOF; } // Command line is: barf (c|d) [files...] int main(int argc, char** argv) { // Check for proper arguments, or print help message if (argc<2 || (string(argv[1])!="c" && string(argv[1])!="d")) { printf( "barf - Basic Archiver with Recursive Features\n" "\n" "To compress: barf c file1 file2 ... (adds .x or .x?? extensions)\n" "Or: dir/b | barf c (reads file names)\n" "\n" "To decompress: barf d file1.x file2.x ... (removes extensions)\n" "Or: dir/b | barf d\n" "\n" "Copyright (C) 2003, Matt Mahoney, http://cs.fit.edu/~mmahoney/compression/\n" "\n" "This is free software. It may be distributed, copied, and modified\n" "under terms of the GNU General Public License, see\n" "http://www.gnu.org/licenses/gpl.txt\n\n"); return 1; } // Get file names and compress/decompress if (argc > 2) { for (int i=2; i