/*
 * Huffman Compression Encoder  Aaron Logue 1999
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include "huffman.h"

/* Command line flags */
#define HUF_PRINT 1
#define HUF_COUNT 2
#define HUF_WALK  4

int flags;

/*  Convert bitstream to ascii representation for display purposes
 */
void btoa(char * bitcode, char * str, int bitcount) {
   int i = 0;
   while (bitcount--) {
      if (IS_HBIT(bitcode, i)) {
         *str++ = '1';
      } else {
         *str++ = '0';
      }
      i++;
   }
   *str = 0;
   return;
}

/*
 * This function walks the decoder tree and displays all of its
 * characters and their bitcodes.  It's not very useful, but it
 * does help to demonstrate how the decoder tree works.
 */
void walk_decoder_tree(HUFFMAN_ENCODING_HANDLE * h, int t, int depth) {
   char bits[256];
   CLR_HBIT(h->walkbits, depth);
   if (IS_HBIT(h->tflags, t<<1)) {
      btoa(h->walkbits, bits, depth+1);
      printf("0x%02x [%c] - %s\n", h->tree[t].left, h->tree[t].left, bits);
   } else {
      walk_decoder_tree(h, h->tree[t].left, depth+1);
   }
   SET_HBIT(h->walkbits, depth);
   if (IS_HBIT(h->tflags, (t<<1)+1)) {
      btoa(h->walkbits, bits, depth+1);
      printf("0x%02x [%c] - %s\n", h->tree[t].right, h->tree[t].right, bits);
   } else {
      walk_decoder_tree(h, h->tree[t].right, depth+1);
   }
   return;
}

void huffman_print_bitcodes(HUFFMAN_ENCODING_HANDLE * h) {
   int i, j, k, biggest;
   char printed[256];
   char bits[256];
   char c;

   for (i=0;i<256;i++) {
      printed[i] = 0;
   }
   for (i=0;i<256;i++) {
      biggest = 0;
      k = 0;
      for (j=0;j<256;j++) {
         if (h->bitcodes[j].count > biggest && !printed[j]) {
            k = j;
            biggest = h->bitcodes[k].count;
         }
      }
      if (biggest > 0) {
         btoa(h->bitcodes[k].bitcode, bits, h->bitcodes[k].bitcount);
         c = (char)k;
         if (k < 32 || k > 126) {
            c = '.';
         }
         printf("%03d - 0x%02x [%c] - %1.3f %8d - %s\n", i+1,
            (unsigned char)k, c,
            (float)h->bitcodes[k].count/h->totalbytes,
            h->bitcodes[k].count, bits);
         printed[k] = 1;
      } else {
         /* We didn't print anything, so early out. */
         i = 256;
      }
   }
   return;
}

/*
 * Make a pass through the input file, accumulating occurence counts
 * for each character value that appears in the file.
 */
int count_characters(HUFFMAN_ENCODING_HANDLE * h, char * infile) {
   unsigned char * p;
   unsigned int i, j, k, big;
   char one_line[300];
   int   count[256];
   FILE * fp;
   int bytes;
   unsigned char c;

   for (i=0;i<256;i++) {
      count[i] = 0;
   }

   fp = fopen(infile, "r");
   if (fp == 0) {
      printf("Unable to open %s.\n", infile);
      return 0;
   }
   while (bytes = fread(one_line, 1, 256, fp)) {
      p = one_line;
      while (bytes--) {
         count[(unsigned int)*p++]++;
      }
   }
   fclose(fp);

   /*  Create 'counted' array, sorted from highest to least probability
    */
   h->totalbytes = 0;
   for (i=0;i<256;i++) {
      h->totalbytes += count[i];
   }
   for (i=0;i<256;i++) {
      big = 0;
      k = 0;
      for (j=0;j<256;j++) {
         if (count[j] > big) {
            k = j;
            big = count[k];
         }
      }
      if (count[k] > 0) {
         c = (unsigned char)k;
         if (k < 32 || k > 126) {
            c = '.';
         }
         if (flags & HUF_COUNT) {
            printf("%03d - 0x%02x [%c] - %1.3f %8d\n", i+1, (unsigned char)k, c,
               (float)count[k]/h->totalbytes, count[k]);
         }
         h->counted[h->ocount].count = count[k];
         h->counted[h->ocount].c = (char)k;
         h->counted[h->ocount].nptr = NULL;
         h->ocount++;
         count[k] = 0;
      } else {
         i = 256;
      }
   }
   printf("%d unique characters to encode\n", h->ocount);
   printf("%d total bytes to encode\n", h->totalbytes);

   /*
    * Copy 'counted' array into 'bitcodes' array
    */
   for (i=0;i<256;i++) {
      h->bitcodes[i].count = 0;
      h->bitcodes[i].bitcode = NULL;
      h->bitcodes[i].bitcount = 0;
   }
   for (i=0;i<h->ocount;i++) {
      h->bitcodes[(unsigned int)h->counted[i].c].count = h->counted[i].count;
   }
   return 1;
}

/*
 * 
 */
PBIG_HUFFMAN_NODE build_huffman_tree(HUFFMAN_ENCODING_HANDLE * h) {
   int i;
   int dst;
   int ocount;
   ACCUM combined;

   /*
    * While we've got more than 2 elements in the counted
    * array, remove the last two elements, combine them into one,
    * and reinsert the combined element back into the array
    * according to its probability.
    */
   ocount = h->ocount;
   while (ocount > 1) {
      ocount--;
      combined.nptr = malloc(sizeof(BIG_HUFFMAN_NODE));
      combined.count = h->counted[ocount-1].count + h->counted[ocount].count;
      if (h->counted[ocount-1].nptr) {
         combined.nptr->pleft = h->counted[ocount-1].nptr;
         combined.nptr->left = 0;
      } else {
         combined.nptr->pleft = NULL;
         combined.nptr->left = h->counted[ocount-1].c;
      }
      if (h->counted[ocount].nptr) {
         combined.nptr->pright = h->counted[ocount].nptr;
         combined.nptr->right = 0;
      } else {
         combined.nptr->pright = NULL;
         combined.nptr->right = h->counted[ocount].c;
      }
      i = 0;
      while (i < ocount && combined.count < h->counted[i].count) {
         i++;
      }
      /*
       * We have an insertion point.  If it's not the end of the
       * list, slide all of the counted elements down to
       * make room for the element we're inserting
       */
      if (i < ocount-1) {
         dst = ocount-1;
         while (i < dst) {
            h->counted[dst].count = h->counted[dst-1].count;
            h->counted[dst].nptr = h->counted[dst-1].nptr;
            h->counted[dst].c = h->counted[dst-1].c;
            dst--;
         }
      }
      h->counted[i].count = combined.count;
      h->counted[i].nptr = combined.nptr;
      h->counted[i].c = 0;
   }
   return (h->counted[0].nptr);	/* Return pointer to root node */
}

void walk_huffman_tree(HUFFMAN_ENCODING_HANDLE * h, PBIG_HUFFMAN_NODE root, int depth) {
   char bits[256];
   int i;
   unsigned char c;

   if (root->pleft) {
      walk_huffman_tree(h, root->pleft, depth+1);
   } else {
      for (i=0;i<depth;i++) {
         printf(" ");
      }
      c = root->left;
      if (root->left < 32 || root->left > 126) {
         c = '.';
      }
      printf("0x%02x/%c\n", root->left, c); 
   }

   if (root->pright) {
      walk_huffman_tree(h, root->pright, depth+1);
   } else {
      for (i=0;i<depth;i++) {
         printf(" ");
      }
      c = root->right;
      if (root->right < 32 || root->right > 126) {
         c = '.';
      }
      printf("0x%02x/%c\n", root->right, c); 
   }
   return;
}

/*
 * Walk huffman tree to find each character's bitcode.  Create 'bitcodes'
 * array for quick lookup during the encoding phase.
 */
void build_encoder_array(HUFFMAN_ENCODING_HANDLE * h, PBIG_HUFFMAN_NODE root, int depth) {
   CLR_HBIT(h->walkbits, depth);
   if (root->pleft) {
      build_encoder_array(h, root->pleft, depth+1);
   } else {
      int i = ((depth+1)>>3) + 1; /* Number of bytes required by bitcode */
      h->bitcodes[(unsigned int)root->left].bitcount = depth+1;
      h->bitcodes[(unsigned int)root->left].bitcode = (char *)malloc(i);
      memcpy(h->bitcodes[(unsigned int)root->left].bitcode, h->walkbits, i);
   }
   SET_HBIT(h->walkbits, depth);
   if (root->pright) {
      build_encoder_array(h, root->pright, depth+1);
   } else {
      int i = ((depth+1)>>3) + 1; /* Number of bytes required by bitcode */
      h->bitcodes[(unsigned int)root->right].bitcount = depth+1;
      h->bitcodes[(unsigned int)root->right].bitcode = (char *)malloc(i);
      memcpy(h->bitcodes[(unsigned int)root->right].bitcode, h->walkbits, i);
   }
   return;
}

/*
 * We want to compress our big huffman tree into a tighter, more
 * efficient one for storage in disk file or memory for use in decoding.
 */
void build_decoder_tree(HUFFMAN_ENCODING_HANDLE * h) {
   int i;
   int t;      /* Index of position in huffman node array */
   int b;      /* Bit position as we move through character's bitcode */
   int u = 1;   /* Next unallocated huffman node */
   /* allocate space for h->ocount-1 nodes */
   h->tree = malloc((h->ocount-1) * sizeof(HUFFMAN_NODE));
   for (i=0;i<h->ocount-1;i++) {
      h->tree[i].left = 0;
      h->tree[i].right = 0;
   }
   /* allocate enough bytes to store (h->ocount-1)<<1 node flag bits */
   /* A 1 in the flags means it's a character.  A 0 is a node pointer. */
   h->tflags = malloc(((h->ocount-2)>>2)+1);
   for (i=0;i<256;i++) {
      if (h->bitcodes[i].bitcount) {
         // Building the condensed decoder tree, we maintain
         //   b - Our current bit position in h->bitcodes[i].bitcode
         //   t - Index of our current huffman node in h->tree[]
         //   u - Next available unused node
         t = 0;
         for (b=0;b<h->bitcodes[i].bitcount-1;b++) {
            /* Are we going to the left or the right */
            if (IS_HBIT(h->bitcodes[i].bitcode, b)) {
               if (!h->tree[t].right) {
                  h->tree[t].right = u; /* set right link to new node */
                  u++; /* advance new-node pointer */
                  /* clear right bit for node t flags */
                  CLR_HBIT(h->tflags, (t<<1)+1);
               }
               t = h->tree[t].right; /* follow right link to next node */
            } else {
               if (!h->tree[t].left) {
                  h->tree[t].left = u; /* set left link to new node */
                  u++; /* advance new-node pointer */
                  /* clear left bit for node t flags */
                  CLR_HBIT(h->tflags, t<<1);
               }
               t = h->tree[t].left; /* follow left link to next node */
            }
         }
         /*  We've run through all the bits of character i except the
             last one.  Set left or right half to character and set
            node flag to 1 to indicate a leaf.
          */
         if (IS_HBIT(h->bitcodes[i].bitcode, b)) {
            h->tree[t].right = (char)i; /* set right leaf to character */
            /* set right bit for node t flags */
            SET_HBIT(h->tflags, (t<<1)+1);
         } else {
            h->tree[t].left = (char)i; /* set left leaf to character */
            /* set left bit for node t flags */
            SET_HBIT(h->tflags, t<<1);
         }
      }
   }
   return;
}

/*
 * This function writes the beginning of a huffman-encoded file.
 * The first byte is a count of the number of nodes in the tree.
 * The count is followed by an array of nodes and a list of node flags.
 */
void write_decoder_tree(FILE * fp, HUFFMAN_ENCODING_HANDLE * h) {
   unsigned char c;
   c = (unsigned char)h->ocount-1;
   fwrite(&c, 1, sizeof(unsigned char), fp);
   fwrite(h->tree, 1, (unsigned int)c * sizeof(HUFFMAN_NODE), fp);
   fwrite(h->tflags, 1, ((unsigned int)(c-1)>>2)+1, fp);
   if (flags & HUF_PRINT) {
      printf("Decoder tree header is %d bytes long.\n",
         sizeof(unsigned char) + (unsigned int)c * sizeof(HUFFMAN_NODE) +
         ((unsigned int)(c-1)>>2)+1);
   }
   return;
}

void write_encoded_file(FILE * fp, HUFFMAN_ENCODING_HANDLE * h, char * infile) {
   FILE * ifp;
   int tocopy;
   int bytes;
   int ibit;
   int obit;
   char ibuf[256];
   char obuf[256];
   unsigned char * p;

   ifp = fopen(infile, "r");
   if (ifp == 0) {
      printf("Unable to open input file %s.\n", infile);
      return;
   }

   /* Write 3 bits of file size mod 8 */
   obuf[0] = 0;
   obit = 0;
   if (h->totalbytes & 1) {
      SET_HBIT(obuf, 0);
   }
   if (h->totalbytes & 2) {
      SET_HBIT(obuf, 1);
   }
   if (h->totalbytes & 4) {
      SET_HBIT(obuf, 2);
   }
   obit = 3;

   /* This is a horribly-inefficient bit-by-bit copy */
   while (bytes = fread(ibuf, 1, 256, ifp)) {
      p = ibuf;
      while (bytes--) {
         /* How many bits need to be moved */
         tocopy = h->bitcodes[(unsigned int)*p].bitcount;
         ibit = 0;
         while (tocopy) {
            while (tocopy && obit < (256*8)) {
               if (IS_HBIT(h->bitcodes[(unsigned int)*p].bitcode, ibit)) {
                  SET_HBIT(obuf, obit);
               } else {
                  CLR_HBIT(obuf, obit);
               }
               ibit++;
               obit++;
               tocopy--;
            }
            if (obit == (256*8)) {
               fwrite(obuf, 1, 256, fp);
               obit = 0;
            }
         }
         p++;
      }
   }
   if (obit) {
      fwrite(obuf, 1, (obit+7)>>3, fp);
   }
   fclose(ifp);
   return;
}

/*
 * It may be possible to eliminate the first huffman tree and construct
 * the condensed one directly, then build the encoder array from it.
 */
int huffman_encode_file(HUFFMAN_ENCODING_HANDLE * h,
      char * infile, char * outfile) {
   FILE * fp;
   if (!count_characters(h, infile)) {
      return(0);
   }
   build_huffman_tree(h);
   if (flags & HUF_WALK) {
      walk_huffman_tree(h, h->counted[0].nptr, 0);
   }
   build_encoder_array(h, h->counted[0].nptr, 0);
   build_decoder_tree(h);

   fp = fopen(outfile, "wb");
   if (fp == 0) {
      printf("Unable to create %s.\n", outfile);
      return;
   }

   /*
    * Write file flags, node count, huffman nodes, and node flags
    */
   write_decoder_tree(fp, h);

   /*
    * Write the huffman-encoded bitstream
    */
   write_encoded_file(fp, h, infile);

   fclose(fp);

   return(1);
}

/*
 * This function is called to allocate the data structures used
 * to accumulate character occurence counts.  It must be called
 * prior to calling huffman_encode_file.
 */
HUFFMAN_ENCODING_HANDLE * huffman_init_encoder(void) {
   HUFFMAN_ENCODING_HANDLE * result;
   result = malloc(sizeof(HUFFMAN_ENCODING_HANDLE));
   result->ocount = 0;
   result->totalbytes = 0;
   return(result);
}

void free_huffman_tree(HUFFMAN_ENCODING_HANDLE * h, PBIG_HUFFMAN_NODE root) {
   if (root->pleft) {
      free_huffman_tree(h, root->pleft);
   }
   if (root->pright) {
      free_huffman_tree(h, root->pright);
   }
   /* Having walked both branches of this node, we can free it */
   free(root);
   return;
}

void huffman_deinit_encoder(HUFFMAN_ENCODING_HANDLE * h) {
   int i;
   /* Walk huffman tree, freeing as we go */
   free_huffman_tree(h, h->counted[0].nptr);
   /* Free character arrays where bitcodes were stored */
   for (i=0;i<256;i++) {
      if (h->bitcodes[i].bitcode) {
         free(h->bitcodes[i].bitcode);
      }
   }
   free(h->tree);
   free(h->tflags);
   free(h);
   return;
}


int main(int argc, char **argv) {
   char * p;
   char ifile[256];
   char ofile[256];
   HUFFMAN_ENCODING_HANDLE * h;

   printf("Minimum redundancy variable length character encoding.\n\n");
   flags = 0;
   ifile[0] = 0;
   ofile[0] = 0;
   if (argc > 2) {
      while (argc > 1) {
         argc--;
         if (*argv[argc] == '-' || *argv[argc] == '/') {
            p = argv[argc];
            while (*p) {
               switch (*p++) {
                  case 'c': flags |= HUF_COUNT;
                     break;
                  case 'p': flags |= HUF_PRINT;
                     break;
                  case 'w': flags |= HUF_WALK;
                     break;
                  case '-':
                  case '/':
                  default:
                     break;
               }
            }
         } else {
            if (ofile[0] == 0) {
               strcpy(ofile, argv[argc]);
            } else {
               strcpy(ifile, argv[argc]);
            }
         }
      }

      h = huffman_init_encoder();
      if (!huffman_encode_file(h, ifile, ofile)) {
         printf("Unable to encode file %s\n", ifile);
         exit(-1);
      }
      if (flags & HUF_PRINT) {
         huffman_print_bitcodes(h);
         /* walk_decoder_tree(h, 0, 0); */
      }
      huffman_deinit_encoder(h);

   } else {
      printf("Usage: huffman <source> <destination> [switches]\n\n");
      printf("The following switches may be used:\n");
      printf("   -p Print characters, probabilities, and bitcodes\n");
      exit(-1);
   }
}
