信息论_香农-费诺-埃利斯码编码

解压缩的程序都已经完成,程序为c语言。目前有一点点小bug(长文本的处理),但总体来说能用!

要求

实现香农-费诺-埃利斯对英文文本进行编码和解码

课设作业已经提交,所以我没什么功夫完善这个代码。代码内容仅作参考,交作业的话可以用一个简短的纯英语文本演示。长文本的压缩很大概率会出问题

参考效果

  • source.txt是需要编码的内容
  • code.dat是编码后的二进制文件
  • data.key是相对应的编码表

未实现的优化方案:这里可以更精简key文件,可以尝试使用四字节来描述二进制,用一个字节来描述码长。128*5=640字节。这样已经非常小了。

程序直接利用ascii表来做下标,可以实现只遍历一次就将整个文件计数完毕。

另外,修改了原有的码长计算公式,原来的公式似乎会重码。所以我在原本公式的基础上将码长多加了一个1;这样可以保证不会重码,压缩率变低了。(不知道是不是理解问题,参考的百度文库里的某一个文档)

必须保证文本里面的所有字符内容都是在ascii表能找到的。否则不能实现计数

代码

encode.c

/**
 * @> Author       : BinBla
 * @> Date         : 2020-12-07 18:38:49
 * @> LastEditors  : BinBla
 * @> LastEditTime : 2020-12-10 20:03:26
 * @> FilePath     : /work/c/encoding.c
 * @> Description  : 压缩 输入文件 输出文件
 * @> Gold         : 实现香农-费诺-埃利斯码压缩
 **/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define CODE_MAX_SIZE 20  //编码大小(长文本需要很大,出现概率越小,编码越长)
#define TABLE_MAX_SIZE 128  //表大小
#define UNIT_SIZE 4096      //每次处理单位大小(过大会报错
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long uint64_t;

typedef struct CodeTable {
  char symbol;                   // 符号
  long count;                    // 个数
  double probability;            // 概率
  double cumProbability;         // 累积概率
  double fixCumulation;          // 修正累积概率
  uint8_t codeLen;               // 码字长度
  char codeWord[CODE_MAX_SIZE];  // 码字(临时使用字符数组存放伪二进制)
} codeTable;

typedef struct CodeKeyTable {
  char symbol;                   //符号
  char codeWord[CODE_MAX_SIZE];  //伪二进制
} codeKeyTable;

typedef struct KeyTable {
  int unitsize;  //读写单元大小,这个解码必须
  long allSize;
  codeKeyTable table[TABLE_MAX_SIZE];  //精简后的编码表
} keyTable;

void countChar(codeTable *table, char *buff, long size) {
  int c = 0;
  for (long i = 0; i < size; ++i) {
    c = buff[i];
    table[c].count += 1;
  }
}

void printTable(codeTable *table) {
  for (uint8_t i = 0; i < TABLE_MAX_SIZE; ++i) {
    if (table[i].count != 0)
      printf("%c\t%ld\t%lf\t%lf\t%lf\t%d\t%s\n", table[i].symbol,
             table[i].count, table[i].probability, table[i].cumProbability,
             table[i].fixCumulation, table[i].codeLen, table[i].codeWord);
  }
}

void initTable(codeTable *table) {
  for (uint8_t i = 0; i < TABLE_MAX_SIZE; ++i) {
    table[i].symbol = i;
    table[i].count = 0.0;
    table[i].probability = 0.0;
    table[i].cumProbability = 0.0;
    table[i].fixCumulation = 0.0;
    table[i].codeLen = 0;
  }
}

void getBinaryList(char *dest, double tag, int allLen) {
  int temp = 0;
  for (int i = 0; i < CODE_MAX_SIZE; ++i) {
    if (i < allLen) {
      tag *= 2;
      temp = tag;  //强制类型转换
      dest[i] = temp == 1 ? '1' : '0';
      tag -= temp;
    } else {
      dest[i] = 0;
    }
  }
}

void finishCodeTable(codeTable *table, long size) {
  double cum = 0;
  for (uint8_t i = 0; i < TABLE_MAX_SIZE; ++i) {
    if (table[i].count != 0) {
      table[i].probability = ((double)(table[i].count) / size);  // 获取概率
      table[i].fixCumulation =
          cum +
          0.5 *
              table[i]
                  .probability;  // 获取修正累积概率(使用这个数据来构建二进制序列
      cum += table[i].probability;    // 累积当前概率
      table[i].cumProbability = cum;  // 获取累积概率(无用
      table[i].codeLen =
          (uint8_t)(log2(1 / table[i].probability) + 2);  // 计算码长
      getBinaryList(table[i].codeWord, table[i].fixCumulation,
                    table[i].codeLen);
    }
  }
}

void buildTableFinal(keyTable *key, codeTable *table, long allSize) {
  key->unitsize = UNIT_SIZE;
  key->allSize = allSize;
  for (uint8_t i = 0; i < TABLE_MAX_SIZE; ++i) {
    key->table[i].symbol = i;
    strcpy(key->table[i].codeWord, table[i].codeWord);
  }
}
void showTemp(char temp) {
  putchar('\t');
  for (int j = 0; j < 8; ++j) {
    char c = (temp >> (7 - j)) & 1;
    printf("%d", c);
  }
  // putchar('\n');
}
void writeDatOut(FILE *outFile, char *buff, long allLen, codeTable *table) {
  uint8_t temp = 0;                 //以单个字节写入
  char c;                           //当前压缩的字符(用作定位)
  uint8_t last = sizeof(temp) * 8;  //作为位运算的指针
  uint8_t t;
  for (long i = 0; i < allLen; ++i) {
    c = buff[i];
    for (uint8_t j = 0; j < table[c].codeLen; ++j) {
      if (last == 0) {
        fwrite(&temp, 1, 1, outFile);
        last += 8;
        temp ^= temp;
        t = 0;
      }
      if (table[c].codeWord[j] == '1') {
        temp |= 1 << (last - 1);
        t = 1;
      }
      --last;
    }
  }
  if (t) {
    fwrite(&temp, 1, 1, outFile);
  }
}

void preDeal(codeTable *table, long lastLen, FILE *in) {
  char *buff = (char *)malloc(UNIT_SIZE);
  long readLen;
  while (lastLen > 0) {
    if (lastLen > UNIT_SIZE) {
      readLen = UNIT_SIZE;
      lastLen -= UNIT_SIZE;
    } else {
      readLen = lastLen;
      lastLen ^= lastLen;
    }
    fread(buff, readLen, 1, in);
    countChar(table, buff, readLen);
  }
}

void wirteDat(FILE *in, FILE *out, long lastLen, codeTable *table) {
  char *buff = (char *)malloc(UNIT_SIZE);
  int readLen;
  while (lastLen > 0) {
    if (lastLen > UNIT_SIZE) {
      readLen = UNIT_SIZE;
      lastLen -= UNIT_SIZE;
    } else {
      readLen = lastLen;
      lastLen ^= lastLen;
    }
    fread(buff, readLen, 1, in);
    writeDatOut(out, buff, readLen, table);
  }
}

void printKey(keyTable *kTable) {
  for (int i = 0; i < 128; ++i) {
    if (kTable->table[i].codeWord[0] == '0' ||
        kTable->table[i].codeWord[0] == '1')
      printf("%c\t%s\n", kTable->table[i].symbol, kTable->table[i].codeWord);
  }
}

int main(int argc, char *argv[]) {
  char *buff;    //临时数据
  long allLen;   //完整长度
  long lastLen;  //剩余需要录入的长度
  long readLen;  //即将录入的长度
  if (argc != 4) {
    printf(
        "\n\tPz add file-true-path to the arg table!\n\n\tHear is an "
        "example:\n\n\tencode.bin source.txt code.dat data.key\n\n");
    return -1;
  }
  printf("\n您的key文件:%s\n\n您的编码文件:%s\n\n你的输入文件:%s\n\n", argv[1],
         argv[2], argv[3]);
  puts("---------\t回车确认输入正确\t----------");
  getchar();
  // 要求输入文本的输入路径(绝对路径相对路径均可)和压缩后的输出路径(绝对路径相对路径均可)
  // 例如: ./encoding.bin ./inputFile.txt ./textCompress.dat ./dat.key
  FILE *inputFile = fopen(argv[1], "r");
  fseek(inputFile, 0, SEEK_END);
  allLen = ftell(inputFile);
  lastLen = allLen;
  fseek(inputFile, 0, SEEK_SET);
  codeTable *table = (codeTable *)malloc(sizeof(codeTable) * TABLE_MAX_SIZE);
  initTable(table);
  preDeal(table, allLen, inputFile);
  finishCodeTable(table, allLen);  //建立编码表
  keyTable *kTable = (keyTable *)malloc(sizeof(keyTable));
  buildTableFinal(kTable, table, allLen);  //建立精简编码表(用于保存
  // printTable(table);
  //表构建完毕
  //接下来再读一次文件开始编码压缩(这里可以优化,采用队列的方式实现文件只读一次)
  FILE *outputFile = fopen(argv[2], "wb");
  fseek(inputFile, 0, SEEK_SET);
  wirteDat(inputFile, outputFile, allLen, table);
  fclose(inputFile);
  fclose(outputFile);

  FILE *key = fopen(argv[3], "wb");
  fwrite(kTable, sizeof(keyTable), 1, key);
  fclose(key);

  puts("编码完成!感谢您的使用!\n");
  return 0;
}

decode.c

/**
 * @> Author       : BinBla
 * @> Date         : 2020-12-07 18:38:49
 * @> LastEditors  : BinBla
 * @> LastEditTime : 2020-12-10 19:57:52
 * @> FilePath     : /work/c/decoding.c
 * @> Description  : 压缩 输入文件 输出文件
 * @> Gold         : 实现香农-费诺-埃利斯码解压
 **/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long uint64_t;
#define TABLE_MAX_SIZE 128  //表大小
#define CODE_MAX_SIZE 20
typedef struct CodeKeyTable {
  char symbol;                   //符号
  char codeWord[CODE_MAX_SIZE];  //伪二进制
} codeKeyTable;

typedef struct KeyTable {
  int unitSize;  //读写单元大小,这个解码必须
  long allSize;
  codeKeyTable table[TABLE_MAX_SIZE];  //精简后的编码表
} keyTable;

typedef struct Tree {
  char *code;
  struct Tree *left;
  struct Tree *right;
} tree;

void initNode(tree *node) {
  node->code = NULL;
  node->left = NULL;
  node->right = NULL;
}

void buildTree(tree *root, char *c, char *tag) {
  switch (tag[0]) {
    case '0': {
      if (root->left == NULL) {
        root->left = (tree *)malloc(sizeof(tree));
        initNode(root->left);
      }
      buildTree(root->left, c, ++tag);
      break;
    }
    case '1': {
      if (root->right == NULL) {
        root->right = (tree *)malloc(sizeof(tree));
        initNode(root->right);
      }
      buildTree(root->right, c, ++tag);
      break;
    }
    default: {
      root->code = c;
      break;
    }
  }
}

void printTable(keyTable *key) {
  for (uint8_t i = 0; i < TABLE_MAX_SIZE; ++i) {
    if (key->table[i].codeWord[0] == '0' || key->table[i].codeWord[0] == '1')
      printf("%c,%s\n", key->table[i].symbol, key->table[i].codeWord);
  }
}

void initTree(tree *root, keyTable *key) {
  for (uint8_t i = 0; i < TABLE_MAX_SIZE; ++i) {
    if (key->table[i].codeWord[0] == '0' || key->table[i].codeWord[0] == '1') {
      // printf("获取到一个有效字符:%c\t%s\n", *(&key->table[i].symbol),
      //        key->table[i].codeWord);
      buildTree(root, &(key->table[i].symbol),
                key->table[i].codeWord);  //递归构建当前字符的编码节点
    }
  }
}

void rebuildKeyTable(keyTable *key, FILE *keyFile) {
  fread(key, sizeof(keyTable), 1, keyFile);
}

void showTemp(char temp) {
  putchar('\t');
  for (int j = 0; j < 8; ++j) {
    char c = (temp >> (7 - j)) & 1;
    printf("%d", c);
  }
  // putchar('\n');
}

void decompressUnit(FILE *out, FILE *in, tree *root, int size) {
  char temp = 0;  //操作空间
  uint8_t tempLast = 8;
  tree *treePointer;
  char buff;
  int z = 1;
  int eof = feof(in);
  while (size > 0) {  //需要读取size数量的字符
    treePointer = root;
    while (1) {                         //读取一个字符
      if (tempLast == 8 && eof == 0) {  //获取一个字节新数据
        fread(&temp, 1, 1, in);
        eof = feof(in);
        tempLast ^= tempLast;
      }
      if (temp < 0) {                      //高位1
        if (treePointer->right == NULL) {  //成功取出字符
          // printf("成功取出字符:%c\n", *(treePointer->code));
          // showTemp(temp);
          if ((treePointer->code) == NULL) {
            puts("一定是重码了!");
            exit(-1);
          }
          buff = *(treePointer->code);
          break;
        }
        treePointer = treePointer->right;
      } else {                            //高位0
        if (treePointer->left == NULL) {  //成功取出字符
          // printf("成功取出字符:%c\n", *(treePointer->code));
          // showTemp(temp);
          if ((treePointer->code) == NULL) {
            puts("一定是重码了!");
            exit(-1);
          }
          buff = *(treePointer->code);
          break;
        }
        treePointer = treePointer->left;
      }
      temp = temp << 1;
      ++tempLast;
    }
    // printf("%c", buff);
    fwrite(&buff, 1, 1, out);
    --size;
  }
}

void decompress(FILE *out, FILE *in, tree *root, long allSize, int unitSize) {
  long lastSize = allSize;
  int optionSize;
  while (lastSize > 0) {
    if (lastSize > unitSize) {
      optionSize = unitSize;
      lastSize -= unitSize;
    } else {
      optionSize = lastSize;
      lastSize ^= lastSize;
    }
    decompressUnit(out, in, root, optionSize);
  }
}

int main(int argc, char *argv[]) {
  if (argc != 4) {
    printf(
        "\n\tPz add file-true-path to the arg table!\n\n\tHear is an "
        "example:\n\n\tdecode.bin data.key code.dat decode.txt\n\n");
    return -1;
  }
  FILE *keyFile = fopen(argv[1], "rb");
  FILE *datFile = fopen(argv[2], "rb");
  FILE *decodeFile = fopen(argv[3], "wb");
  keyTable keyTable;
  rebuildKeyTable(&keyTable, keyFile);
  printf("\n您的key文件:%s\n\n您的编码文件:%s\n\n你的输入文件:%s\n\n", argv[1],
         argv[2], argv[3]);
  puts("---------\t回车确认输入正确\t----------");
  getchar();
  puts("重建编码表完成\n");
  fclose(keyFile);
  // printTable(&keyTable);
  tree root = {NULL, NULL, NULL};
  initTree(&root, &keyTable);
  puts("构建编码树完成\n");
  decompress(decodeFile, datFile, &root, keyTable.allSize, keyTable.unitSize);
  printf("解压完成,感谢您的使用!\n");
  fclose(datFile);
  fclose(decodeFile);
  return 0;
}

后记

这是一个非常新手的写法,代码质量很差,仅作训练参考

2 Comments

  • 好像是会重码,每次decoding的时候会走到这个分支,请问可不可以改善一下这个问题?

    • 这个编码公式给我的感觉就是字符多了肯定要重码。我在做这个的时候google了一圈,并没有看到这个编码的完整代码示例。所以我也不敢确定到底是不是这个编码方式的问题。所以很抱歉,这个问题我暂时解决不了。如果你只是做做密码学的课程设计的话,可以用短一点的ASCII文本敷衍一下,这个编码用得不多,意义不大。

回复 BinBla 取消回复

您的电子邮箱地址不会被公开。 必填项已用 * 标注