#===================== # PART A: HUFFMAN CODE #===================== >>> ================================ RESTART ================================ # executing the huffman cycle >>> char_count("live and let live") {'d': 1, 'n': 1, 'i': 2, 'a': 1, 'e': 3, 'l': 3, 'v': 2, ' ': 3, 't': 1} >>> build_huffman_tree(char_count("live and let live")) [[' ', ['i', 'v']], [[['d', 'n'], ['a', 't']], ['e', 'l']]] >>> d = generate_code(build_huffman_tree(char_count("live and let live"))) >>> d {'d': '1000', 't': '1011', 'e': '110', 'v': '011', 'n': '1001', 'i': '010', 'a': '1010', 'l': '111', ' ': '00'} >>> compress("tell",d) '1011110111111' >>> len(compress("tell",d)) 13 >>> e = build_decoding_dict(d) >>> e {'010': 'i', '1000': 'd', '1010': 'a', '1011': 't', '00': ' ', '111': 'l', '110': 'e', '1001': 'n', '011': 'v'} >>> decompress(compress("tell",d),e) 'tell' >>> ================================ RESTART ================================ # checking the compression ratio of some examples >>> huffman_ratio("ab","ab") 0.14285714285714285 >>> 1/7 0.14285714285714285 >>> huffman_ratio("ab","abcd") 0.2857142857142857 >>> 2/7 0.2857142857142857 >>> huffman_ratio("hello","a"*100) Traceback (most recent call last): File "", line 1, in compression_ratio("hello","a"*100) File "/Users/Yael/Google Drive/cs1001/week12/4class/huff/_5_compression_ratio.py", line 6, in compression_ratio len_compress = len(compress(text, d)) File "/Users/Yael/Google Drive/cs1001/week12/4class/huff/huffman.py", line 73, in compress return "".join(encoding_dict[ch] for ch in text if ord(ch)<=128) File "/Users/Yael/Google Drive/cs1001/week12/4class/huff/huffman.py", line 73, in return "".join(encoding_dict[ch] for ch in text if ord(ch)<=128) KeyError: 'h' # crashed because text includes a char that does not appear in the corpus >>> asci = [chr(i) for i in range(128)] >>> asci ['\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\t', '\n', '\x0b', '\x0c', '\r', '\x0e', '\x0f', '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f'] >>> asci = "".join(asci) >>> asci '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f' >>> huffman_ratio("hello","a"*100+asci) # problem solved 1.1428571428571428 >>> 8/7 1.1428571428571428 # why 8/7 ? let's take a look at the code dictionary >>> d = generate_code(build_huffman_tree(char_count("a"*100+asci))) >>> d["a"] '0' >>> d["z"] '11011101' # "a" is so frequent that it's represented by a single bit, # increasing the representation length of the rest of the chars to 8 bits # what is the minimal k such that a code generated from the corpus "a"*k+asci # will encode "a" using a single bit ? >>> d = generate_code(build_huffman_tree(char_count("a"*64+asci))) >>> d['a'] '0' >>> d = generate_code(build_huffman_tree(char_count("a"*63+asci))) >>> d['a'] '11' #======================= # PART B: LZ compression #======================= >>> ================================ RESTART ================================ # executing the LZ cycle >>> inter1,bits,inter2,text2 = process("abcdabc") >>> inter1 ['a', 'b', 'c', 'd', [4, 3]] >>> len(bits) 50 >>> 4*8+18 50 >>> [bits[8*i:8*(i+1)] for i in range(4)] ['01100001', '01100010', '01100011', '01100100'] >>> [bin(ord(c))[2:] for c in "abcd"] ['1100001', '1100010', '1100011', '1100100'] >>> [chr(int(e,2)) for e in [bits[8*i:8*(i+1)] for i in range(4)]] ['a', 'b', 'c', 'd'] >>> bits[32] '1' >>> bits[33:45],bits[45:] ('000000000100', '00011') >>> inter1,bits,inter2,text2 = process("xyzxyzwxyzw") >>> inter1 ['x', 'y', 'z', [3, 3], 'w', [4, 4]] >>> ================================ RESTART ================================ >>> inter1,bits,inter2,text2 = process("a"*40) >>> inter1 ['a', [1, 31], [1, 8]] # what is the compression ratio of strings of the form "a"*k # when k tends to infinity ? >>> lz_ratio("a"*1000) 0.086 >>> 18/(7*31) 0.08294930875576037 >>> ================================ RESTART ================================ # compare compression ratios of the two algorithms on a random string # comparing huffman and lz on two strings # run y_r12_compare.py experiment 1: # random string # huffman doesn't help but also doesn't hurt; back to fixed-length code Huffman compression ratio: 0.9884285714285714 # lz pays the price of the extra bit LZ compression ratio: 1.1428571428571428 experiment 2: # a string with both non-uniform char distribution and repetitions Huffman compression ratio: 0.6385836385836385 LZ compression ratio: 0.2629222629222629 # both manage to compress but lz does better, # because more chars are decoded in 26 bits only. # when repetitions exist (i.e. chars are not independent of each other, as huffman assumes) # we definitely want to take advantage of that.