#=====================
# PART A: HUFFMAN CODE
#=====================
>>> ================================ RESTART ================================
# executing the huffman cycle
>>> char_count("live and let live")
{'d': 1, 'n': 1, 'i': 2, 'a': 1, 'e': 3, 'l': 3, 'v': 2, ' ': 3, 't': 1}
>>> build_huffman_tree(char_count("live and let live"))
[[' ', ['i', 'v']], [[['d', 'n'], ['a', 't']], ['e', 'l']]]
>>> d = generate_code(build_huffman_tree(char_count("live and let live")))
>>> d
{'d': '1000', 't': '1011', 'e': '110', 'v': '011', 'n': '1001', 'i': '010', 'a': '1010', 'l': '111', ' ': '00'}
>>> compress("tell",d)
'1011110111111'
>>> len(compress("tell",d))
13
>>> e = build_decoding_dict(d)
>>> e
{'010': 'i', '1000': 'd', '1010': 'a', '1011': 't', '00': ' ', '111': 'l', '110': 'e', '1001': 'n', '011': 'v'}
>>> decompress(compress("tell",d),e)
'tell'
>>> ================================ RESTART ================================
# checking the compression ratio of some examples
>>> huffman_ratio("ab","ab")
0.14285714285714285
>>> 1/7
0.14285714285714285
>>> huffman_ratio("ab","abcd")
0.2857142857142857
>>> 2/7
0.2857142857142857
>>> huffman_ratio("hello","a"*100)
Traceback (most recent call last):
  File "<pyshell#15>", line 1, in <module>
    compression_ratio("hello","a"*100)
  File "/Users/Yael/Google Drive/cs1001/week12/4class/huff/_5_compression_ratio.py", line 6, in compression_ratio
    len_compress = len(compress(text, d))
  File "/Users/Yael/Google Drive/cs1001/week12/4class/huff/huffman.py", line 73, in compress
    return "".join(encoding_dict[ch] for ch in text if ord(ch)<=128)
  File "/Users/Yael/Google Drive/cs1001/week12/4class/huff/huffman.py", line 73, in <genexpr>
    return "".join(encoding_dict[ch] for ch in text if ord(ch)<=128)
KeyError: 'h'
# crashed because text includes a char that does not appear in the corpus
>>> asci = [chr(i) for i in range(128)]
>>> asci
['\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\t', '\n', '\x0b', '\x0c', '\r', '\x0e', '\x0f', '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f']
>>> asci = "".join(asci)
>>> asci
'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f'
>>> huffman_ratio("hello","a"*100+asci) # problem solved
1.1428571428571428
>>> 8/7
1.1428571428571428
# why 8/7 ? let's take a look at the code dictionary
>>> d = generate_code(build_huffman_tree(char_count("a"*100+asci)))
>>> d["a"]
'0'
>>> d["z"]
'11011101'
# "a" is so frequent that it's represented by a single bit,
# increasing the representation length of the rest of the chars to 8 bits
# what is the minimal k such that a code generated from the corpus "a"*k+asci
# will encode "a" using a single bit ?
>>> d = generate_code(build_huffman_tree(char_count("a"*64+asci)))
>>> d['a']
'0'
>>> d = generate_code(build_huffman_tree(char_count("a"*63+asci)))
>>> d['a']
'11'
#=======================
# PART B: LZ compression
#=======================
>>> ================================ RESTART ================================
# executing the LZ cycle
>>> inter1,bits,inter2,text2 = process("abcdabc")
>>> inter1
['a', 'b', 'c', 'd', [4, 3]]
>>> len(bits)
50
>>> 4*8+18
50
>>> [bits[8*i:8*(i+1)] for i in range(4)]
['01100001', '01100010', '01100011', '01100100']
>>> [bin(ord(c))[2:] for c in "abcd"]
['1100001', '1100010', '1100011', '1100100']
>>> [chr(int(e,2)) for e in [bits[8*i:8*(i+1)] for i in range(4)]]
['a', 'b', 'c', 'd']
>>> bits[32]
'1'
>>> bits[33:45],bits[45:]
('000000000100', '00011')
>>> inter1,bits,inter2,text2 = process("xyzxyzwxyzw")
>>> inter1
['x', 'y', 'z', [3, 3], 'w', [4, 4]]
>>> ================================ RESTART ================================
>>> inter1,bits,inter2,text2 = process("a"*40)
>>> inter1
['a', [1, 31], [1, 8]]
# what is the compression ratio of strings of the form "a"*k
# when k tends to infinity ?
>>> lz_ratio("a"*1000)
0.086
>>> 18/(7*31)
0.08294930875576037
>>> ================================ RESTART ================================
# compare compression ratios of the two algorithms on a random string
# comparing huffman and lz on two strings
# run y_r12_compare.py
experiment 1: # random string
# huffman doesn't help but also doesn't hurt; back to fixed-length code
Huffman compression ratio: 0.9884285714285714
# lz pays the price of the extra bit
LZ      compression ratio: 1.1428571428571428
experiment 2: # a string with both non-uniform char distribution and repetitions
Huffman compression ratio: 0.6385836385836385
LZ      compression ratio: 0.2629222629222629
# both manage to compress but lz does better,
# because more chars are decoded in 26 bits only.
# when repetitions exist (i.e. chars are not independent of each other, as huffman assumes)
# we definitely want to take advantage of that.