Python 3.2.5 (default, May 15 2013, 23:07:10) [MSC v.1500 64 bit (AMD64)] on win32 Type "copyright", "credits" or "license()" for more information. >>> ================================ RESTART ================================ #Our corpus >>> s = "live and let live" #we generate the huffman code >>> char_count(s) {'a': 1, ' ': 3, 'e': 3, 'd': 1, 'i': 2, 'l': 3, 'n': 1, 't': 1, 'v': 2} >>> count_dict = char_count(s) >>> count_dict {'a': 1, ' ': 3, 'e': 3, 'd': 1, 'i': 2, 'l': 3, 'n': 1, 't': 1, 'v': 2} >>> huff_tree = build_huffman_tree(count_dict) >>> huff_tree [['l', ['i', 'v']], [[['a', 'd'], ['n', 't']], [' ', 'e']]] >>> d = generate_code(huff_tree) >>> d {'a': '1000', ' ': '110', 'e': '111', 'd': '1001', 'i': '010', 'l': '00', 'n': '1010', 't': '1011', 'v': '011'} #d is a dictionary that maps every charachter to a binary string (the code for the character) #Now we can use this dictionary to compress different text strings >>> text1 = "tell" >>> compress(text1, d) '10111110000' >>> len(compress(text1, d)) 11 #4+3+2+2 >>> unicode_conversion = str_to_bin("tell") #when we convert every character to 7 bits >>> len(unicode_conversion) 28 #4*7 #we can create a decoding dictionary from d and use it to decompress binary strings (that were compressed using d) >>> e = build_decoding_dict(d) >>> e {'00': 'l', '010': 'i', '011': 'v', '111': 'e', '110': ' ', '1010': 'n', '1011': 't', '1001': 'd', '1000': 'a'} >>> decompress('10111110000', e) 'tell' #Another example in which all characters are in d >>> text2 = "tel aviv" >>> compress(text2, d) '1011111001101000011010011' >>> str_to_bin(text2) '11101001100101110110001000001100001111011011010011110110' >>> len(compress(text2, d)) 25 >>> len(str_to_bin(text2)) 56