For the impatient, the full implementation is at the bottom of this page.
Huffman Encoding is an easy to understand, and efficient data compression algorithm (in this example, ~75% compression ratio is attained). On this page we go through a working implementation. I wrote it in a couple hours whilst watching this great lecture on Huffman Encoding; needless to say this is for educational purposes only.
Please note, it uses the third-party bitarray python module.
Generate the Huffman tree

The first step with Huffman encoding is to generate the encoding tree. This is done by taking the counts of each letter, and creating a hierarchy of counts.
from collections import Counter
from collections import namedtuple
from bitarray import bitarray
node = namedtuple('node', 'char count left right')
hcode = namedtuple('hcode', 'char count code')
def gen_huffman_tree(message):
counts = Counter(message)
htree = {node(x[0], x[1], 0, 0) for x in sorted(counts.items(), key=lambda x: x[1])}
huff_min = lambda: min(htree, key=lambda x: x.count)
while len(htree) > 1:
a = huff_min()
htree.remove(a)
b = huff_min()
htree.remove(b)
htree.add(node('', a.count + b.count, a, b))
return next(iter(htree)), len(counts)
def main():
# this is the message we want to encode
message = 'BCCABBDDAECCBBAEDDCC'
# compute the huffman tree
htree, count = gen_huffman_tree(message)
print(htree)
if __name__ == '__main__':
main()
Generate the Huffman Codes
def gen_huffman_codes(htree, count):
code = [0]*count
hcodes = {}
def gen_codes(htree, depth=0):
if htree.left:
code[depth] = 0
gen_codes(htree.left, depth+1)
if htree.right:
code[depth] = 1
gen_codes(htree.right, depth+1)
is_leaf = htree.left == htree.right == 0
if is_leaf:
hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
gen_codes(htree)
return hcodes
def print_huffman_codec(hcodes):
for c, v in hcodes.items():
print(v)
def main():
# this is the message we want to encode
message = 'BCCABBDDAECCBBAEDDCC'
# compute the huffman tree
htree, count = gen_huffman_tree(message)
# compute our huffman encoder
hcodes = gen_huffman_codes(htree, count)
print_huffman_codec(hcodes)
Output:
hcode(char='D', count=4, code=bitarray('00'))
hcode(char='E', count=2, code=bitarray('010'))
hcode(char='A', count=3, code=bitarray('011'))
hcode(char='B', count=5, code=bitarray('10'))
hcode(char='C', count=6, code=bitarray('11'))
Compress the Message
Next we’ll want to encode the actual message.
def encode_message(message, hcodes):
huff_encoding = bitarray()
for c in message:
huff_encoding.extend(hcodes[c].code)
return huff_encoding
def main():
# this is the message we want to encode
message = 'BCCABBDDAECCBBAEDDCC'
# compute the huffman tree
htree, count = gen_huffman_tree(message)
# compute our huffman encoder
hcodes = gen_huffman_codes(htree, count)
# print our huffman codec
print_huffman_codec(hcodes)
# compress the message into our huffman encoding
huff_encoding = encode_message(message, hcodes)
Output:
bitarray('101111011101000000110101111101001101000001111')
Decompress the Message
Next we’ll want to decode our Huffman encoded message. For that we simply perform a traveral, while following the directional rules of our encoded message. When we hit a leaf, that’s the character we are after.
def decode_huffman_encoding(htree, huff_encoding):
def decode(depth, htree):
is_leaf = htree.left == htree.right == 0
if is_leaf:
offset[0] += depth
decompressed_message.append(htree.char)
else:
bit = huff_encoding[depth+offset[0]]
if htree.left and not bit:
decode(depth+1, htree.left)
elif htree.right and bit:
decode(depth+1, htree.right)
decompressed_message = []
offset = [0]
while offset[0] < len(huff_encoding):
decode(0, htree)
decompressed_message = ''.join(decompressed_message)
return decompressed_message
def main():
# this is the message we want to encode
message = 'BCCABBDDAECCBBAEDDCC'
# compute the huffman tree
htree, count = gen_huffman_tree(message)
# compute our huffman encoder
hcodes = gen_huffman_codes(htree, count)
# print our huffman codec
print_huffman_codec(hcodes)
# compress the message into our huffman encoding
huff_encoding = encode_message(message, hcodes)
# decompress our huffman encoding back into a string
decompressed_message = decode_huffman_encoding(htree, huff_encoding)
print(decompressed_message)
Output:
BCCABBDDAECCBBAEDDCC
Printing some stats
The final step is to print some stats. Please note, i did not include the step of including the hcodes and htree into a final output, which would have indeed lead to a slight drop in the compression ratio.
def main():
# this is the message we want to encode
message = 'BCCABBDDAECCBBAEDDCC'
# compute the huffman tree
htree, count = gen_huffman_tree(message)
# compute our huffman encoder
hcodes = gen_huffman_codes(htree, count)
# print our huffman codec
print_huffman_codec(hcodes)
# compress the message into our huffman encoding
huff_encoding = encode_message(message, hcodes)
# decompress our huffman encoding back into a string
decompressed_message = decode_huffman_encoding(htree, huff_encoding)
# print stats and sanity check
print(f'Original message: {message}')
print(f'Huffman Encoded Message: {huff_encoding}')
print(f'Original Size: {len(message)*8} bits.')
print(f'New Size: {len(huff_encoding)} bits.')
print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
print(f'decompressed message: {decompressed_message}')
print(f'decompression success: {message == decompressed_message}')
Output:
Original message: BCCABBDDAECCBBAEDDCC
Huffman Encoded Message: bitarray('101111011101000000110101111101001101000001111')
Original Size: 160 bits.
New Size: 45 bits.
Compression: 71.875%.
decompressed message: BCCABBDDAECCBBAEDDCC
decompression success: True
Calculating Encoding Size
Using the codes
The size of the final encoding can be calculated from the hcode var:

It can also be calculated with with htree var:

Where di
is the depth of the leaf, and fi
is the count for the leaf.
Putting it all together
Here’s the full implementation for your fun and enjoyment.
from collections import Counter
from collections import namedtuple
from bitarray import bitarray
node = namedtuple('node', 'char count left right')
hcode = namedtuple('hcode', 'char count code')
def gen_huffman_tree(message):
counts = Counter(message)
htree = {node(x[0], x[1], 0, 0) for x in sorted([*counts.items()], key=lambda x: x[1])}
huff_min = lambda: min(htree, key=lambda x: x.count)
while len(htree) > 1:
a = huff_min()
htree.remove(a)
b = huff_min()
htree.remove(b)
htree.add(node('', a.count + b.count, a, b))
return next(iter(htree)), len(counts)
def gen_huffman_codes(htree, count):
code = [0]*count
hcodes = {}
def gen_codes(htree, depth=0):
if htree.left:
code[depth] = 0
gen_codes(htree.left, depth+1)
if htree.right:
code[depth] = 1
gen_codes(htree.right, depth+1)
is_leaf = htree.left == htree.right == 0
if is_leaf:
hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
gen_codes(htree)
return hcodes
def decode_huffman_encoding(htree, huff_encoding):
def decode(depth, htree):
is_leaf = htree.left == htree.right == 0
if is_leaf:
offset[0] += depth
decompressed_message.append(htree.char)
else:
bit = huff_encoding[depth+offset[0]]
if htree.left and not bit:
decode(depth+1, htree.left)
elif htree.right and bit:
decode(depth+1, htree.right)
decompressed_message = []
offset = [0]
while offset[0] < len(huff_encoding):
decode(0, htree)
decompressed_message = ''.join(decompressed_message)
return decompressed_message
def print_huffman_codec(hcodes):
for c, v in hcodes.items():
print(v)
def encode_message(message, hcodes):
huff_encoding = bitarray()
for c in message:
huff_encoding.extend(hcodes[c].code)
return huff_encoding
def main():
# this is the message we want to encode
message = 'BCCABBDDAECCBBAEDDCC'
# compute the huffman tree
htree, count = gen_huffman_tree(message)
# compute our huffman encoder
hcodes = gen_huffman_codes(htree, count)
# print our huffman codec
print_huffman_codec(hcodes)
# compress the message into our huffman encoding
huff_encoding = encode_message(message, hcodes)
# decompress our huffman encoding back into a string
decompressed_message = decode_huffman_encoding(htree, huff_encoding)
# print stats and sanity check
print(f'Original message: {message}')
print(f'Huffman Encoded Message: {huff_encoding}')
print(f'Original Size: {len(message)*8} bits.')
print(f'New Size: {len(huff_encoding)} bits.')
print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
print(f'decompressed message: {decompressed_message}')
print(f'decompression success: {message == decompressed_message}')
if __name__ == '__main__':
main()
Related Posts:
Post by Shyal Beardsley – https://ioloop.io, https://shyal.com