Home

For the impatient, the full implementation is at the bottom of this page.

Huffman Encoding is an easy to understand, and efficient data compression algorithm (in this example, ~75% compression ratio is attained). On this page we go through a working implementation. I wrote it in a couple hours whilst watching this great lecture on Huffman Encoding; needless to say this is for educational purposes only.

Please note, it uses the third-party bitarray python module.

Generate the Huffman tree

The first step with Huffman encoding is to generate the encoding tree. This is done by taking the counts of each letter, and creating a hierarchy of counts.

from collections import Counter
from collections import namedtuple
from bitarray import bitarray

node = namedtuple('node', 'char count left right')
hcode = namedtuple('hcode', 'char count code')

def gen_huffman_tree(message):
    counts = Counter(message)
    htree = {node(x[0], x[1], 0, 0) for x in sorted(counts.items(), key=lambda x: x[1])}
    huff_min = lambda: min(htree, key=lambda x: x.count)
    while len(htree) > 1:
        a = huff_min()
        htree.remove(a)
        b = huff_min()
        htree.remove(b)
        htree.add(node('', a.count + b.count, a, b))
    return next(iter(htree)), len(counts)

def main():
    # this is the message we want to encode
    message  = 'BCCABBDDAECCBBAEDDCC'
    
    # compute the huffman tree
    htree, count = gen_huffman_tree(message)

    print(htree)

if __name__ == '__main__':
    main()

Generate the Huffman Codes

def gen_huffman_codes(htree, count):
    code = [0]*count
    hcodes = {}
    def gen_codes(htree, depth=0):
        if htree.left:
            code[depth] = 0
            gen_codes(htree.left, depth+1)
        if htree.right:
            code[depth] = 1
            gen_codes(htree.right, depth+1)
        is_leaf = htree.left == htree.right == 0
        if is_leaf:
            hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
    gen_codes(htree)
    return hcodes

def print_huffman_codec(hcodes):
    for c, v in hcodes.items():
        print(v)

def main():
    # this is the message we want to encode
    message  = 'BCCABBDDAECCBBAEDDCC'
    
    # compute the huffman tree
    htree, count = gen_huffman_tree(message)

    # compute our huffman encoder
    hcodes = gen_huffman_codes(htree, count)
	
    print_huffman_codec(hcodes)

Output:

hcode(char='D', count=4, code=bitarray('00'))
 hcode(char='E', count=2, code=bitarray('010'))
 hcode(char='A', count=3, code=bitarray('011'))
 hcode(char='B', count=5, code=bitarray('10'))
 hcode(char='C', count=6, code=bitarray('11'))

Compress the Message

Next we’ll want to encode the actual message.

def encode_message(message, hcodes):
    huff_encoding = bitarray()
    for c in message:
        huff_encoding.extend(hcodes[c].code)
    return huff_encoding


def main():
    # this is the message we want to encode
    message  = 'BCCABBDDAECCBBAEDDCC'
    
    # compute the huffman tree
    htree, count = gen_huffman_tree(message)

    # compute our huffman encoder
    hcodes = gen_huffman_codes(htree, count)

    # print our huffman codec
    print_huffman_codec(hcodes)

    # compress the message into our huffman encoding
    huff_encoding = encode_message(message, hcodes)

Output:

bitarray('101111011101000000110101111101001101000001111')

Decompress the Message

Next we’ll want to decode our Huffman encoded message. For that we simply perform a traveral, while following the directional rules of our encoded message. When we hit a leaf, that’s the character we are after.

def decode_huffman_encoding(htree, huff_encoding):
    def decode(depth, htree):
        is_leaf = htree.left == htree.right == 0
        if is_leaf:
            offset[0] += depth
            decompressed_message.append(htree.char)
        else:
            bit = huff_encoding[depth+offset[0]]
            if htree.left and not bit:
                decode(depth+1, htree.left)
            elif htree.right and bit:
                decode(depth+1, htree.right)

    decompressed_message = []
    offset = [0]

    while offset[0] < len(huff_encoding):
        decode(0, htree)

    decompressed_message = ''.join(decompressed_message)
    return decompressed_message

def main():
    # this is the message we want to encode
    message  = 'BCCABBDDAECCBBAEDDCC'
    
    # compute the huffman tree
    htree, count = gen_huffman_tree(message)

    # compute our huffman encoder
    hcodes = gen_huffman_codes(htree, count)

    # print our huffman codec
    print_huffman_codec(hcodes)

    # compress the message into our huffman encoding
    huff_encoding = encode_message(message, hcodes)

    # decompress our huffman encoding back into a string
    decompressed_message = decode_huffman_encoding(htree, huff_encoding)
	
    print(decompressed_message)

Output:

BCCABBDDAECCBBAEDDCC

Printing some stats

The final step is to print some stats. Please note, i did not include the step of including the hcodes and htree into a final output, which would have indeed lead to a slight drop in the compression ratio.

def main():
    # this is the message we want to encode
    message  = 'BCCABBDDAECCBBAEDDCC'
    
    # compute the huffman tree
    htree, count = gen_huffman_tree(message)

    # compute our huffman encoder
    hcodes = gen_huffman_codes(htree, count)

    # print our huffman codec
    print_huffman_codec(hcodes)

    # compress the message into our huffman encoding
    huff_encoding = encode_message(message, hcodes)

    # decompress our huffman encoding back into a string
    decompressed_message = decode_huffman_encoding(htree, huff_encoding)

    # print stats and sanity check
    print(f'Original message: {message}')
    print(f'Huffman Encoded Message: {huff_encoding}')
    print(f'Original Size: {len(message)*8} bits.')
    print(f'New Size: {len(huff_encoding)} bits.')
    print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
    print(f'decompressed message: {decompressed_message}')
    print(f'decompression success: {message == decompressed_message}')

Output:

Original message: BCCABBDDAECCBBAEDDCC
 Huffman Encoded Message: bitarray('101111011101000000110101111101001101000001111')
 Original Size: 160 bits.
 New Size: 45 bits.
 Compression: 71.875%.
 decompressed message: BCCABBDDAECCBBAEDDCC
 decompression success: True

Calculating Encoding Size

Using the codes

The size of the final encoding can be calculated from the hcode var:

It can also be calculated with with htree var:

Where di is the depth of the leaf, and fi is the count for the leaf.

Putting it all together

Here’s the full implementation for your fun and enjoyment.

from collections import Counter
from collections import namedtuple
from bitarray import bitarray

node = namedtuple('node', 'char count left right')
hcode = namedtuple('hcode', 'char count code')

def gen_huffman_tree(message):
    counts = Counter(message)
    htree = {node(x[0], x[1], 0, 0) for x in sorted([*counts.items()], key=lambda x: x[1])}
    huff_min = lambda: min(htree, key=lambda x: x.count)
    while len(htree) > 1:
        a = huff_min()
        htree.remove(a)
        b = huff_min()
        htree.remove(b)
        htree.add(node('', a.count + b.count, a, b))
    return next(iter(htree)), len(counts)

def gen_huffman_codes(htree, count):
    code = [0]*count
    hcodes = {}
    def gen_codes(htree, depth=0):
        if htree.left:
            code[depth] = 0
            gen_codes(htree.left, depth+1)
        if htree.right:
            code[depth] = 1
            gen_codes(htree.right, depth+1)
        is_leaf = htree.left == htree.right == 0
        if is_leaf:
            hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
    gen_codes(htree)
    return hcodes

def decode_huffman_encoding(htree, huff_encoding):
    def decode(depth, htree):
        is_leaf = htree.left == htree.right == 0
        if is_leaf:
            offset[0] += depth
            decompressed_message.append(htree.char)
        else:
            bit = huff_encoding[depth+offset[0]]
            if htree.left and not bit:
                decode(depth+1, htree.left)
            elif htree.right and bit:
                decode(depth+1, htree.right)

    decompressed_message = []
    offset = [0]

    while offset[0] < len(huff_encoding):
        decode(0, htree)

    decompressed_message = ''.join(decompressed_message)
    return decompressed_message

def print_huffman_codec(hcodes):
    for c, v in hcodes.items():
        print(v)

def encode_message(message, hcodes):
    huff_encoding = bitarray()
    for c in message:
        huff_encoding.extend(hcodes[c].code)
    return huff_encoding

def main():
    # this is the message we want to encode
    message  = 'BCCABBDDAECCBBAEDDCC'
    
    # compute the huffman tree
    htree, count = gen_huffman_tree(message)

    # compute our huffman encoder
    hcodes = gen_huffman_codes(htree, count)

    # print our huffman codec
    print_huffman_codec(hcodes)

    # compress the message into our huffman encoding
    huff_encoding = encode_message(message, hcodes)

    # decompress our huffman encoding back into a string
    decompressed_message = decode_huffman_encoding(htree, huff_encoding)

    # print stats and sanity check
    print(f'Original message: {message}')
    print(f'Huffman Encoded Message: {huff_encoding}')
    print(f'Original Size: {len(message)*8} bits.')
    print(f'New Size: {len(huff_encoding)} bits.')
    print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
    print(f'decompressed message: {decompressed_message}')
    print(f'decompression success: {message == decompressed_message}')

if __name__ == '__main__':
    main()

Related Posts:

Max Array

Flatten a dictionary

Binary Search

Valid Palindrome II

Post by Shyal Beardsleyhttps://ioloop.io, https://shyal.com

Create your website at WordPress.com
Get started