Huffman Coding
Description
Huffman coding is a data compression technique that uses variable-length codes to represent characters based on their frequency of occurrence. More frequent characters are assigned shorter codes, while less frequent characters receive longer codes, resulting in efficient data compression.
History

Huffman coding was developed by David A. Huffman while he was a graduate student at MIT in 1952. The algorithm came about as a result of a term paper assignment in an Information Theory class taught by Robert M. Fano. Fano had been working on a similar problem with Claude Shannon, and had developed the Shannon-Fano coding. When given the assignment, Huffman worked on the problem for months until he finally came up with a more efficient solution than the existing Shannon-Fano coding. His professor, Robert Fano, was impressed and helped Huffman publish his findings. The paper, "A Method for the Construction of Minimum-Redundancy Codes", was published in the Proceedings of the IRE in 1952 and became one of the most cited papers in computer science. Today, Huffman coding is used in various file compression formats including JPEG, PNG, and MP3, and serves as a fundamental building block in many modern data compression algorithms.
How It Works
- Calculate the frequency of each character in the input
- Create a leaf node for each character and add it to a priority queue
- While there is more than one node in the queue:
  - Remove the two nodes with the lowest frequency
  - Create a new internal node with these two nodes as children
  - Add the new node back to the queue
- The remaining node is the root of the Huffman tree
- Traverse the tree to assign codes (left=0, right=1)
Visualization
| Character | Frequency | Code |
|-----------|-----------|------|
Enter text and click Start
Advantages and Disadvantages
Advantages
- Provides optimal prefix codes
- Lossless compression
- Simple and efficient implementation
- Widely used in practice
Disadvantages
- Requires two passes over the data
- Need to store the tree structure
- Not optimal for adaptive compression
- May not compress well with uniform frequency distribution
Implementation
from heapq import heappush, heappop
from collections import defaultdict
class HuffmanNode:
    """Node of a Huffman tree: leaves hold a character, internal nodes hold None."""

    def __init__(self, char, freq):
        # `char` is None for internal (merged) nodes.
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None

    def __lt__(self, other):
        # Ordered by frequency so nodes can live directly in a heapq min-heap.
        return self.freq < other.freq
def build_huffman_tree(text):
    """Build a Huffman tree for ``text`` and return its root (None for empty input)."""
    # Tally how often each character occurs.
    frequency = defaultdict(int)
    for symbol in text:
        frequency[symbol] += 1

    # Seed a min-heap with one leaf node per distinct character.
    heap = []
    for symbol, count in frequency.items():
        heappush(heap, HuffmanNode(symbol, count))

    # Repeatedly merge the two least-frequent subtrees until one tree remains.
    while len(heap) > 1:
        first = heappop(heap)
        second = heappop(heap)
        merged = HuffmanNode(None, first.freq + second.freq)
        merged.left = first
        merged.right = second
        heappush(heap, merged)

    return heap[0] if heap else None
def generate_codes(root, code="", codes=None):
    """Walk the Huffman tree and map each character to its bit string.

    Left edges append "0", right edges append "1".

    Args:
        root: Root of the (sub)tree, or None.
        code: Bit-string prefix accumulated on the path down to ``root``.
        codes: Dict being populated; created on the outermost call.

    Returns:
        Dict mapping character -> bit string.
    """
    if codes is None:
        codes = {}
    if root is None:
        return codes
    if root.char is not None:
        # Leaf: record its code.  Fall back to "0" when the whole tree is a
        # single leaf (one unique character), so the code is never empty.
        codes[root.char] = code or "0"
        return codes
    generate_codes(root.left, code + "0", codes)
    generate_codes(root.right, code + "1", codes)
    return codes
def huffman_encoding(text):
    """Huffman-encode ``text``.

    Returns:
        (encoded_bits, codes) where ``codes`` maps char -> bit string.
        Empty input yields ("", None).
    """
    # Nothing to encode.
    if not text:
        return "", None

    # Degenerate tree: one unique character is assigned the code "0".
    if len(set(text)) == 1:
        return "0" * len(text), {text[0]: "0"}

    # Build the tree, derive the code table, then translate the text.
    root = build_huffman_tree(text)
    codes = generate_codes(root)
    bits = "".join(codes[symbol] for symbol in text)
    return bits, codes
def huffman_decoding(encoded_text, codes):
    """Decode a Huffman-encoded bit string using the char -> code table."""
    if not encoded_text:
        return ""

    # Invert the table: bit string -> character.  Prefix-freeness of Huffman
    # codes guarantees the greedy match below is unambiguous.
    by_code = {bits: symbol for symbol, bits in codes.items()}

    decoded = []
    buffer = ""
    for bit in encoded_text:
        buffer += bit
        if buffer in by_code:
            decoded.append(by_code[buffer])
            buffer = ""
    return "".join(decoded)
# Example usage: round-trip "hello world" through the encoder and decoder.
if __name__ == "__main__":
    sample = "hello world"
    print(f"Original text: {sample}")

    encoded, codes = huffman_encoding(sample)

    print(f"\nHuffman Codes:")
    for symbol, bits in codes.items():
        print(f"'{symbol}': {bits}")

    print(f"\nEncoded text: {encoded}")

    decoded = huffman_decoding(encoded, codes)
    print(f"Decoded text: {decoded}")
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>
#include <iostream>
// A node of the Huffman tree.  Leaves carry a real character; internal
// (merged) nodes are constructed with '\0' and distinguished by having
// children.
class HuffmanNode {
public:
    char character;
    int frequency;
    HuffmanNode* left;
    HuffmanNode* right;

    HuffmanNode(char c, int f)
        : character(c), frequency(f), left(nullptr), right(nullptr) {}
};
struct CompareNodes {
bool operator()(HuffmanNode* a, HuffmanNode* b) {
return a->frequency > b->frequency;
}
};
class HuffmanCoding {
private:
HuffmanNode* root;
std::unordered_map<char, std::string> codes;
void generateCodes(HuffmanNode* node, std::string code) {
if (!node) return;
if (!node->left && !node->right) {
codes[node->character] = code;
return;
}
generateCodes(node->left, code + "0");
generateCodes(node->right, code + "1");
}
void cleanup(HuffmanNode* node) {
if (!node) return;
cleanup(node->left);
cleanup(node->right);
delete node;
}
public:
HuffmanCoding() : root(nullptr) {}
~HuffmanCoding() { cleanup(root); }
std::pair<std::string, std::unordered_map<char, std::string>>
encode(const std::string& text) {
// Calculate frequencies
std::unordered_map<char, int> freq;
for (char c : text) freq[c]++;
// Create min heap
std::priority_queue<HuffmanNode*, std::vector<HuffmanNode*>,
CompareNodes> minHeap;
for (const auto& pair : freq) {
minHeap.push(new HuffmanNode(pair.first, pair.second));
}
// Build Huffman tree
while (minHeap.size() > 1) {
HuffmanNode* left = minHeap.top(); minHeap.pop();
HuffmanNode* right = minHeap.top(); minHeap.pop();
HuffmanNode* internal = new HuffmanNode('\0',
left->frequency + right->frequency);
internal->left = left;
internal->right = right;
minHeap.push(internal);
}
root = minHeap.top();
// Generate codes
codes.clear();
generateCodes(root, "");
// Encode text
std::string encoded;
for (char c : text) {
encoded += codes[c];
}
return {encoded, codes};
}
std::string decode(const std::string& encoded,
const std::unordered_map<char, std::string>& codes) {
// Create reverse lookup
std::unordered_map<std::string, char> reverseMap;
for (const auto& pair : codes) {
reverseMap[pair.second] = pair.first;
}
std::string current;
std::string decoded;
for (char bit : encoded) {
current += bit;
if (reverseMap.count(current)) {
decoded += reverseMap[current];
current.clear();
}
}
return decoded;
}
};
// Demo: round-trip "hello world" through the Huffman encoder/decoder.
int main() {
    const std::string text = "hello world";
    std::cout << "Original text: " << text << std::endl;

    HuffmanCoding huffman;
    auto [encoded, codes] = huffman.encode(text);

    std::cout << "\nHuffman Codes:" << std::endl;
    for (const auto& [symbol, code] : codes) {
        std::cout << "'" << symbol << "': " << code << std::endl;
    }

    std::cout << "\nEncoded text: " << encoded << std::endl;

    const std::string decoded = huffman.decode(encoded, codes);
    std::cout << "Decoded text: " << decoded << std::endl;
    return 0;
}
using System;
using System.Collections.Generic;
using System.Linq;
// A node of the Huffman tree.  Leaves carry a character; internal (merged)
// nodes have Character == null.
public class HuffmanNode : IComparable<HuffmanNode>
{
    public char? Character { get; set; }
    public int Frequency { get; set; }
    public HuffmanNode Left { get; set; }
    public HuffmanNode Right { get; set; }

    public HuffmanNode(char? c, int freq)
    {
        Character = c;
        Frequency = freq;
        Left = null;
        Right = null;
    }

    // Order nodes by frequency only; nodes with equal frequency compare equal.
    public int CompareTo(HuffmanNode other) => Frequency.CompareTo(other.Frequency);
}
// Huffman encoder/decoder.
//
// Fixes over the previous revision:
//  - `var internal = ...` used a reserved C# keyword and did not compile;
//    the merged node is now named `parent`.
//  - SortedSet<HuffmanNode> silently rejects elements that compare equal
//    (CompareTo only looks at Frequency), so distinct characters sharing a
//    frequency were dropped from the tree.  A plain List with explicit
//    min-extraction is used instead.
//  - Empty input and single-unique-character input are handled explicitly.
public class HuffmanCoding
{
    private HuffmanNode root;
    private Dictionary<char, string> codes;

    public HuffmanCoding()
    {
        root = null;
        codes = new Dictionary<char, string>();
    }

    // Remove and return the lowest-frequency node from the working list.
    private static HuffmanNode PopMin(List<HuffmanNode> nodes)
    {
        int minIndex = 0;
        for (int i = 1; i < nodes.Count; i++)
        {
            if (nodes[i].Frequency < nodes[minIndex].Frequency) minIndex = i;
        }
        var node = nodes[minIndex];
        nodes.RemoveAt(minIndex);
        return node;
    }

    // Depth-first walk assigning "0" for left edges and "1" for right edges.
    // A root that is itself a leaf (one unique character) gets the code "0"
    // so its encoding is non-empty.
    private void GenerateCodes(HuffmanNode node, string code)
    {
        if (node == null) return;
        if (node.Character.HasValue)
        {
            codes[node.Character.Value] = code.Length == 0 ? "0" : code;
            return;
        }
        GenerateCodes(node.Left, code + "0");
        GenerateCodes(node.Right, code + "1");
    }

    // Encode `text`; returns the bit string and a copy of the code table.
    public (string encoded, Dictionary<char, string> codes) Encode(string text)
    {
        codes.Clear();
        root = null;
        if (string.IsNullOrEmpty(text))
        {
            return ("", new Dictionary<char, string>(codes));
        }

        // Calculate frequencies.
        var frequencies = text.GroupBy(c => c)
                              .ToDictionary(g => g.Key, g => g.Count());

        // One leaf per distinct character.
        var nodes = new List<HuffmanNode>();
        foreach (var pair in frequencies)
        {
            nodes.Add(new HuffmanNode(pair.Key, pair.Value));
        }

        // Merge the two cheapest subtrees until a single root remains.
        while (nodes.Count > 1)
        {
            var left = PopMin(nodes);
            var right = PopMin(nodes);
            var parent = new HuffmanNode(null, left.Frequency + right.Frequency)
            {
                Left = left,
                Right = right
            };
            nodes.Add(parent);
        }
        root = nodes[0];

        // Generate codes and translate the text.
        GenerateCodes(root, "");
        var encoded = string.Concat(text.Select(c => codes[c]));
        return (encoded, new Dictionary<char, string>(codes));
    }

    // Decode a bit string produced by Encode(), given the same code table.
    public string Decode(string encoded, Dictionary<char, string> codes)
    {
        if (string.IsNullOrEmpty(encoded)) return "";

        // Invert the table: bit string -> character.
        var reverseMap = codes.ToDictionary(pair => pair.Value, pair => pair.Key);

        var current = "";
        var decoded = new List<char>();
        foreach (char bit in encoded)
        {
            current += bit;
            if (reverseMap.TryGetValue(current, out char symbol))
            {
                decoded.Add(symbol);
                current = "";
            }
        }
        return new string(decoded.ToArray());
    }
}
// Demo: round-trip "hello world" through the Huffman encoder/decoder.
public class Program
{
    public static void Main()
    {
        string text = "hello world";
        Console.WriteLine($"Original text: {text}");

        var huffman = new HuffmanCoding();
        var (encoded, codes) = huffman.Encode(text);

        Console.WriteLine("\nHuffman Codes:");
        foreach (var (symbol, code) in codes)
        {
            Console.WriteLine($"'{symbol}': {code}");
        }

        Console.WriteLine($"\nEncoded text: {encoded}");

        string decoded = huffman.Decode(encoded, codes);
        Console.WriteLine($"Decoded text: {decoded}");
    }
}
Complexity Analysis
Time Complexity
The overall time complexity of Huffman coding is O(m + n log n), where m is the length of the input text and n is the number of unique characters. This can be broken down into:
- Building Frequency Table: O(m)
  - Single pass through the input text (m = length of input text)
- Building Huffman Tree: O(n log n)
  - Each priority queue operation costs O(log n)
  - n - 1 merge operations are performed
- Generating Codes: O(n)
  - The tree is traversed once, visiting each leaf character
- Encoding: O(m)
  - Single pass through the text (m = length of input text)
Space Complexity
The space complexity is O(n), where n is the number of unique characters:
- Frequency Table: O(n)
  - Stores a frequency for each unique character
- Huffman Tree: O(n)
  - One leaf node per unique character
  - At most n - 1 internal nodes created by merging
- Code Table: O(n)
  - Stores a binary code for each unique character
- Output String: O(m)
  - m = length of the encoded/decoded text