CSC212 Huffman Ecoding in Java

From dftwiki3
Jump to: navigation, search

--D. Thiebaut (talk) 16:10, 10 December 2014 (EST)


import java.util.HashMap;
import java.util.PriorityQueue;

/**
 * Implement a simplified version of Huffman encoding on a string with
 * a given distribution for each character in it.  The chars are the
 * vowels of the latin alphabet.
 * @author thiebaut
 *
 */
public class HuffmanDT {
	static int IdCounter = 0;				// used to number each node with unique Id
	
	/**
	 * the node used to create the Huffman tree
	 * @author thiebaut
	 *
	 */
	static class Node implements Comparable {
		public char letter;				// the letter from the string, or '#' if inner node
		public int Id;					// unique Id (used to generate DOT graph)	
		public int freq;				// counts number of occurrences
		public Node left;				// pointer to left child, if any
		public Node right;				// pointer to right child, if nay
		
		Node( char l, int f, Node lft, Node rt ) {
			Id = IdCounter++;
			letter = l; freq = f; left = lft; right = rt; 
		}
		/**
		 * returns whether node is a leaf.
		 * @return true if leaf, false otherwise.
		 */
		public boolean isLeaf() { return left==null && right==null; }
		@Override
		/**
		 * compareTo: needed because nodes will be kept in priority queue that
		 * keeps node with smallest freq value at the root. 
		 */
		public int compareTo(Object o) {
			return freq - ((Node) o).freq;
		}
	}
	
	/**
	 * retuns a synthetic string of characters (a, e, i, o, u) with a fixed distribution.
	 * @return the string.
	 */
	public static String getStringOfLetters() {
		String s = "";
		for ( int i=0; i<50; i++ ) s += "e";   // 50% of e
		for ( int i=0; i<30; i++ )  s += "a";   // 30% of a
		for ( int i=0; i<10; i++ )  s += "i";   // 10% of i
		for ( int i=0; i<8; i++ )   s += "o";   // 8% of o
		for ( int i=0; i<2; i++ )   s += "u";   // 2% of u
		return s;
	}
	
	/**
	 * computes the frequency associated with each letter found in the string s.
	 * @param s the string containing the characters to encode.
	 * @return a hashMap of the characters and their associated frequencies (as ints)
	 */
	private static HashMap<Character, Integer> getLetterFrequencies( String s ) {
		HashMap<Character, Integer> freq = new HashMap<Character, Integer>();
		for ( int i=0; i< s.length(); i++ ) {
			char c = s.charAt(i);
			if ( freq.containsKey( c ) )
				freq.put( c, freq.get( c )+1 );
			else
				freq.put( c,  1 );
		}
		return freq;
	}
	
	/**
	 * buildHuffmanTree: builds a Huffman trie 
	 * @param freqs the hashmap of frequencies associated with each char.
	 * @return the root of the Huffman tree.
	 */
	private static Node buildHuffmanTree( HashMap<Character, Integer> freqs ) {
		PriorityQueue<Node> prioQ = new PriorityQueue<Node>();
		for ( char c: freqs.keySet() ) 
			prioQ.add( new Node( c, freqs.get(c), null, null ) );
		
		while ( prioQ.size() > 1 ) {
			Node left = prioQ.poll();
			Node right = prioQ.poll();
			prioQ.add( new Node( '#', left.freq + right.freq, left, right ) );
		}
		
		return prioQ.poll();
	}

	/**
	 * generates the DOT representation of the Hoffman tree
	 * @param root
	 */
	private static void generateHuffmanTreeDOT( Node root ) {

		System.out.println( "\ndigraph Huffman {" );
		DotDFS( root );
		System.out.println( "}\n" );
	}
	
	/**
	 * recursive trunk of the DOT generation.
	 * @param node
	 */
	private static void DotDFS( Node node ) {
		if ( node.isLeaf() ) {
			System.out.println(  String.format( "%d [label=\"%s (%d)\"];", 
					node.Id, node.letter, node.freq) );
			return;
		}
		if ( node.left != null ) {
			System.out.println(  String.format( "%d [label=\"%s (%d)\"];", 
					node.Id, node.letter, node.freq) );
			System.out.println( String.format( "%d -> %d [label=\"0\"];", 
					node.Id, node.left.Id ) );
			DotDFS( node.left );
		}
		if ( node.right != null ) {
			System.out.println(  String.format( "%s [label=\"%s (%d)\"];", 
					node.Id, node.letter, node.freq) );
			System.out.println( String.format( "%s -> %s [label=\"1\"];", 
					node.Id, node.right.Id ) );
			DotDFS( node.right );
		}
	}

	/**
	 * gets the string of 0s and 1s associated with each character.  Should normally
	 * be a bit pattern, but is a String for the purpose of simplifying the code.
	 * @param root
	 * @return a hashMap of characters and their associated code.
	 */
	private static HashMap<Character, String> getEncoding(Node root) {
		HashMap<Character, String> encoding = new HashMap<Character, String>();
		DFS( root, "", encoding );
		return encoding;
	}

	/**
	 * Recursively performs a DFS to visit the Huffman trie and assign code to each
	 * leaf. 
	 * @param node
	 * @param code
	 * @param encoding
	 */
	private static void DFS(Node node, String code, HashMap<Character, String> encoding) {
		if ( node.isLeaf() ) 
			encoding.put( node.letter, code );
		else {
			if ( node.left != null ) 
				DFS( node.left, code+"0", encoding );
			if ( node.right != null )
				DFS( node.right, code+"1", encoding );
		}
	}

	/**
	 * MAIN ENTRY POINT
	 * @param args
	 */
	public static void main(String[] args) {
		// create a synthetic string with know distribution of chars
		String s = getStringOfLetters();
		
		HashMap<Character, Integer> freqs = getLetterFrequencies( s );
		Node root = buildHuffmanTree( freqs );
		
		//generateHuffmanTreeDOT( root );
		
		HashMap<Character, String> encoding = getEncoding( root );
		System.out.println( encoding );
		
		int encodedStringLength = 0;
		int sumFreqs = 0;
		for ( char c: encoding.keySet() ) {
			int freq = freqs.get( c );
			sumFreqs += freq;
			String code = encoding.get( c );
			encodedStringLength += freq * code.length();
		}
		System.out.println( "Coding of aeiou with 3 bits/char = " + sumFreqs * 3 + " bits" );
		System.out.println( "Huffman encoding of same chars   = " + encodedStringLength + " bits" );
		System.out.println( "Compression ratio                = " + sumFreqs * 3.0 / encodedStringLength );
	}



}