Citerus Programming Challenge at JFokus 2011

by Peter Jaric

The company Citerus held a programming competition at the JFokus 2011 developer conference. I was one of the more than 100 contestants. Read more about it at http://www.citerus.se/post/vinnare (in Swedish).

I have included both the winning solution from Carin Lidberg and my own solution below. They are quite different :)

The winning solution:

public class CiterusChallenge {
  static String[] substrings =
    {"TDD", "DDD", "DI", "DO", "OO", "UI",
     "ANT", "CV", "IOC", "LOC", "SU", "VO"};

  public static void main(String[] args) {
    System.out.println(removeSubstringsToTheRight(
        "VOCDIITEIOCRUDOIANTOCSLOIOCVESTAIOCVOLIOCENTSU", 0));
  }

  public static String removeSubstringsToTheRight(String s, int start) {
    int minimum = s.length();
    String result = s;
    for (String substr : substrings) {
      int i = s.indexOf(substr);
      while (i >= 0) {
        if (i + substr.length() > start) {
          String newString = s.substring(0, i) +
                s.substring(i + substr.length());
          newString = removeSubstringsToTheRight(newString, i);
          if (newString.length() < minimum) {
            minimum = newString.length();
            result = newString;
          }
        }
        i = s.indexOf(substr, i + 1);
      }
    }
    return result;
  }
}

And here is my solution. The shortenString method was recursive up to before this version, but I finally made it iterative for performance reasons.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This class attempts to solve the Citerus JFokus 2011 Challenge
 * (http://www.citerus.se/jfokus).
 * 
 * Run this program like this:
 * java Shortener   [ [...]]
 *
 * The example input given in the challenge is:
 * 
 * The string we want to shorten:
 * VOCDIITEIOCRUDOIANTOCSLOIOCVESTAIOCVOLIOCENTSU
 * Words that can be removed:
 * TDD, DDD, DI, DO, OO, UI, ANT, CV, IOC, LOC, SU, VO
 *
 * The algorithm basically does an exhaustive search for all possible
 * ways to remove the given words from the input string, but with a few 
 * optimizations (all timings based on a few test runs on my computer):
 *
 * 1) Cache
 * Before trying to remove words we check if it is already done for
 * this particular substring. If it is, we just skip it.
 *
 * 2) Divide and conquer
 * The input string might contain characters that can not be removed since
 * they do not exist in any of the words. We can used these characters as
 * delimiters between which we can extract substrings to shorten, one at a
 * time. This optimization can be very effective if the input string has
 * such characters. Compared to the version above, this runs about 
 * 35 - 40 times faster on the example input (~2.3ms vs ~85ms).
 * 
 * There is at least one more optimization that could be added (in a future 
 * version), however it wouldn't have any effect on the example input, since
 * it doesn't meet the requirements:
 * 
 * 3) Safely removable words
 * Words that do not overlap with themselves or any other words (DO and OO
 * overlap, IOC and LOC does not) can always be removed directly from a 
 * string that is about to be shortened. This is because of that since no
 * other word may remove characters from substrings that match these 
 * non-overlapping words (NOWs) the substrings will stay in the string until
 * they are removed by the NOW, even far down in the search tree. Cutting 
 * them out directly will make the search tree smaller and the search quicker.
 * 
 * Known issues:
 * For large inputs the algorithm will take quite some time.
 * For very large inputs the VM will run out of heap space.
 * 
 * CAVEAT:
 * This class does not attempt to be general in any other way than that it
 * accepts different inputs. Specifically, it isn't meant to be subclassed
 * or used from other code and hence it is put in the default package for
 * simplicity. For the same reasons it contains a main method.
 * 
 * @author Peter Jaric 
 * @version 6
 */
public class Shortener {
  /**
	 * Words that can be removed.
	 */
	private String[] allWords;
	
	/**
	 * Creates a Shortener that can be run on different input strings
	 * with the given words.
	 * 
	 * @param words to remove from strings
	 */
	public Shortener(String[] words) {
		this.allWords = words;
	}
	
	/**
	 * Finds the shortest string possible by removing this Shortener's words
	 * in the most advantageous order. 
	 * 
	 * @param input The string to shorten
	 * @return The shortest possible string
	 */
	public String shorten(String input) {
		// Implementation of the divide and conquer optimization follows:

		// Find all possible characters that can be removed by
		// simply concatenating all words
		String possibleChars = "";
		for (String word : allWords) {
			possibleChars += word;
		}
			
		// Create a (quite ugly) regexp that matches possible removals
		Pattern p = Pattern.compile("[" + possibleChars + "]+");
	
		// Start matching on the input string
		String todo = input;
		Matcher m = p.matcher(todo);
		String done = "";
		
		// For each match of possible removals, shorten it and save it
		while (m.find()) {
			// Find the match
			String match = m.group();
			
			// Add the shortened match together with any possible non-matching
			// stuff before it to what we have done already
			done += todo.substring(0, m.start()) + shortenString(match);
			
			// Shorten the todo string with what we already have done.
			todo = todo.substring(m.end());
			
			// Tell the matcher to use the todo string in next iteration
			m.reset(todo);
		}

		// There might be some unmatchable stuff left at the end. Add it
		// if so.
		if (!todo.equals("")) {
			done += todo;
		}
    
		// End of implementation of the divide and conquer optimization

		return done;
	}
	
	/**
	 * Finds the shortest string possible by removing this Shortener's words.
	 *  
	 * @param input The string to shorten
	 * @return The shortest possible string
	 */
	private String shortenString(String input) {
	  // The list of strings to process, i.e. to 
	  // shorten. We are using an ArrayList for 
	  // performance reasons.
	  List toProcess = new ArrayList();

    // Set to hold strings that we have added to the list.
    // Note that all strings will go here, even those 
    // that can not be shortened further.
	  // This implements the cache optimization.
    HashSet seen = new HashSet();
    
    // String builder to use for removing words inside strings
    StringBuilder sb = new StringBuilder();
    
    // Index of the current string to process.
    int marker;
    
    // Seed the list with the input string
    toProcess.add(input);
    seen.add(input);
    marker = 0;

    // Work through all strings in the queue until all are 
    // processed. Shorter strings will be added to the list as we 
    // go along.
    while (marker < toProcess.size()) {
      // Get the string to process.
      String current = toProcess.get(marker);
      // Move the marker one step forward.
      marker++;
      
      // Loop through all ways of shortening the string
      // by removing one of the words.
      for (String word : allWords) {
        // Loop through all matches for the word in the input string, remove
        // the word, and add the result to the list.
        for (int i = current.indexOf(word); i != -1; i = current.indexOf(word, i + 1)) {
          // Remove the word
          sb.append(current.substring(0, i));
          sb.append(current.substring(i + word.length()));
          String shorter = sb.toString();
          
          // If the shorter string hasn't already been
          // seen, add it to the list.
          if (!seen.contains(shorter)) {
            seen.add(shorter);
            toProcess.add(shorter);
          } 
          
          // Clear the string builder for the next loop.
          sb.setLength(0);
        }
      }    
    }
    
    // Loop through all strings that have been 
    // processed and select the shortest one.
    // Start with the longest one.
    String shortest = input;
    for (String current : seen) {
      if (current.length() < shortest.length()) {
        shortest = current;
      }
    }
    return shortest;
	}
	
	/**
	 * Main method...
	 */
	public static void main(String[] args) {
		// Check arguments
		if (args.length < 2) {
			System.err.println("Too few arguments.");
			System.exit(1);
		}
		
		// Extract input string and words from arguments
		String input = args[0];
		String[] words = Arrays.copyOfRange(args, 1, args.length);
	
		// Create Shortener, shorten string and print result
		Shortener shortener = new Shortener(words);
		String shortest = shortener.shorten(input);
	}
}