teaching machines

CS 240: Lecture 35 – Set Brawl

November 15, 2021 by . Filed under cs3, fall-2021, lectures.

Dear students:

Today we close out our discussion of hashing with an empirical brawl. We’ll look at solving a particular problem with a hash set, an unsorted array, a sorted array, an unbalanced binary search tree, and a balanced binary search tree.

The problem we will solve is coming up with a collection of unique words in a body of text. We want to avoid duplicates and will therefore apply a simplistic strategy to add words to the set: no word that appears in the set is a prefix of another word in the set. This won’t catch all such duplicates. The words “mouse” and “mice” will both appear. And it will catch some words that are unrelated. For example, “beard” will cause “bear” to be evicted.

A sophisticated language model is not our goal. Really we want to compare the different tools we have at our disposal for doing fast inserts, contains, and remove operations.

TODO

You have some work to do before the next class:

See you next time.

Sincerely,

P.S. It’s time for a haiku!

Hammer is to screws
As screwdriver is to nails
As I am to bugs

P.P.S. Here’s the code we wrote together.

Suffices.java

package lecture1115.ref;

import utilities.Stopwatch;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;

public class Suffices {
  public static void main(String[] args) throws FileNotFoundException {
//    File file = new File("/Users/twodee/Desktop/foo.txt");
    File file = new File("/Users/twodee/Desktop/hugo.txt");
    findSuffices(file, new HashWordSet(), new HashWordSet(), "hash");
    findSuffices(file, new BalancedTreeWordSet(), new BalancedTreeWordSet(), "balanced bst");
    findSuffices(file, new UnbalancedTreeWordSet(), new UnbalancedTreeWordSet(), "unbalanced bst");
    findSuffices(file, new SortedArrayWordSet(), new SortedArrayWordSet(), "sorted array");
//    findSuffices(file, new lecture1115.ref.UnsortedArrayWordSet(), new lecture1115.ref.UnsortedArrayWordSet(), "unsorted array");
  }

  private static void findSuffices(File file, WordSet set, WordSet prefixes, String label) throws FileNotFoundException {
    Stopwatch timer = new Stopwatch();
    timer.start();
    Scanner in = new Scanner(file);
    while (in.hasNext()) {
      String word = in.next();
      if (!prefixes.contains(word)) {
        for (int i = 1; i < word.length(); ++i) {
          String prefix = word.substring(0, i);
          prefixes.add(prefix);
          set.remove(prefix);
        }
        set.add(word);
      }
    }
    in.close();
    double elapsedSeconds = timer.stop();
    System.out.printf("%s: %f (%d)%n", label, elapsedSeconds, set.size());
  }
}

interface WordSet {
  void add(String word);
  void remove(String word);
  boolean contains(String word);
  int size();
}

class UnsortedArrayWordSet implements WordSet {
  private final ArrayList<String> words;

  public UnsortedArrayWordSet() {
    this.words = new ArrayList<String>();
  }

  @Override
  public void add(String word) {
    if (!words.contains(word)) {
      words.add(word);
    }
  }

  @Override
  public void remove(String word) {
    words.remove(word);
  }

  public boolean contains(String word) {
    return words.contains(word);
  }

  public int size() {
    return words.size();
  }
}

class SortedArrayWordSet implements WordSet {
  private final ArrayList<String> words;

  public SortedArrayWordSet() {
    this.words = new ArrayList<String>();
  }

  @Override
  public void add(String word) {
    int left = 0;
    int right = words.size() - 1;
    while (left <= right) {
      int mid = (left + right) / 2;
      int order = word.compareTo(words.get(mid));
      if (order < 0) {
        right = mid - 1;
      } else if (order > 0) {
        left = mid + 1;
      } else {
        return;
      }
    }
    words.add(left, word);
  }

  public int indexOf(String word) {
    int left = 0;
    int right = words.size() - 1;
    while (left <= right) {
      int mid = (left + right) / 2;
      int order = word.compareTo(words.get(mid));
      if (order < 0) {
        right = mid - 1;
      } else if (order > 0) {
        left = mid + 1;
      } else {
        return mid;
      }
    }
    return -1;
  }

  public boolean contains(String word) {
    return indexOf(word) >= 0;
  }

  @Override
  public void remove(String word) {
    int index = indexOf(word);
    if (index >= 0) {
      words.remove(index);
    }
  }

  public int size() {
    return words.size();
  }
}

class UnbalancedTreeWordSet implements WordSet {
  private Node root;
  private int size;

  public UnbalancedTreeWordSet() {
    root = null;
    size = 0;
  }

  public int size() {
    return size;
  }

  public void add(String word) {
    root = rebuildWithWord(root, word);
  }

  public void remove(String word) {
    root = rebuildWithoutWord(root, word);
  }

  public boolean contains(String word) {
    Node node = root;
    while (node != null) {
      int order = word.compareTo(node.word);
      if (order < 0) {
        node = node.left;
      } else if (order > 0) {
        node = node.right;
      } else {
        return true;
      }
    }
    return false;
  }

  private Node rebuildWithWord(Node root, String word) {
    if (root == null) {
      ++size;
      return new Node(word);
    } else {
      int order = word.compareTo(root.word);
      if (order < 0) {
        root.left = rebuildWithWord(root.left, word);
      } else if (order > 0) {
        root.right = rebuildWithWord(root.right, word);
      }
      return root;
    }
  }

  private Node rebuildWithoutWord(Node root, String word) {
    if (root == null) {
      return null;
    } else {
      int order = word.compareTo(root.word);
      if (order < 0) {
        root.left = rebuildWithoutWord(root.left, word);
        return root;
      } else if (order > 0) {
        root.right = rebuildWithoutWord(root.right, word);
        return root;
      } else if (root.left == null) {
        --size;
        return root.right;
      } else if (root.right == null) {
        --size;
        return root.left;
      } else {
        --size;
        Node predecessorParent = null;
        Node predecessor = root.left;
        while (predecessor.right != null) {
          predecessorParent = predecessor;
          predecessor = predecessor.right;
        }

        root.word = predecessor.word;
        if (predecessorParent == null) {
          root.left = predecessor.left;
        } else {
          predecessorParent.right = predecessor.left;
        }

        return root;
      }
    }
  }

  static class Node {
    String word;
    Node left;
    Node right;

    public Node(String word) {
      this.word = word;
    }
  }
}

class BalancedTreeWordSet implements WordSet {
  private final TreeSet<String> set;

  public BalancedTreeWordSet() {
    set = new TreeSet<>();
  }

  public void add(String word) {
    set.add(word);
  }

  public void remove(String word) {
    set.remove(word);
  }

  public int size() {
    return set.size();
  }

  public boolean contains(String word) {
    return set.contains(word);
  }
}

class HashWordSet implements WordSet {
  private final HashSet<String> set;

  public HashWordSet() {
    set = new HashSet<>();
  }

  public void add(String word) {
    set.add(word);
  }

  public void remove(String word) {
    set.remove(word);
  }

  public int size() {
    return set.size();
  }

  public boolean contains(String word) {
    return set.contains(word);
  }
}

Stopwatch.java

package utilities;

import java.lang.management.ManagementFactory;
import java.lang.management.ThreadMXBean;

public class Stopwatch {
  private ThreadMXBean bean;
  private long startNanos;

  public Stopwatch() {
    bean = ManagementFactory.getThreadMXBean();
  }

  public void start() {
    startNanos = bean.getCurrentThreadCpuTime();
  }

  public double stop() {
    long endNanos = bean.getCurrentThreadCpuTime();
    return (endNanos - startNanos) / 1e9;
  }
}