juan_gandhi: (Default)
[personal profile] juan_gandhi
Tried to figure out whether String.contains() is better or worse than Pattern.compile() (do this once) .matcher().matches():

here's what I got:
StringContains(ab,1000):    0.4291536470 sec/million
PatternMatches(ab,1000):    0.1891851651 sec/million
StringContains(abcd,1000):    0.4248620093 sec/million
PatternMatches(abcd,1000):    0.2675056776 sec/million
StringContains(abcdefgh,1000):    0.4405976392 sec/million
PatternMatches(abcdefgh,1000):    0.1933575138 sec/million


Oh fcuk! The pattern was wrong, it was successfully failing on the first character!

Here are the right results:
StringContains(1,5000):  	    0.2350807750 sec/million
PatternMatches(1,5000):  	   74.5895135201 sec/million
StringContains(2,5000):  	    1.8787401950 sec/million
PatternMatches(2,5000):  	  154.4377975827 sec/million
StringContains(4,5000):  	    2.3345991795 sec/million
PatternMatches(4,5000):  	  169.6984495178 sec/million
StringContains(8,5000):  	    1.6679779701 sec/million
PatternMatches(8,5000):  	  105.2310321675 sec/million
StringContains(16,5000):  	    2.3345991795 sec/million
PatternMatches(16,5000):  	  172.8726651203 sec/million
StringContains(32,5000):  	    2.2831006681 sec/million
PatternMatches(32,5000):  	  165.0592113295 sec/million
StringContains(64,5000):  	    2.2811933159 sec/million
PatternMatches(64,5000):  	  162.8616774509 sec/million
StringContains(128,5000):  	    2.2935889183 sec/million
PatternMatches(128,5000):  	  168.4775973630 sec/million
StringContains(256,5000):  	    2.3403212363 sec/million
PatternMatches(256,5000):  	  170.7972164571 sec/million
StringContains(512,5000):  	    2.3403212363 sec/million
PatternMatches(512,5000):  	  171.5297277500 sec/million


The first parameter is the sample length ("a", "ab", "abcd" etc); the second is the length of a randomly-built string that, in half of the cases, contains the sample at a random position.


package com.kaching.experimental.users.vlad;

import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PatternMatchingBenchmark {

  static Random r = new Random();

  static String[] randomStrings = new String[100000];

  private static void buildRandomStrings(String sample, int textlength) {
    for (int i = 0; i < randomStrings.length; i++) {
      randomStrings[i] = newRandomString(sample, textlength);
    }
  }

  private static String newRandomString(String pattern, int stringLength) {
    int pos = r.nextInt((stringLength - pattern.length()));
    StringBuilder sb = new StringBuilder();
    fillRandomChars(pos, sb);
    if (r.nextBoolean()) sb.append(pattern);
    fillRandomChars(stringLength - sb.length(), sb);
    return sb.toString();
  }

  private static void fillRandomChars(int n, StringBuilder sb) {
    for (int i = 0; i < n; i++) {
      sb.append(randomChar());
    }
  }

  private static char randomChar() {
    return (char)(32 + r.nextInt(95));
  }

  interface Runner {
    void run(boolean actually);
  }

  static double benchmark(BenchmarkRunner benchmarkRunner, long timeToSpend) {
    long timeSpent = 0;
    long numRuns = 0;
    while (timeSpent < timeToSpend) {
      numRuns = numRuns * 2 + 1;
      timeSpent = runloop(benchmarkRunner, numRuns, true);
    }
    if (numRuns < 100) {
      throw new RuntimeException("Benchmark won't run just " + numRuns + " runs");
    }

    runloop(benchmarkRunner, numRuns, false);
    long dryTime = runloop(benchmarkRunner, numRuns, false);
    timeSpent = runloop(benchmarkRunner, numRuns, true) - dryTime;
    return ((double) (timeSpent < (dryTime / 50) ? 0 : timeSpent)) / numRuns * 1000;
  }

  private static long runloop(BenchmarkRunner benchmarkRunner, long numRuns, boolean actually) {
    long now = System.currentTimeMillis();
    for (int i = 0; i < numRuns; i++) {
      benchmarkRunner.run(actually);
    }
    if (actually && numRuns > 10 && !benchmarkRunner.found) {
      throw new RuntimeException("Benchmark " + benchmarkRunner + " failed to find anything");
    }
    return System.currentTimeMillis() - now;
  }

  /**
   * @param args
   */
  static double x = 0;

  static abstract class BenchmarkRunner {
    private String sample;
    int counter = -1;
    boolean found = false;

    BenchmarkRunner(String sample) {
      this.sample = sample;
    }

    protected String nextString() {
      counter = (counter + 1) % randomStrings.length;
      return randomStrings[counter++];
    };

    abstract void run(boolean actually);
  }

  static BenchmarkRunner stringContains(final String sample, final int textlength) {

    return new BenchmarkRunner(sample) {
      @Override
      public void run(boolean actually) {
        String s = nextString();
        if (actually) {
          found |= s.contains(sample);
        }
      }
    };
  }

  static BenchmarkRunner patternMatches(final String sample, final int textlength) {
    final Pattern p = Pattern.compile("^.*" + sample);
    return new BenchmarkRunner(sample) {
      @Override
      public void run(boolean actually) {
        String s = nextString();
          if (actually) {
            Matcher m = p.matcher(s);
            found |= m.find();
          }
        }
      };
    }

  public static void main(String[] args) {
    for (int n = 1; n < 600; n *= 2) {
      StringBuilder sb = new StringBuilder();
      for (int i = 0; i < n; i++) {
        sb.append((char) ('a' + i % 26));
      }
      compareBenchmarks(sb.toString(), 5000);
    }
  }

  private static void compareBenchmarks(String sample, int textlength) {
    buildRandomStrings(sample, textlength);
    System.out.printf("StringContains(%s,%s):  \t %15.10f sec/million\n", sample.length(), textlength, benchmark(stringContains(sample, textlength), 1000));
    System.out.printf("PatternMatches(%s,%s):  \t %15.10f sec/million\n", sample.length(), textlength, benchmark(patternMatches(sample, textlength), 1000));
  }
}


This account has disabled anonymous posting.
If you don't have an account you can create one now.
HTML doesn't work in the subject.
More info about formatting

Profile

juan_gandhi: (Default)
Juan-Carlos Gandhi

June 2025

S M T W T F S
1 2345 6 7
8 9 10 11 121314
15161718 1920 21
222324252627 28
2930     

Most Popular Tags

Style Credit

Expand Cut Tags

No cut tags
Page generated Jun. 30th, 2025 05:38 pm
Powered by Dreamwidth Studios