diff options
-rw-r--r-- | ext/http11/http11.c | 64 | ||||
-rw-r--r-- | test/test_bmhsearch.rb | 93 |
2 files changed, 142 insertions, 15 deletions
diff --git a/ext/http11/http11.c b/ext/http11/http11.c index b25607b..f2c7828 100644 --- a/ext/http11/http11.c +++ b/ext/http11/http11.c @@ -195,9 +195,11 @@ void header_done(void *data, const char *at, size_t length) inline size_t max(size_t a, size_t b) { return a>b ? a : b; } void BMHSearch_free(void *data) { + BMHSearchState *S = (BMHSearchState *)data; + if(data) { - free(((BMHSearchState *)data)->needle); - free(((BMHSearchState *)data)->found_at); + if(S->needle) free(S->needle); + if(S->found_at) free(S->found_at); free(data); } } @@ -229,21 +231,29 @@ VALUE BMHSearch_init(VALUE self, VALUE needle, VALUE max_find) DATA_GET(self, BMHSearchState, S); - S->htotal = 0; S->nfound = 0; S->skip = 0; S->nlen = RSTRING(needle)->len; S->max_find = FIX2INT(max_find); + S->needle = NULL; + S->found_at = NULL; if(S->nlen <= 0) rb_raise(eBMHSearchError, "Needle can't be 0 length."); // copy it so we don't have to worry about Ruby messing it up - S->needle = malloc(S->nlen); + S->needle = ALLOC_N(unsigned char, S->nlen); memcpy(S->needle, RSTRING(needle)->ptr, S->nlen); + // the only restriction on the needle so far is that you can't have repeat chars + if(S->needle[0] == S->needle[1]) { + free(S->needle); + S->needle = NULL; + rb_raise(eBMHSearchError, "Needle can't begin with > 1 of the same char."); + } + // setup the number of finds they want - S->found_at = malloc(sizeof(size_t) * S->max_find); + S->found_at = ALLOC_N(size_t, S->max_find); /* Preprocess */ /* Initialize the table to default value */ @@ -279,7 +289,7 @@ VALUE BMHSearch_find(VALUE self, VALUE hay) { size_t hpos = 0, npos = 0, skip = 0; unsigned char* haystack = NULL; - size_t hlen = 0; + size_t hlen = 0, last = 0; BMHSearchState *S = NULL; DATA_GET(self, BMHSearchState, S); @@ -311,14 +321,16 @@ VALUE BMHSearch_find(VALUE self, VALUE hay) } } + while(hpos < hlen) { /* Compare the S->needle backwards, and stop when first mismatch is found */ npos = S->nlen-1; - while(haystack[hpos] == S->needle[npos]) { + while(hpos < hlen && haystack[hpos] == S->needle[npos]) { if(npos == 0) { // found one, log it in found_at, but reduce by the skip if we're at the beginning S->found_at[S->nfound++] = hpos + S->htotal; + last = hpos; // keep track of the last one we found in this chunk if(S->nfound > S->max_find) { rb_raise(eBMHSearchError, "More than max requested needles found. Use pop."); @@ -334,23 +346,45 @@ VALUE BMHSearch_find(VALUE self, VALUE hay) } } + // exhausted the string already, done + if(hpos > hlen) break; + /* Find out how much ahead we can skip based on the byte that was found */ skip = max(S->nlen - npos, S->occ[haystack[hpos]]); hpos += skip; } - size_t back = S->nlen - S->occ[haystack[hlen-1]]; - if(back < S->nlen && back > 0) { - for(hpos = hlen - back; hpos < hlen; hpos++) { - skip = hlen - hpos; - if(haystack[hpos] == S->needle[0] && !memcmp(haystack+hpos, S->needle, skip)) break; + + skip = 0; // skip defaults to 0 + + // don't bother if the string ends in the needle completely + if(S->nfound == 0 || last != hlen - S->nlen) { + // invert the occ array to figure out how much to jump back + size_t back = S->nlen - S->occ[haystack[hlen-1]]; + + // the needle could have an ending char that's also repeated in the needle + // in that case, we have to adjust to search for nlen-1 just in case + if(haystack[hlen-1] == S->needle[S->nlen - 1]) { + back = S->nlen - 1; } - } else { - skip = 0; + + int found_it = 0; // defaults to 0 for test at end of it + if(back < S->nlen && back > 0) { + // search for the longest possible prefix + for(hpos = hlen - back; hpos < hlen; hpos++) { + skip = hlen - hpos; + if(!memcmp(haystack+hpos, S->needle, skip)) { + found_it = 1; // make sure we actually found it + break; + } + } + } + // skip will be 1 if nothing was found, so we have to force it 0 + if(!found_it) skip = 0; } + // keep track of the skip, it's set to 0 by the code above S->skip = skip; - // update the total processed so far so indexes will be in sync S->htotal += hlen; diff --git a/test/test_bmhsearch.rb b/test/test_bmhsearch.rb index 98c83b9..4fd7ee8 100644 --- a/test/test_bmhsearch.rb +++ b/test/test_bmhsearch.rb @@ -57,6 +57,99 @@ class BMHSearchTest < Test::Unit::TestCase end end + def test_boundaries_over_chunks + hay = ["zed is a tool ba", + "ggiethis has the baggieb", + "aggie of baggie-baggie tricks in another ", + "baggie"] + total = "zed is a tool baggiethis has the baggiebaggie of baggie-baggie tricks in another baggie" + + s = BMHSearch.new("baggie", 20) + + hay.each {|h| s.find(h) } + hay_found = s.pop + + s = BMHSearch.new("baggie", 20) + s.find(total) + total_found = s.pop + + assert_equal hay_found.length, total_found.length, "wrong number of needles found" + + total_found.length.times do |i| + assert_equal total_found[i], hay_found[i], "boundary doesn't match" + end + end + + def test_no_repeat_begin_chars + assert_raises BMHSearchError do + b = BMHSearch.new("FFI5LguQEg==", 10) + end + end + + def test_fuzzing + begin + has_rfuzz = require 'rfuzz/random' + rescue Object + has_rfuzz = false + end + + if has_rfuzz + r = RFuzz::RandomGenerator.new + needles = r.base64(2000, 100).collect {|n| "\r\n" + n.strip } + needles.each do |needle| + next if needle.length == 0 + + nchunks = r.num(1000) + 10 + bmh = BMHSearch.new(needle, nchunks+1) + total = "" # used to collect the full string for compare + + # each needle is sprinkled into up to 100 chunks + nchunks.times do + # each chunk is up to 16k in size + chunk = r.bytes(r.num(16 * 1024) + needle.length * 2) + chunk.gsub! needle, "" + + # make about 60% go across boundaries + if r.num(10) < 6 + # this one gets cut in two + cut_at = r.num(needle.length - 1) + 1 + n1 = needle[0 ... cut_at] + n2 = needle[cut_at .. -1] + + assert_equal n1+n2, needle, "oops, messed up breaking the needle" + + last_nfound = bmh.nfound + bmh.find(chunk + n1) + assert bmh.has_trailing?, "should have trailing on #{n1}:#{n2} on chunk length: #{chunk.length}" + assert_equal last_nfound, bmh.nfound, "shouldn't find it yet" + + bmh.find(n2 + chunk) + assert_equal last_nfound+1, bmh.nfound, "should have found the boundary for #{n1}:#{n2} on chunk length: #{chunk.length}" + total << chunk + n1 + n2 + chunk + else + # this one is put in complete + bmh.find(chunk + needle) + bmh.find(chunk) + + total << chunk + needle + chunk + end + end + + tbmh = BMHSearch.new(needle, nchunks+1) + tbmh.find(total) + + assert_equal total.length, bmh.total, "totals don't match" + assert_equal tbmh.nfound, bmh.nfound, "nfound don't match" + + total_found = tbmh.pop + hay_found = bmh.pop + + total_found.length.times do |i| + assert_equal total_found[i], hay_found[i], "boundary doesn't match" + end + end + end + end end |