From 1b9b3bcb734778037d7bc872e1c4d2290a7415e8 Mon Sep 17 00:00:00 2001
From: zedshaw <zedshaw@19e92222-5c0b-0410-8929-a290d50e31e9>
Date: Thu, 2 Feb 2006 06:53:32 +0000
Subject: Improved the trie searching to only require one search and work more
 correctly.  Fixed a few bugs found by people.

git-svn-id: svn+ssh://rubyforge.org/var/svn/mongrel/trunk@19 19e92222-5c0b-0410-8929-a290d50e31e9
---
 README                     |   4 +-
 Rakefile                   |   2 +-
 ext/http11/http11.c        | 101 +++++++++++++++++++++------------------------
 ext/http11/tst_search.c    |  21 ++++++++--
 test/test_uriclassifier.rb |  53 +++++++++++++++++++++++-
 5 files changed, 120 insertions(+), 61 deletions(-)

diff --git a/README b/README
index cd9e868..050c3cf 100644
--- a/README
+++ b/README
@@ -11,7 +11,7 @@ scream without too many portability issues.
 
 == Status
 
-The 0.2.0 release of Mongrel features an HTTP core server that is the fastest possible
+The 0.2.1 release of Mongrel features an HTTP core server that is the fastest possible
 thing I could get without using something other than Ruby.  It features a few bug fixes,
 but mostly just a change to the Mongrel::HttpResponse class to make it more feature 
 complete.  The remaining development will be spent getting Mongrel to work with 
@@ -68,7 +68,7 @@ basic way to give a more complex 404 message.
 
 == Speed
 
-The 0.2.0 release probably consists of the most effort I've ever put into 
+The 0.2.1 release probably consists of the most effort I've ever put into 
 tuning a Ruby library for speed.  It consists of nearly everything I could think
 of to make Mongrel the fastest Ruby HTTP library possible.  I've tried about 
 seven different architectures and IO processing methods and none of them 
diff --git a/Rakefile b/Rakefile
index e379779..f084c88 100644
--- a/Rakefile
+++ b/Rakefile
@@ -27,4 +27,4 @@ setup_extension("http11", "http11")
 
 summary = "An experimental fast simple web server for Ruby."
 test_file = "test/test_ws.rb"
-setup_gem("mongrel", "0.2.0",  "Zed A. Shaw", summary, [], test_file)
+setup_gem("mongrel", "0.2.1",  "Zed A. Shaw", summary, [], test_file)
diff --git a/ext/http11/http11.c b/ext/http11/http11.c
index 1cdb525..c01798a 100644
--- a/ext/http11/http11.c
+++ b/ext/http11/http11.c
@@ -11,15 +11,21 @@ static VALUE cHttpParser;
 static VALUE cURIClassifier;
 static int id_handler_map;
 
+static VALUE global_http_prefix;
+static VALUE global_request_method;
+static VALUE global_path_info;
+static VALUE global_query_string;
+static VALUE global_http_version;
+
 
 void http_field(void *data, const char *field, size_t flen, const char *value, size_t vlen)
 {
   char *ch, *end;
   VALUE req = (VALUE)data;
-  VALUE f = rb_str_new2("HTTP_");
   VALUE v = rb_str_new(value, vlen);
-  
-  rb_str_buf_cat(f, field, flen); 
+  VALUE f = rb_str_dup(global_http_prefix);
+
+  f = rb_str_buf_cat(f, field, flen); 
   
   for(ch = RSTRING(f)->ptr, end = ch + RSTRING(f)->len; ch < end; ch++) {
     if(*ch == '-') {
@@ -36,16 +42,14 @@ void request_method(void *data, const char *at, size_t length)
 {
   VALUE req = (VALUE)data;
   VALUE val = rb_str_new(at, length);
-  VALUE id = rb_str_new2("REQUEST_METHOD");
-  rb_hash_aset(req, id, val);
+  rb_hash_aset(req, global_request_method, val);
 }
 
 void path_info(void *data, const char *at, size_t length)
 {
   VALUE req = (VALUE)data;
   VALUE val = rb_str_new(at, length);
-  VALUE id = rb_str_new2("PATH_INFO");
-  rb_hash_aset(req, id, val);
+  rb_hash_aset(req, global_path_info, val);
 }
 
 
@@ -53,16 +57,14 @@ void query_string(void *data, const char *at, size_t length)
 {
   VALUE req = (VALUE)data;
   VALUE val = rb_str_new(at, length);
-  VALUE id = rb_str_new2("QUERY_STRING");
-  rb_hash_aset(req, id, val);
+  rb_hash_aset(req, global_query_string, val);
 }
 
 void http_version(void *data, const char *at, size_t length)
 {
   VALUE req = (VALUE)data;
   VALUE val = rb_str_new(at, length);
-  VALUE id = rb_str_new2("HTTP_VERSION");
-  rb_hash_aset(req, id, val);
+  rb_hash_aset(req, global_http_version, val);
 }
 
 
@@ -258,23 +260,6 @@ VALUE URIClassifier_alloc(VALUE klass)
  * will do two searches most of the time in order to find the right handler for the
  * registered prefix portion.
  *
- * Here's how it all works.  Let's say you register "/blog" with a BlogHandler.  Great.
- * Now, someone goes to "/blog/zedsucks/ass".  You want SCRIPT_NAME to be "/blog" and
- * PATH_INFO to be "/zedsucks/ass".  URIClassifier first does a TST search and comes
- * up with a failure, but knows that the failure ended at the "/blog" part.  So, that's
- * the SCRIPT_NAME.  It then tries a second search for just "/blog".  If that comes back
- * good then it sets the rest ("/zedsucks/ass") to the PATH_INFO and returns the BlogHandler.
- *
- * The optimal approach would be to not do the search twice, but the TST lib doesn't
- * really support returning prefixes.  Might not be hard to add later.
- *
- * The key though is that it will try to match the *longest* match it can.  If you 
- * also register "/blog/zed" then the above URI will give SCRIPT_NAME="/blog/zed", 
- * PATH_INFO="sucks/ass".  Probably not what you want, so your handler will need to
- * do the 404 thing.
- *
- * Take a look at the postamble of example/tepee.rb to see how this is handled for
- * Camping. 
  */
 VALUE URIClassifier_init(VALUE self)
 {
@@ -283,6 +268,8 @@ VALUE URIClassifier_init(VALUE self)
   // we create an internal hash to protect stuff from the GC
   hash = rb_hash_new();
   rb_ivar_set(self, id_handler_map, hash);
+
+  return self;
 }
 
 
@@ -356,7 +343,19 @@ VALUE URIClassifier_unregister(VALUE self, VALUE uri)
  *
  * Attempts to resolve either the whole URI or at the longest prefix, returning
  * the prefix (as script_info), path (as path_info), and registered handler
- * (usually an HttpHandler).
+ * (usually an HttpHandler).  If it doesn't find a handler registered at the longest
+ * match then it returns nil,nil,nil.
+ *
+ * Because the resolver uses a trie you are able to register a handler at *any* character
+ * in the URI and it will be handled as long as it's the longest prefix.  So, if you 
+ * registered handler #1 at "/something/lik", and #2 at "/something/like/that", then a
+ * a search for "/something/like" would give you #1.  A search for "/something/like/that/too"
+ * would give you #2.
+ * 
+ * This is very powerful since it means you can also attach handlers to parts of the ; 
+ * (semi-colon) separated path params, any part of the path, use off chars, anything really.
+ * It also means that it's very efficient to do this only taking as long as the URI has
+ * characters.
  *
  * It expects strings.  Don't try other string-line stuff yet.
  */
@@ -366,8 +365,6 @@ VALUE URIClassifier_resolve(VALUE self, VALUE uri)
   int pref_len = 0;
   struct tst *tst = NULL;
   VALUE result;
-  VALUE script_name;
-  VALUE path_info;
   unsigned char *uri_str = NULL;
   unsigned char *script_name_str = NULL;
 
@@ -379,32 +376,17 @@ VALUE URIClassifier_resolve(VALUE self, VALUE uri)
   // setup for multiple return values
   result = rb_ary_new();
 
-
-  if(handler == NULL) {
-    script_name = rb_str_substr (uri, 0, pref_len);
-    script_name_str = (unsigned char *)StringValueCStr(script_name);
-
-    handler = tst_search(script_name_str, tst, NULL);
-
-    if(handler == NULL) {
-      // didn't find the script name at all
-      rb_ary_push(result, Qnil);
-      rb_ary_push(result, Qnil);
-      rb_ary_push(result, Qnil);
-      return result;
-    } else {
-      // found a handler, setup the path info and we're good
-      path_info = rb_str_substr(uri, pref_len, RSTRING(uri)->len);
-    }
+  if(handler) {
+    rb_ary_push(result, rb_str_substr (uri, 0, pref_len));
+    rb_ary_push(result, rb_str_substr(uri, pref_len, RSTRING(uri)->len));
+    rb_ary_push(result, (VALUE)handler);
   } else {
-    // whole thing was found, so uri is the script name, path info empty
-    script_name = uri;
-    path_info = rb_str_new2("");
+    // not found so push back nothing
+    rb_ary_push(result, Qnil);
+    rb_ary_push(result, Qnil);
+    rb_ary_push(result, Qnil);
   }
 
-  rb_ary_push(result, script_name);
-  rb_ary_push(result, path_info);
-  rb_ary_push(result, (VALUE)handler);
   return result;
 }
 
@@ -414,6 +396,17 @@ void Init_http11()
 
   mMongrel = rb_define_module("Mongrel");
   id_handler_map = rb_intern("@handler_map");
+
+  global_http_prefix = rb_str_new2("HTTP_");
+  rb_global_variable(&global_http_prefix);
+  global_request_method = rb_str_new2("REQUEST_METHOD");
+  rb_global_variable(&global_request_method);
+  global_path_info = rb_str_new2("PATH_INFO");
+  rb_global_variable(&global_path_info);
+  global_query_string = rb_str_new2("QUERY_STRING");
+  rb_global_variable(&global_query_string);
+  global_http_version = rb_str_new2("HTTP_VERSION");
+  rb_global_variable(&global_http_version);
   
   cHttpParser = rb_define_class_under(mMongrel, "HttpParser", rb_cObject);
   rb_define_alloc_func(cHttpParser, HttpParser_alloc);
diff --git a/ext/http11/tst_search.c b/ext/http11/tst_search.c
index efa1cfa..4cfe6ff 100644
--- a/ext/http11/tst_search.c
+++ b/ext/http11/tst_search.c
@@ -7,18 +7,20 @@
 void *tst_search(unsigned char *key, struct tst *tst, int *prefix_len)
 {
   struct node *current_node;
+  void *longest_match = NULL;
   int key_index;
   
   assert(key != NULL && "key can't be NULL");
   assert(tst != NULL && "tst can't be NULL");
   
-  
   if(key[0] == 0)
     return NULL;
   
   if(tst->head[(int)key[0]] == NULL)
     return NULL;
+
   
+  if(prefix_len) *prefix_len = 0;
   current_node = tst->head[(int)key[0]];
   key_index = 1;
   
@@ -30,7 +32,13 @@ void *tst_search(unsigned char *key, struct tst *tst, int *prefix_len)
 	    if(prefix_len) *prefix_len = key_index;
 	    return current_node->middle;
 	  } else {
+
 	    current_node = current_node->middle;
+	    if(current_node && current_node->value == 0) {
+	      if(prefix_len) *prefix_len = key_index+1;
+	      longest_match = current_node->middle;
+	    }
+
             key_index++;
 	    continue;
 	  }
@@ -39,16 +47,23 @@ void *tst_search(unsigned char *key, struct tst *tst, int *prefix_len)
 	       ((current_node->value != 0) && (key[key_index] <
 					       current_node->value)) )
 	{
+	  if(current_node->left && current_node->value == 0) {
+	    if(prefix_len) *prefix_len = key_index;
+	    longest_match = current_node->middle;
+	  }
 	  current_node = current_node->left;
 	  continue;
 	}
       else
 	{
+	  if(current_node->right && current_node->value == 0) {
+	    if(prefix_len) *prefix_len = key_index;
+	    longest_match = current_node->middle;
+	  }
 	  current_node = current_node->right;
 	  continue;
 	}
     }
   
-  if(prefix_len) *prefix_len = key_index;
-  return NULL;
+  return longest_match;
 }
diff --git a/test/test_uriclassifier.rb b/test/test_uriclassifier.rb
index ae5df1d..05db8d2 100644
--- a/test/test_uriclassifier.rb
+++ b/test/test_uriclassifier.rb
@@ -25,10 +25,12 @@ class URIClassifierTest < Test::Unit::TestCase
     u = URIClassifier.new
     u.register(prefix,1)
 
+    sn,pi,val = u.resolve(prefix)
     sn,pi,val = u.resolve(test)
     assert val != nil, "didn't resolve"
     assert_equal prefix,sn, "wrong script name"
     assert_equal test[sn.length .. -1],pi, "wrong path info"
+    
   end
 
   def test_not_finding
@@ -99,6 +101,55 @@ class URIClassifierTest < Test::Unit::TestCase
     puts "\nRESOLVE(#{count}): #{res}"
     puts "REG_UNREG(#{count}): #{reg_unreg}"
   end
-      
+
+
+  def test_uri_branching
+    u = URIClassifier.new
+    u.register("/test", 1)
+    u.register("/test/this",2)
+  
+    sn,pi,h = u.resolve("/test")
+    sn,pi,h = u.resolve("/test/that")
+    assert_equal "/test", sn, "failed to properly find script off branch portion of uri"
+    assert_equal "/that", pi, "didn't get the right patch info"
+    assert_equal 1, h, "wrong result for branching uri"
+  end
+
+
+  def test_all_prefixing
+    tests = ["/test","/test/that","/test/this"]
+    uri = "/test/this/that"
+    u = URIClassifier.new
+    
+    cur = ""
+    uri.each_byte do |c|
+      cur << c.chr
+      u.register(cur, c)
+    end
+
+    # try to resolve everything with no asserts as a fuzzing
+    tests.each do |prefix|
+      cur = ""
+      prefix.each_byte do |c|
+        cur << c.chr
+        sn, pi, h = u.resolve(cur)
+        assert sn != nil, "didn't get a script name"
+        assert pi != nil, "didn't get path info"
+        assert h != nil, "didn't find the handler"
+      end
+    end
+
+    # assert that we find stuff
+    tests.each do |t|
+      sn, pi, h = u.resolve(t)
+      assert h != nil, "didn't find handler"
+    end
+
+    # assert we don't find stuff
+    sn, pi, h = u.resolve("chicken")
+    assert_nil h, "shoulnd't find anything"
+    assert_nil sn, "shoulnd't find anything"
+    assert_nil pi, "shoulnd't find anything"
+  end
 end
 
-- 
cgit v1.2.3-24-ge0c7