From 1b9b3bcb734778037d7bc872e1c4d2290a7415e8 Mon Sep 17 00:00:00 2001 From: zedshaw Date: Thu, 2 Feb 2006 06:53:32 +0000 Subject: Improved the trie searching to only require one search and work more correctly. Fixed a few bugs found by people. git-svn-id: svn+ssh://rubyforge.org/var/svn/mongrel/trunk@19 19e92222-5c0b-0410-8929-a290d50e31e9 --- README | 4 +- Rakefile | 2 +- ext/http11/http11.c | 101 +++++++++++++++++++++------------------------ ext/http11/tst_search.c | 21 ++++++++-- test/test_uriclassifier.rb | 53 +++++++++++++++++++++++- 5 files changed, 120 insertions(+), 61 deletions(-) diff --git a/README b/README index cd9e868..050c3cf 100644 --- a/README +++ b/README @@ -11,7 +11,7 @@ scream without too many portability issues. == Status -The 0.2.0 release of Mongrel features an HTTP core server that is the fastest possible +The 0.2.1 release of Mongrel features an HTTP core server that is the fastest possible thing I could get without using something other than Ruby. It features a few bug fixes, but mostly just a change to the Mongrel::HttpResponse class to make it more feature complete. The remaining development will be spent getting Mongrel to work with @@ -68,7 +68,7 @@ basic way to give a more complex 404 message. == Speed -The 0.2.0 release probably consists of the most effort I've ever put into +The 0.2.1 release probably consists of the most effort I've ever put into tuning a Ruby library for speed. It consists of nearly everything I could think of to make Mongrel the fastest Ruby HTTP library possible. I've tried about seven different architectures and IO processing methods and none of them diff --git a/Rakefile b/Rakefile index e379779..f084c88 100644 --- a/Rakefile +++ b/Rakefile @@ -27,4 +27,4 @@ setup_extension("http11", "http11") summary = "An experimental fast simple web server for Ruby." test_file = "test/test_ws.rb" -setup_gem("mongrel", "0.2.0", "Zed A. Shaw", summary, [], test_file) +setup_gem("mongrel", "0.2.1", "Zed A. Shaw", summary, [], test_file) diff --git a/ext/http11/http11.c b/ext/http11/http11.c index 1cdb525..c01798a 100644 --- a/ext/http11/http11.c +++ b/ext/http11/http11.c @@ -11,15 +11,21 @@ static VALUE cHttpParser; static VALUE cURIClassifier; static int id_handler_map; +static VALUE global_http_prefix; +static VALUE global_request_method; +static VALUE global_path_info; +static VALUE global_query_string; +static VALUE global_http_version; + void http_field(void *data, const char *field, size_t flen, const char *value, size_t vlen) { char *ch, *end; VALUE req = (VALUE)data; - VALUE f = rb_str_new2("HTTP_"); VALUE v = rb_str_new(value, vlen); - - rb_str_buf_cat(f, field, flen); + VALUE f = rb_str_dup(global_http_prefix); + + f = rb_str_buf_cat(f, field, flen); for(ch = RSTRING(f)->ptr, end = ch + RSTRING(f)->len; ch < end; ch++) { if(*ch == '-') { @@ -36,16 +42,14 @@ void request_method(void *data, const char *at, size_t length) { VALUE req = (VALUE)data; VALUE val = rb_str_new(at, length); - VALUE id = rb_str_new2("REQUEST_METHOD"); - rb_hash_aset(req, id, val); + rb_hash_aset(req, global_request_method, val); } void path_info(void *data, const char *at, size_t length) { VALUE req = (VALUE)data; VALUE val = rb_str_new(at, length); - VALUE id = rb_str_new2("PATH_INFO"); - rb_hash_aset(req, id, val); + rb_hash_aset(req, global_path_info, val); } @@ -53,16 +57,14 @@ void query_string(void *data, const char *at, size_t length) { VALUE req = (VALUE)data; VALUE val = rb_str_new(at, length); - VALUE id = rb_str_new2("QUERY_STRING"); - rb_hash_aset(req, id, val); + rb_hash_aset(req, global_query_string, val); } void http_version(void *data, const char *at, size_t length) { VALUE req = (VALUE)data; VALUE val = rb_str_new(at, length); - VALUE id = rb_str_new2("HTTP_VERSION"); - rb_hash_aset(req, id, val); + rb_hash_aset(req, global_http_version, val); } @@ -258,23 +260,6 @@ VALUE URIClassifier_alloc(VALUE klass) * will do two searches most of the time in order to find the right handler for the * registered prefix portion. * - * Here's how it all works. Let's say you register "/blog" with a BlogHandler. Great. - * Now, someone goes to "/blog/zedsucks/ass". You want SCRIPT_NAME to be "/blog" and - * PATH_INFO to be "/zedsucks/ass". URIClassifier first does a TST search and comes - * up with a failure, but knows that the failure ended at the "/blog" part. So, that's - * the SCRIPT_NAME. It then tries a second search for just "/blog". If that comes back - * good then it sets the rest ("/zedsucks/ass") to the PATH_INFO and returns the BlogHandler. - * - * The optimal approach would be to not do the search twice, but the TST lib doesn't - * really support returning prefixes. Might not be hard to add later. - * - * The key though is that it will try to match the *longest* match it can. If you - * also register "/blog/zed" then the above URI will give SCRIPT_NAME="/blog/zed", - * PATH_INFO="sucks/ass". Probably not what you want, so your handler will need to - * do the 404 thing. - * - * Take a look at the postamble of example/tepee.rb to see how this is handled for - * Camping. */ VALUE URIClassifier_init(VALUE self) { @@ -283,6 +268,8 @@ VALUE URIClassifier_init(VALUE self) // we create an internal hash to protect stuff from the GC hash = rb_hash_new(); rb_ivar_set(self, id_handler_map, hash); + + return self; } @@ -356,7 +343,19 @@ VALUE URIClassifier_unregister(VALUE self, VALUE uri) * * Attempts to resolve either the whole URI or at the longest prefix, returning * the prefix (as script_info), path (as path_info), and registered handler - * (usually an HttpHandler). + * (usually an HttpHandler). If it doesn't find a handler registered at the longest + * match then it returns nil,nil,nil. + * + * Because the resolver uses a trie you are able to register a handler at *any* character + * in the URI and it will be handled as long as it's the longest prefix. So, if you + * registered handler #1 at "/something/lik", and #2 at "/something/like/that", then a + * a search for "/something/like" would give you #1. A search for "/something/like/that/too" + * would give you #2. + * + * This is very powerful since it means you can also attach handlers to parts of the ; + * (semi-colon) separated path params, any part of the path, use off chars, anything really. + * It also means that it's very efficient to do this only taking as long as the URI has + * characters. * * It expects strings. Don't try other string-line stuff yet. */ @@ -366,8 +365,6 @@ VALUE URIClassifier_resolve(VALUE self, VALUE uri) int pref_len = 0; struct tst *tst = NULL; VALUE result; - VALUE script_name; - VALUE path_info; unsigned char *uri_str = NULL; unsigned char *script_name_str = NULL; @@ -379,32 +376,17 @@ VALUE URIClassifier_resolve(VALUE self, VALUE uri) // setup for multiple return values result = rb_ary_new(); - - if(handler == NULL) { - script_name = rb_str_substr (uri, 0, pref_len); - script_name_str = (unsigned char *)StringValueCStr(script_name); - - handler = tst_search(script_name_str, tst, NULL); - - if(handler == NULL) { - // didn't find the script name at all - rb_ary_push(result, Qnil); - rb_ary_push(result, Qnil); - rb_ary_push(result, Qnil); - return result; - } else { - // found a handler, setup the path info and we're good - path_info = rb_str_substr(uri, pref_len, RSTRING(uri)->len); - } + if(handler) { + rb_ary_push(result, rb_str_substr (uri, 0, pref_len)); + rb_ary_push(result, rb_str_substr(uri, pref_len, RSTRING(uri)->len)); + rb_ary_push(result, (VALUE)handler); } else { - // whole thing was found, so uri is the script name, path info empty - script_name = uri; - path_info = rb_str_new2(""); + // not found so push back nothing + rb_ary_push(result, Qnil); + rb_ary_push(result, Qnil); + rb_ary_push(result, Qnil); } - rb_ary_push(result, script_name); - rb_ary_push(result, path_info); - rb_ary_push(result, (VALUE)handler); return result; } @@ -414,6 +396,17 @@ void Init_http11() mMongrel = rb_define_module("Mongrel"); id_handler_map = rb_intern("@handler_map"); + + global_http_prefix = rb_str_new2("HTTP_"); + rb_global_variable(&global_http_prefix); + global_request_method = rb_str_new2("REQUEST_METHOD"); + rb_global_variable(&global_request_method); + global_path_info = rb_str_new2("PATH_INFO"); + rb_global_variable(&global_path_info); + global_query_string = rb_str_new2("QUERY_STRING"); + rb_global_variable(&global_query_string); + global_http_version = rb_str_new2("HTTP_VERSION"); + rb_global_variable(&global_http_version); cHttpParser = rb_define_class_under(mMongrel, "HttpParser", rb_cObject); rb_define_alloc_func(cHttpParser, HttpParser_alloc); diff --git a/ext/http11/tst_search.c b/ext/http11/tst_search.c index efa1cfa..4cfe6ff 100644 --- a/ext/http11/tst_search.c +++ b/ext/http11/tst_search.c @@ -7,18 +7,20 @@ void *tst_search(unsigned char *key, struct tst *tst, int *prefix_len) { struct node *current_node; + void *longest_match = NULL; int key_index; assert(key != NULL && "key can't be NULL"); assert(tst != NULL && "tst can't be NULL"); - if(key[0] == 0) return NULL; if(tst->head[(int)key[0]] == NULL) return NULL; + + if(prefix_len) *prefix_len = 0; current_node = tst->head[(int)key[0]]; key_index = 1; @@ -30,7 +32,13 @@ void *tst_search(unsigned char *key, struct tst *tst, int *prefix_len) if(prefix_len) *prefix_len = key_index; return current_node->middle; } else { + current_node = current_node->middle; + if(current_node && current_node->value == 0) { + if(prefix_len) *prefix_len = key_index+1; + longest_match = current_node->middle; + } + key_index++; continue; } @@ -39,16 +47,23 @@ void *tst_search(unsigned char *key, struct tst *tst, int *prefix_len) ((current_node->value != 0) && (key[key_index] < current_node->value)) ) { + if(current_node->left && current_node->value == 0) { + if(prefix_len) *prefix_len = key_index; + longest_match = current_node->middle; + } current_node = current_node->left; continue; } else { + if(current_node->right && current_node->value == 0) { + if(prefix_len) *prefix_len = key_index; + longest_match = current_node->middle; + } current_node = current_node->right; continue; } } - if(prefix_len) *prefix_len = key_index; - return NULL; + return longest_match; } diff --git a/test/test_uriclassifier.rb b/test/test_uriclassifier.rb index ae5df1d..05db8d2 100644 --- a/test/test_uriclassifier.rb +++ b/test/test_uriclassifier.rb @@ -25,10 +25,12 @@ class URIClassifierTest < Test::Unit::TestCase u = URIClassifier.new u.register(prefix,1) + sn,pi,val = u.resolve(prefix) sn,pi,val = u.resolve(test) assert val != nil, "didn't resolve" assert_equal prefix,sn, "wrong script name" assert_equal test[sn.length .. -1],pi, "wrong path info" + end def test_not_finding @@ -99,6 +101,55 @@ class URIClassifierTest < Test::Unit::TestCase puts "\nRESOLVE(#{count}): #{res}" puts "REG_UNREG(#{count}): #{reg_unreg}" end - + + + def test_uri_branching + u = URIClassifier.new + u.register("/test", 1) + u.register("/test/this",2) + + sn,pi,h = u.resolve("/test") + sn,pi,h = u.resolve("/test/that") + assert_equal "/test", sn, "failed to properly find script off branch portion of uri" + assert_equal "/that", pi, "didn't get the right patch info" + assert_equal 1, h, "wrong result for branching uri" + end + + + def test_all_prefixing + tests = ["/test","/test/that","/test/this"] + uri = "/test/this/that" + u = URIClassifier.new + + cur = "" + uri.each_byte do |c| + cur << c.chr + u.register(cur, c) + end + + # try to resolve everything with no asserts as a fuzzing + tests.each do |prefix| + cur = "" + prefix.each_byte do |c| + cur << c.chr + sn, pi, h = u.resolve(cur) + assert sn != nil, "didn't get a script name" + assert pi != nil, "didn't get path info" + assert h != nil, "didn't find the handler" + end + end + + # assert that we find stuff + tests.each do |t| + sn, pi, h = u.resolve(t) + assert h != nil, "didn't find handler" + end + + # assert we don't find stuff + sn, pi, h = u.resolve("chicken") + assert_nil h, "shoulnd't find anything" + assert_nil sn, "shoulnd't find anything" + assert_nil pi, "shoulnd't find anything" + end end -- cgit v1.2.3-24-ge0c7