Switch to Ragel/C-based chunk/trailer parser

This should be more robust, faster, and easier to deal with than the ugly proof-of-concept regexp-based chunk and trailer parsers it replaces.
author: Eric Wong <normalperson@yhbt.net> 2009-08-09 03:02:54 -0700
committer: Eric Wong <normalperson@yhbt.net> 2009-08-09 03:11:34 -0700
commit: 81026ea66279695206ea53287427c05281662572 (patch)
tree: 14909515a565f77647e233de6c1b159d85c8a97e /lib
parent: 5b9d3e4a5ea5b5832f2b91fb9d6288c59b65a199 (diff)
download: unicorn-81026ea66279695206ea53287427c05281662572.tar.gz
5 files changed, 50 insertions, 195 deletions
diff --git a/lib/unicorn.rb b/lib/unicorn.rb
index 556aba8..b185b25 100644
--- a/lib/unicorn.rb
+++ b/lib/unicorn.rb
@@ -11,8 +11,6 @@ module Unicorn
    autoload :HttpResponse, 'unicorn/http_response'
    autoload :Configurator, 'unicorn/configurator'
    autoload :TeeInput, 'unicorn/tee_input'
-  autoload :ChunkedReader, 'unicorn/chunked_reader'
-  autoload :TrailerParser, 'unicorn/trailer_parser'
    autoload :Util, 'unicorn/util'
  
    Z = '' # the stock empty string we use everywhere...
diff --git a/lib/unicorn/chunked_reader.rb b/lib/unicorn/chunked_reader.rb
deleted file mode 100644
index b813da6..0000000
--- a/lib/unicorn/chunked_reader.rb
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2009 Eric Wong
-# You can redistribute it and/or modify it under the same terms as Ruby.
-
-require 'unicorn'
-require 'unicorn_http'
-
-module Unicorn
-  class ChunkedReader
-
-    def initialize(env, input, buf)
-      @env, @input, @buf = env, input, buf
-      @chunk_left = 0
-      parse_chunk_header
-    end
-
-    def readpartial(max, buf = Z.dup)
-      while @input && @chunk_left <= 0 && ! parse_chunk_header
-        @buf << @input.readpartial(Const::CHUNK_SIZE, buf)
-      end
-
-      if @input
-        begin
-          @buf << @input.read_nonblock(Const::CHUNK_SIZE, buf)
-        rescue Errno::EAGAIN, Errno::EINTR
-        end
-      end
-
-      max = @chunk_left if max > @chunk_left
-      buf.replace(last_block(max) || Z)
-      @chunk_left -= buf.size
-      (0 == buf.size && @input.nil?) and raise EOFError
-      buf
-    end
-
-  private
-
-    def last_block(max = nil)
-      rv = @buf
-      if max && rv && max < rv.size
-        @buf = rv[max - rv.size, rv.size - max]
-        return rv[0, max]
-      end
-      @buf = Z.dup
-      rv
-    end
-
-    def parse_chunk_header
-      buf = @buf
-      # ignoring chunk-extension info for now, I haven't seen any use for it
-      # (or any users, and TE:chunked sent by clients is rare already)
-      # if there was not enough data in buffer to parse length of the chunk
-      # then just return
-      if buf.sub!(/\A(?:\r\n)?([a-fA-F0-9]{1,8})[^\r]*?\r\n/, Z)
-        @chunk_left = $1.to_i(16)
-        if 0 == @chunk_left # EOF
-          buf.sub!(/\A\r\n(?:\r\n)?/, Z) # cleanup for future requests
-          if trailer = @env[Const::HTTP_TRAILER]
-            tp = TrailerParser.new(trailer)
-            while ! tp.execute!(@env, buf)
-              buf << @input.readpartial(Const::CHUNK_SIZE)
-            end
-          end
-          @input = nil
-        end
-        return @chunk_left
-      end
-
-      buf.size > 256 and
-          raise HttpParserError,
-                "malformed chunk, chunk-length not found in buffer: " \
-                "#{buf.inspect}"
-      nil
-    end
-
-  end
-
-end
diff --git a/lib/unicorn/http_request.rb b/lib/unicorn/http_request.rb
index ec215ae..26eff1f 100644
--- a/lib/unicorn/http_request.rb
+++ b/lib/unicorn/http_request.rb
@@ -1,6 +1,4 @@
-require 'stringio'
-
-# compiled extension
+# coding:binary
  require 'unicorn_http'
  
  module Unicorn
@@ -19,18 +17,13 @@ module Unicorn
        "SERVER_SOFTWARE" => "Unicorn #{Const::UNICORN_VERSION}".freeze
      }
  
-    NULL_IO = StringIO.new(Z)
      LOCALHOST = '127.0.0.1'.freeze
  
-    def initialize
-    end
-
      # Being explicitly single-threaded, we have certain advantages in
      # not having to worry about variables being clobbered :)
-    BUFFER = ' ' * Const::CHUNK_SIZE # initial size, may grow
-    BUFFER.force_encoding(Encoding::BINARY) if Z.respond_to?(:force_encoding)
+    BUF = ' ' * Const::CHUNK_SIZE # initial size, may grow
      PARSER = HttpParser.new
-    PARAMS = Hash.new
+    REQ = {}
  
      # Does the majority of the IO processing.  It has been written in
      # Ruby using about 8 different IO processing strategies.
@@ -46,7 +39,7 @@ module Unicorn
      # This does minimal exception trapping and it is up to the caller
      # to handle any socket errors (e.g. user aborted upload).
      def read(socket)
-      PARAMS.clear
+      REQ.clear
        PARSER.reset
  
        # From http://www.ietf.org/rfc/rfc3875:
@@ -56,42 +49,30 @@ module Unicorn
        #  identify the client for the immediate request to the server;
        #  that client may be a proxy, gateway, or other intermediary
        #  acting on behalf of the actual source client."
-      PARAMS[Const::REMOTE_ADDR] =
+      REQ[Const::REMOTE_ADDR] =
                      TCPSocket === socket ? socket.peeraddr.last : LOCALHOST
  
        # short circuit the common case with small GET requests first
-      PARSER.execute(PARAMS, socket.readpartial(Const::CHUNK_SIZE, BUFFER)) and
+      PARSER.headers(REQ, socket.readpartial(Const::CHUNK_SIZE, BUF)) and
            return handle_body(socket)
  
-      data = BUFFER.dup # socket.readpartial will clobber BUFFER
+      data = BUF.dup # socket.readpartial will clobber data
  
        # Parser is not done, queue up more data to read and continue parsing
        # an Exception thrown from the PARSER will throw us out of the loop
        begin
-        data << socket.readpartial(Const::CHUNK_SIZE, BUFFER)
-        PARSER.execute(PARAMS, data) and return handle_body(socket)
+        BUF << socket.readpartial(Const::CHUNK_SIZE, data)
+        PARSER.headers(REQ, BUF) and return handle_body(socket)
        end while true
      end
  
      private
  
      # Handles dealing with the rest of the request
-    # returns a Rack environment if successful
+    # returns a Rack environment if successful
      def handle_body(socket)
-      PARAMS[Const::RACK_INPUT] = if (body = PARAMS.delete(:http_body))
-        length = PARAMS[Const::CONTENT_LENGTH].to_i
-
-        if /\Achunked\z/i =~ PARAMS[Const::HTTP_TRANSFER_ENCODING]
-          socket = ChunkedReader.new(PARAMS, socket, body)
-          length = body = nil
-        end
-
-        TeeInput.new(socket, length, body)
-      else
-        NULL_IO
-      end
-
-      PARAMS.update(DEFAULTS)
+      REQ[Const::RACK_INPUT] = Unicorn::TeeInput.new(socket)
+      REQ.update(DEFAULTS)
      end
  
    end
diff --git a/lib/unicorn/tee_input.rb b/lib/unicorn/tee_input.rb
index bbc496b..07676a6 100644
--- a/lib/unicorn/tee_input.rb
+++ b/lib/unicorn/tee_input.rb
@@ -13,15 +13,23 @@
  module Unicorn
    class TeeInput
  
-    def initialize(input, size, body)
-      @tmp = Unicorn::Util.tmpio
-
-      if body
-        @tmp.write(body)
+    # it's so awesome to not have to care for thread safety...
+
+    RAW = HttpRequest::BUF # :nodoc:
+    DST = RAW.dup # :nodoc:
+    PARSER = HttpRequest::PARSER # :nodoc:
+    REQ = HttpRequest::REQ # :nodoc:
+
+    def initialize(socket)
+      @tmp = Util.tmpio
+      @size = PARSER.content_length
+      return(@input = nil) if 0 == @size
+      @input = socket
+      if RAW.size > 0
+        PARSER.read_body(DST, RAW) and finalize_input
+        @tmp.write(DST)
          @tmp.seek(0)
        end
-      @input = input
-      @size = size # nil if chunked
      end
  
      # returns the size of the input.  This is what the Content-Length
@@ -32,10 +40,10 @@ module Unicorn
        @size and return @size
  
        if @input
-        buf = Z.dup
-        while tee(Const::CHUNK_SIZE, buf)
+        pos = @tmp.tell
+        while tee(Const::CHUNK_SIZE, DST)
          end
-        @tmp.rewind
+        @tmp.seek(pos)
        end
  
        @size = @tmp.stat.size
@@ -47,13 +55,12 @@ module Unicorn
        length = args.shift
        if nil == length
          rv = @tmp.read || Z.dup
-        tmp = Z.dup
-        while tee(Const::CHUNK_SIZE, tmp)
-          rv << tmp
+        while tee(Const::CHUNK_SIZE, DST)
+          rv << DST
          end
          rv
        else
-        buf = args.shift || Z.dup
+        buf = args.shift || DST.dup
          diff = @tmp.stat.size - @tmp.pos
          if 0 == diff
            tee(length, buf)
@@ -70,7 +77,7 @@ module Unicorn
  
        orig_size = @tmp.stat.size
        if @tmp.pos == orig_size
-        tee(Const::CHUNK_SIZE, Z.dup) or return nil
+        tee(Const::CHUNK_SIZE, DST) or return nil
          @tmp.seek(orig_size)
        end
  
@@ -79,8 +86,8 @@ module Unicorn
  
        # unlikely, if we got here, then @tmp is at EOF
        begin
-        orig_size = @tmp.stat.size
-        tee(Const::CHUNK_SIZE, Z.dup) or break
+        orig_size = @tmp.pos
+        tee(Const::CHUNK_SIZE, DST) or break
          @tmp.seek(orig_size)
          line << @tmp.gets
          $/ == line[-$/.size, $/.size] and return line
@@ -108,25 +115,23 @@ module Unicorn
      # backing store as well as returning it.  +buf+ must be specified.
      # returns nil if reading from the input returns nil
      def tee(length, buf)
-      begin
-        if @size
-          left = @size - @tmp.stat.size
-          0 == left and return nil
-          if length >= left
-            @input.readpartial(left, buf) == left and @input = nil
-          elsif @input.nil?
-            return nil
-          else
-            @input.readpartial(length, buf)
+      unless PARSER.body_eof?
+        begin
+          if PARSER.read_body(buf, @input.readpartial(length, RAW)).nil?
+            @tmp.write(buf)
+            return buf
            end
-        else # ChunkedReader#readpartial just raises EOFError when done
-          @input.readpartial(length, buf)
+        rescue EOFError
          end
-      rescue EOFError
-        return @input = nil
        end
-      @tmp.write(buf)
-      buf
+      finalize_input
+    end
+
+    def finalize_input
+      while PARSER.trailers(REQ, RAW).nil?
+        RAW << @input.readpartial(Const::CHUNK_SIZE, DST)
+      end
+      @input = nil
      end
  
    end
diff --git a/lib/unicorn/trailer_parser.rb b/lib/unicorn/trailer_parser.rb
deleted file mode 100644
index 22f2e1d..0000000
--- a/lib/unicorn/trailer_parser.rb
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2009 Eric Wong
-# You can redistribute it and/or modify it under the same terms as Ruby.
-require 'unicorn'
-require 'unicorn_http'
-
-# Eventually I should integrate this into HttpParser...
-module Unicorn
-  class TrailerParser
-
-    TR_FR = 'a-z-'.freeze
-    TR_TO = 'A-Z_'.freeze
-
-    # initializes HTTP trailer parser with acceptable +trailer+
-    def initialize(http_trailer)
-      @trailers = http_trailer.split(/\s*,\s*/).inject({}) { |hash, key|
-        hash[key.tr(TR_FR, TR_TO)] = true
-        hash
-      }
-    end
-
-    # Executes our TrailerParser on +data+ and modifies +env+  This will
-    # shrink +data+ as it is being consumed.  Returns true if it has
-    # parsed all trailers, false if not.  It raises HttpParserError on
-    # parse failure or unknown headers.  It has slightly smaller limits
-    # than the C-based HTTP parser but should not be an issue in practice
-    # since Content-MD5 is probably the only legitimate use for it.
-    def execute!(env, data)
-      data.size > 0xffff and
-        raise HttpParserError, "trailer buffer too large: #{data.size} bytes"
-
-      begin
-        data.sub!(/\A([^\r]+)\r\n/, Z) or return false # need more data
-
-        key, val = $1.split(/:\s*/, 2)
-
-        key.size > 256 and
-          raise HttpParserError, "trailer key #{key.inspect} is too long"
-        val.size > 8192 and
-          raise HttpParserError, "trailer value #{val.inspect} is too long"
-
-        key.tr!(TR_FR, TR_TO)
-
-        @trailers.delete(key) or
-          raise HttpParserError, "unknown trailer: #{key.inspect}"
-        env["HTTP_#{key}"] = val
-
-        @trailers.empty? and return true
-      end while true
-    end
-
-  end
-end
author	Eric Wong <normalperson@yhbt.net>	2009-08-09 03:02:54 -0700
committer	Eric Wong <normalperson@yhbt.net>	2009-08-09 03:11:34 -0700
commit	81026ea66279695206ea53287427c05281662572 (patch)
tree	14909515a565f77647e233de6c1b159d85c8a97e /lib
parent	5b9d3e4a5ea5b5832f2b91fb9d6288c59b65a199 (diff)
download	unicorn-81026ea66279695206ea53287427c05281662572.tar.gz