Reinstate timeouts for killing workers

The timeout mechanism is implemented via shared tempfile handles between the worker and master and checking the ctime of the tempfile from the master. Instead of using sockets or pipes to communicate between the workers and master, this allows the master to avoid being overloaded with wakeups when the workers are running at full crank (or this avoids having extra logic in workers to throttle wakeup notifications to master). The master still wakes up at a leisurely interval of once per second to check, reap, or murder workers that are timed out. [1] http://cr.yp.to/docs/selfpipe.html
author: Eric Wong <normalperson@yhbt.net> 2009-02-09 19:48:23 -0800
committer: Eric Wong <normalperson@yhbt.net> 2009-02-09 19:52:27 -0800
commit: 8c711107d8c53d899048190e5870f2689f63e529 (patch)
tree: a8b60d7ea82b6ef1857ea46063ec4ddce2dd300f /lib
parent: 02772f4a4693c0a8eb9412e60df52e325f76657a (diff)
download: unicorn-8c711107d8c53d899048190e5870f2689f63e529.tar.gz
1 files changed, 94 insertions, 31 deletions
diff --git a/lib/unicorn.rb b/lib/unicorn.rb
index a5aaf7f..9575944 100644
--- a/lib/unicorn.rb
+++ b/lib/unicorn.rb
@@ -44,6 +44,15 @@ module Unicorn
            server.logger.info("worker=#{worker_nr} spawning...")
          },
      }
+
+    Worker = Struct.new(:nr, :tempfile) unless defined?(Worker)
+    class Worker
+      # worker objects may be compared to just plain numbers
+      def ==(other_nr)
+        self.nr == other_nr
+      end
+    end
+
      # Creates a working server on host:port (strange things happen if
      # port isn't a Number).  Use HttpServer::run to start the server and
      # HttpServer.workers.join to join the thread that's processing
@@ -117,7 +126,9 @@ module Unicorn
      end
  
      # monitors children and receives signals forever
-    # (or until a termination signal is sent)
+    # (or until a termination signal is sent).  This handles signals
+    # one-at-a-time time and we'll happily drop signals in case somebody
+    # is signalling us too often.
      def join
        %w(QUIT INT TERM USR1 USR2 HUP).each { |sig| trap_deferred(sig) }
        begin
@@ -126,6 +137,7 @@ module Unicorn
            case @mode
            when :idle
              kill_each_worker(0) # ensure they're running
+            murder_lazy_workers
              spawn_missing_workers
            when 'QUIT' # graceful shutdown
              break
@@ -148,11 +160,15 @@ module Unicorn
              @mode = :idle
            end
            reap_all_workers
-          ready = IO.select([@rd_sig], nil, nil, 1) or next
+
+          ready = begin
+            IO.select([@rd_sig], nil, nil, 1) or next
+          rescue Errno::EINTR # next
+          end
            ready[0] && ready[0][0] or next
            begin # just consume the pipe when we're awakened, @mode is set
              loop { @rd_sig.sysread(Const::CHUNK_SIZE) }
-          rescue Errno::EAGAIN
+          rescue Errno::EAGAIN, Errno::EINTR # next
            end
          end
        rescue Errno::EINTR
@@ -207,8 +223,10 @@ module Unicorn
        begin
          loop do
            pid = waitpid(-1, WNOHANG) or break
-          worker_nr = @workers.delete(pid)
-          logger.info "reaped pid=#{pid} worker=#{worker_nr || 'unknown'} " \
+          worker = @workers.delete(pid)
+          worker.tempfile.close rescue nil
+          logger.info "reaped pid=#{pid} " \
+                      "worker=#{worker && worker.nr || 'unknown'} " \
                        "status=#{$?.exitstatus}"
          end
        rescue Errno::ECHILD
@@ -248,19 +266,39 @@ module Unicorn
        end
      end
  
+    # forcibly terminate all workers that haven't checked in in @timeout
+    # seconds.  The timeout is implemented using an unlinked tempfile
+    # shared between the parent process and each worker.  The worker
+    # runs File#chmod to modify the ctime of the tempfile.  If the ctime
+    # is stale for >@timeout seconds, then we'll kill the corresponding
+    # worker.
+    def murder_lazy_workers
+      now = Time.now
+      @workers.each_pair do |pid, worker|
+        (now - worker.tempfile.ctime) <= @timeout and next
+        logger.error "worker=#{worker.nr} pid=#{pid} is too old, killing"
+        kill_worker('KILL', pid) # take no prisoners for @timeout violations
+        worker.tempfile.close rescue nil
+      end
+    end
+
      def spawn_missing_workers
        return if @workers.size == @nr_workers
        (0...@nr_workers).each do |worker_nr|
          @workers.values.include?(worker_nr) and next
-        @before_fork.call(self, worker_nr)
-        pid = fork { worker_loop(worker_nr) }
-        @workers[pid] = worker_nr
+        tempfile = Tempfile.new('unicorn_worker')
+        tempfile.unlink # don't allow other processes to find or see it
+        tempfile.sync = true
+        worker = Worker.new(worker_nr, tempfile)
+        @before_fork.call(self, worker.nr)
+        pid = fork { worker_loop(worker) }
+        @workers[pid] = worker
        end
      end
  
      # once a client is accepted, it is processed in its entirety here
      # in 3 easy steps: read request, call app, write app response
-    def process_client(client, client_nr)
+    def process_client(client)
        env = @request.read(client) or return
        app_response = @app.call(env)
        HttpResponse.write(client, app_response)
@@ -279,18 +317,29 @@ module Unicorn
        @request.reset
      end
  
-    # runs inside each forked worker, this sits around and waits
-    # for connections and doesn't die until the parent dies
-    def worker_loop(worker_nr)
+    # gets rid of stuff the worker has no business keeping track of
+    # to free some resources and drops all sig handlers.
+    # traps for USR1, USR2, and HUP may be set in the @after_fork Proc
+    # by the user.
+    def init_worker_process
+      %w(TERM INT QUIT USR1 USR2 HUP).each { |sig| trap(sig, 'IGNORE') }
        @rd_sig.close
        @wr_sig.close
-      # allow @after_fork to override these signals:
-      %w(USR1 USR2 HUP).each { |sig| trap(sig, 'IGNORE') }
-      @after_fork.call(self, worker_nr) if @after_fork
-
+      @workers.values.each { |other| other.tempfile.close rescue nil }
+      @workers.clear
+      @start_ctx.clear
+      @mode = @start_ctx = @workers = @rd_sig = @wr_sig = nil
        @listeners.each { |sock| set_cloexec(sock) }
-      nr_before = nr = 0
-      client = nil
+    end
+
+    # runs inside each forked worker, this sits around and waits
+    # for connections and doesn't die until the parent dies (or is
+    # given a INT, QUIT, or TERM signal)
+    def worker_loop(worker)
+      init_worker_process
+      @after_fork.call(self, worker.nr) if @after_fork
+      nr = 0
+      tempfile = worker.tempfile
        alive = true
        ready = @listeners
        %w(TERM INT).each { |sig| trap(sig) { exit(0) } } # instant shutdown
@@ -300,8 +349,17 @@ module Unicorn
        end
  
        while alive && @master_pid == ppid
+        # we're a goner in @timeout seconds anyways if tempfile.chmod
+        # breaks, so don't trap the exception.  Using fchmod() since
+        # futimes() is not available in base Ruby and I very strongly
+        # prefer temporary files to be unlinked for security,
+        # performance and reliability reasons, so utime is out.  No-op
+        # changes with chmod doesn't update ctime on all filesystems; so
+        # we increment our counter each and every time.
+        tempfile.chmod(nr += 1)
+
          begin
-          nr_before = nr
+          accepted = false
            ready.each do |sock|
              begin
                client = begin
@@ -309,29 +367,30 @@ module Unicorn
                rescue Errno::EAGAIN
                  next
                end
-              client.sync = true
+              accepted = client.sync = true
                client.nonblock = false
                set_client_sockopt(client) if client.class == TCPSocket
-              nr += 1
-              process_client(client, nr)
+              process_client(client)
              rescue Errno::ECONNABORTED
                # client closed the socket even before accept
                if client && !client.closed?
                  client.close rescue nil
                end
              end
+            tempfile.chmod(nr += 1)
            end
  
            # make the following bet: if we accepted clients this round,
            # we're probably reasonably busy, so avoid calling select(2)
            # and try to do a blind non-blocking accept(2) on everything
            # before we sleep again in select
-          if nr != nr_before
+          if accepted
              ready = @listeners
            else
              begin
+              tempfile.chmod(nr += 1)
                # timeout used so we can detect parent death:
-              ret = IO.select(@listeners, nil, nil, @timeout) or next
+              ret = IO.select(@listeners, nil, nil, @timeout/2.0) or next
                ready = ret[0]
              rescue Errno::EBADF => e
                exit(alive ? 1 : 0)
@@ -348,15 +407,19 @@ module Unicorn
        end
      end
  
+    # delivers a signal to a worker and fails gracefully if the worker
+    # is no longer running.
+    def kill_worker(signal, pid)
+      begin
+        Process.kill(signal, pid)
+      rescue Errno::ESRCH
+        worker = @workers.delete(pid) and worker.tempfile.close rescue nil
+      end
+    end
+
      # delivers a signal to each worker
      def kill_each_worker(signal)
-      @workers.keys.each do |pid|
-        begin
-          Process.kill(signal, pid)
-        rescue Errno::ESRCH
-          @workers.delete(pid)
-        end
-      end
+      @workers.keys.each { |pid| kill_worker(signal, pid) }
      end
  
    end
author	Eric Wong <normalperson@yhbt.net>	2009-02-09 19:48:23 -0800
committer	Eric Wong <normalperson@yhbt.net>	2009-02-09 19:52:27 -0800
commit	8c711107d8c53d899048190e5870f2689f63e529 (patch)
tree	a8b60d7ea82b6ef1857ea46063ec4ddce2dd300f /lib
parent	02772f4a4693c0a8eb9412e60df52e325f76657a (diff)
download	unicorn-8c711107d8c53d899048190e5870f2689f63e529.tar.gz