ruby_io_splice.git  about / heads / tags
zero-copy pipe I/O for Linux and Ruby
blob a0ba3812e8b7b2da91c79a1a4d111a58ea643256 16593 bytes (raw)
$ git show v3.1.0:ext/io_splice/io_splice_ext.c	# shows this blob on the CLI

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
 
#include "ruby.h"
#ifdef HAVE_RUBY_IO_H
#  include "ruby/io.h"
#else
#  include "rubyio.h"
#endif
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <sys/uio.h>
#include <limits.h>
#include <alloca.h>
#include <sys/utsname.h>

static VALUE sym_EAGAIN;

#ifndef F_LINUX_SPECIFIC_BASE
#  define F_LINUX_SPECIFIC_BASE 1024
#endif

#ifndef F_GETPIPE_SZ
#  define F_SETPIPE_SZ    (F_LINUX_SPECIFIC_BASE + 7)
#  define F_GETPIPE_SZ    (F_LINUX_SPECIFIC_BASE + 8)
#endif

#if ! HAVE_RB_IO_T
#  define rb_io_t OpenFile
#endif

#ifdef GetReadFile
#  define FPTR_TO_FD(fptr) (fileno(GetReadFile(fptr)))
#else
#  if !HAVE_RB_IO_T || (RUBY_VERSION_MAJOR == 1 && RUBY_VERSION_MINOR == 8)
#    define FPTR_TO_FD(fptr) fileno(fptr->f)
#  else
#    define FPTR_TO_FD(fptr) fptr->fd
#  endif
#endif

/*
 * Returns the file descriptor number for +io+.
 * A Fixnum is returned as-is (converted with NUM2INT), a T_FILE object
 * yields the descriptor of its open file, and any other object is first
 * converted with rb_convert_type() using "to_io".
 */
static int my_fileno(VALUE io)
{
	rb_io_t *fptr;

	for (;;) {
		int type = TYPE(io);

		if (type == T_FIXNUM)
			return NUM2INT(io);
		if (type == T_FILE)
			break;

		/* neither an integer nor an IO yet: convert, then re-check */
		io = rb_convert_type(io, T_FILE, "IO", "to_io");
	}

	GetOpenFile(io, fptr);
	return FPTR_TO_FD(fptr);
}
#ifndef HAVE_RB_THREAD_BLOCKING_REGION
/* partial emulation of the 1.9 rb_thread_blocking_region under 1.8 */
#  include <rubysig.h>
#  define RUBY_UBF_IO ((rb_unblock_function_t *)-1)
typedef void rb_unblock_function_t(void *);
typedef VALUE rb_blocking_function_t(void *);
/*
 * Fallback compiled only when HAVE_RB_THREAD_BLOCKING_REGION is not
 * defined.  Calls fn(data1) between TRAP_BEG and TRAP_END and returns
 * its result.
 *
 * Only RUBY_UBF_IO is accepted as +ubf+ (checked with assert());
 * +data2+ is not used by this emulation.
 */
static VALUE
rb_thread_blocking_region(
	rb_blocking_function_t *fn, void *data1,
	rb_unblock_function_t *ubf, void *data2)
{
	VALUE rv;

	/* no other unblock function can be honoured by this emulation */
	assert(RUBY_UBF_IO == ubf && "RUBY_UBF_IO required for emulation");

	TRAP_BEG;
	rv = fn(data1);
	TRAP_END;

	return rv;
}
#endif /* ! HAVE_RB_THREAD_BLOCKING_REGION */

#ifndef RSTRING_PTR
#  define RSTRING_PTR(s) (RSTRING(s)->ptr)
#endif
#ifndef RSTRING_LEN
#  define RSTRING_LEN(s) (RSTRING(s)->len)
#endif
#ifndef RARRAY_PTR
#  define RARRAY_PTR(s) (RARRAY(s)->ptr)
#endif
#ifndef RARRAY_LEN
#  define RARRAY_LEN(s) (RARRAY(s)->len)
#endif

/*
 * Runs fn(data) through rb_thread_blocking_region() with RUBY_UBF_IO as
 * the unblock function and no unblock argument; returns fn's result.
 */
static VALUE io_run(rb_blocking_function_t *fn, void *data)
{
	void *ubf_arg = 0;

	return rb_thread_blocking_region(fn, data, RUBY_UBF_IO, ubf_arg);
}

/*
 * Releases the GVL only if blocking I/O is used: when +flags+ contains
 * SPLICE_F_NONBLOCK, +fn+ is called directly, otherwise it goes through
 * io_run().
 *
 * Only use this if all file descriptors in data are pipes.
 * We'll trust programmers who use non-blocking I/O explicitly to
 * want the fastest possible performance without resorting to threads,
 * so releasing and then immediately reacquiring the GVL would be
 * a waste of time.
 */
static VALUE nb_io_run(rb_blocking_function_t *fn, void *data, unsigned flags)
{
	int blocking = !(flags & SPLICE_F_NONBLOCK);

	return blocking ? io_run(fn, data) : fn(data);
}

/*
 * Arguments for a single splice() call.  Filled in by do_splice() and
 * handed to nogvl_splice() through a void pointer.
 */
struct splice_args {
	int fd_in;	/* first (input) descriptor argument of splice() */
	off_t *off_in;	/* input offset, or NULL when nil was given */
	int fd_out;	/* output descriptor argument of splice() */
	off_t *off_out;	/* output offset, or NULL when nil was given */
	size_t len;	/* length argument of splice() */
	unsigned flags;	/* caller-supplied flags OR-ed with the defaults */
};

static VALUE nogvl_splice(void *ptr)
{
	struct splice_args *a = ptr;

	return (VALUE)splice(a->fd_in, a->off_in, a->fd_out, a->off_out,
	                     a->len, a->flags);
}

/*
 * Shared implementation of IO.splice and IO.trysplice.
 * Parses five required and one optional Ruby argument, converts them to
 * a struct splice_args and runs nogvl_splice() through io_run().
 * +dflags+ is always OR-ed into the flags passed to splice().
 * Returns the result of splice() as a long.
 */
static long do_splice(int argc, VALUE *argv, unsigned dflags)
{
	off_t in_pos, out_pos;
	VALUE fd_in, off_in, fd_out, off_out, len, flags;
	struct splice_args args;
	VALUE result;

	rb_scan_args(argc, argv, "51",
	             &fd_in, &off_in, &fd_out, &off_out, &len, &flags);

	/* a nil offset is passed to splice() as a NULL pointer */
	if (NIL_P(off_in)) {
		args.off_in = NULL;
	} else {
		in_pos = NUM2OFFT(off_in);
		args.off_in = &in_pos;
	}

	if (NIL_P(off_out)) {
		args.off_out = NULL;
	} else {
		out_pos = NUM2OFFT(off_out);
		args.off_out = &out_pos;
	}

	args.fd_in = my_fileno(fd_in);
	args.fd_out = my_fileno(fd_out);
	args.len = (size_t)NUM2ULONG(len);

	args.flags = dflags;
	if (!NIL_P(flags))
		args.flags |= NUM2UINT(flags);

	result = io_run(nogvl_splice, &args);
	return (long)result;
}

/*
 * call-seq:
 *    IO.splice(fd_in, off_in, fd_out, off_out, len) => integer
 *    IO.splice(fd_in, off_in, fd_out, off_out, len, flags) => integer
 *
 * Splice +len+ bytes from/to a pipe.  Either +fd_in+ or +fd_out+
 * MUST be a pipe.  +fd_in+ and +fd_out+ may BOTH be pipes as of
 * Linux 2.6.31 or later.
 *
 * +off_in+ and +off_out+ if non-nil may be used to
 * specify an offset for the non-pipe file descriptor.
 *
 * +flags+ defaults to zero if unspecified.
 * +flags+ may be a bitmask of the following flags:
 *
 * IO::Splice::F_MOVE, IO::Splice::F_NONBLOCK, IO::Splice::F_MORE
 *
 * Returns the number of bytes spliced.
 * Raises EOFError when +fd_in+ has reached end of file.
 * Raises Errno::EAGAIN if the IO::Splice::F_NONBLOCK flag is set
 * and the pipe has no data to read from or space to write to.  May
 * also raise Errno::EAGAIN if the non-pipe descriptor has no data
 * to read from or space to write to.
 *
 *     rd, wr = (pipe = IO.pipe).map { |io| io.fileno }
 *     src_io, dst_io = File.open("/path/to/src"), File.open("/path/to/dst", "w")
 *     src, dst = src_io.fileno, dst_io.fileno
 *
 *     nr = IO.splice(src, nil, wr, nil, IO::Splice::PIPE_CAPA, 0)
 *     IO.splice(rd, nil, dst, nil, nr, 0)
 *
 * As splice never exposes buffers to userspace, it will not take
 * into account userspace buffering done by Ruby or stdio.  It is
 * also not subject to encoding/decoding filters under Ruby 1.9.
 *
 * Consider using IO.trysplice if you are using non-blocking I/O on
 * both descriptors as it avoids the cost of raising common Errno::EAGAIN
 * exceptions.
 *
 * See manpage for full documentation:
 * http://kernel.org/doc/man-pages/online/pages/man2/splice.2.html
 */
static VALUE my_splice(int argc, VALUE *argv, VALUE self)
{
	long nbytes = do_splice(argc, argv, 0);

	if (nbytes > 0)
		return LONG2NUM(nbytes);

	/* negative: splice() failed, raise from errno */
	if (nbytes < 0)
		rb_sys_fail("splice");

	/* zero bytes spliced is reported as end of file */
	rb_eof_error();
	return Qnil; /* not reached */
}

/*
 * call-seq:
 *    IO.trysplice(fd_in, off_in, fd_out, off_out, len) => integer
 *    IO.trysplice(fd_in, off_in, fd_out, off_out, len, flags) => integer
 *
 * Exactly like IO.splice, except +:EAGAIN+ is returned when either
 * the read or write end would block instead of raising Errno::EAGAIN,
 * and +nil+ is returned at end of file instead of raising EOFError.
 *
 * IO::Splice::F_NONBLOCK is always passed for the pipe descriptor,
 * but this can still block if the non-pipe descriptor is blocking.
 *
 * See IO.splice documentation for more details.
 */
static VALUE trysplice(int argc, VALUE *argv, VALUE self)
{
	long nbytes = do_splice(argc, argv, SPLICE_F_NONBLOCK);

	if (nbytes > 0)
		return LONG2NUM(nbytes);

	/* nothing spliced: nil instead of an EOFError */
	if (nbytes == 0)
		return Qnil;

	/* would-block is reported as a symbol, anything else is raised */
	if (errno != EAGAIN)
		rb_sys_fail("splice");
	return sym_EAGAIN;
}

/*
 * Arguments for a single tee() call.  Filled in by do_tee() and handed
 * to nogvl_tee() through a void pointer.
 */
struct tee_args {
	int fd_in;	/* first (input) descriptor argument of tee() */
	int fd_out;	/* second (output) descriptor argument of tee() */
	size_t len;	/* length argument of tee() */
	unsigned flags;	/* caller-supplied flags OR-ed with the defaults */
};

/* runs without GVL */
static VALUE nogvl_tee(void *ptr)
{
	struct tee_args *a = ptr;

	return (VALUE)tee(a->fd_in, a->fd_out, a->len, a->flags);
}

/*
 * Shared implementation of IO.tee and IO.trytee.
 * Parses three required and one optional Ruby argument, converts them
 * to a struct tee_args and runs nogvl_tee() through nb_io_run().
 * +dflags+ is always OR-ed into the flags passed to tee().
 * Returns the result of tee() as a long.
 */
static long do_tee(int argc, VALUE *argv, unsigned dflags)
{
	VALUE fd_in, fd_out, len, flags;
	struct tee_args args;
	VALUE result;

	rb_scan_args(argc, argv, "31", &fd_in, &fd_out, &len, &flags);

	args.fd_in = my_fileno(fd_in);
	args.fd_out = my_fileno(fd_out);
	args.len = (size_t)NUM2ULONG(len);

	args.flags = dflags;
	if (!NIL_P(flags))
		args.flags |= NUM2UINT(flags);

	result = nb_io_run(nogvl_tee, &args, args.flags);
	return (long)result;
}

/*
 * call-seq:
 *   IO.tee(fd_in, fd_out, len) => integer
 *   IO.tee(fd_in, fd_out, len, flags) => integer
 *
 * Copies up to +len+ bytes of data from +fd_in+ to +fd_out+.  +fd_in+
 * and +fd_out+ must both refer to pipe descriptors.  +fd_in+ and +fd_out+
 * may not be endpoints of the same pipe.
 *
 * +flags+ may be zero (the default) or IO::Splice::F_NONBLOCK
 * Other IO::Splice flags are currently unimplemented or have no effect.
 *
 * Returns the number of bytes duplicated if successful.
 * Raises EOFError when +fd_in+ is closed and emptied.
 * Raises Errno::EAGAIN when +fd_in+ is empty and/or +fd_out+ is full
 * and +flags+ contains IO::Splice::F_NONBLOCK
 *
 * Consider using IO.trytee if you are using IO::Splice::F_NONBLOCK
 * as it avoids the cost of raising common Errno::EAGAIN exceptions.
 *
 * See manpage for full documentation:
 * http://kernel.org/doc/man-pages/online/pages/man2/tee.2.html
 */
static VALUE my_tee(int argc, VALUE *argv, VALUE self)
{
	long nbytes = do_tee(argc, argv, 0);

	if (nbytes > 0)
		return LONG2NUM(nbytes);

	/* negative: tee() failed, raise from errno */
	if (nbytes < 0)
		rb_sys_fail("tee");

	/* zero bytes duplicated is reported as end of file */
	rb_eof_error();
	return Qnil; /* not reached */
}

/*
 * call-seq:
 *    IO.trytee(fd_in, fd_out, len) => integer
 *    IO.trytee(fd_in, fd_out, len, flags) => integer
 *
 * Exactly like IO.tee, except +:EAGAIN+ is returned when either
 * the read or write end would block instead of raising Errno::EAGAIN,
 * and +nil+ is returned instead of raising EOFError.
 *
 * IO::Splice::F_NONBLOCK is always passed for the pipe descriptor,
 * but this can still block if the non-pipe descriptor is blocking.
 *
 * See IO.tee documentation for more details.
 */
static VALUE trytee(int argc, VALUE *argv, VALUE self)
{
	long nbytes = do_tee(argc, argv, SPLICE_F_NONBLOCK);

	if (nbytes > 0)
		return LONG2NUM(nbytes);

	/* nothing duplicated: nil instead of an EOFError */
	if (nbytes == 0)
		return Qnil;

	/* would-block is reported as a symbol, anything else is raised */
	if (errno != EAGAIN)
		rb_sys_fail("tee");
	return sym_EAGAIN;
}

/*
 * Arguments for a single vmsplice() call.  my_vmsplice() fills this in
 * and advance_vmsplice_args() updates +iov+ and +nr_segs+ between
 * retries after a partial write.
 */
struct vmsplice_args {
	int fd;			/* descriptor argument of vmsplice() */
	struct iovec *iov;	/* first iovec still to be written */
	unsigned long nr_segs;	/* number of iovecs starting at +iov+ */
	unsigned flags;		/* flags argument of vmsplice() */
};

static VALUE nogvl_vmsplice(void *ptr)
{
	struct vmsplice_args *a = ptr;

	return (VALUE)vmsplice(a->fd, a->iov, a->nr_segs, a->flags);
}

/*
 * Builds an iovec array from the Ruby Array +ary+.
 *
 * On completion:
 *   +iov+    points to an alloca()-ed array with one struct iovec per
 *            element of +ary+
 *   +iovcnt+ holds the number of elements
 *   +expect+ holds the sum of all iov_len values
 *
 * Raises ArgumentError if +ary+ has more than IOV_MAX elements, and
 * applies Check_Type(..., T_STRING) to every element.
 *
 * +ary+ is evaluated more than once, so pass a plain variable.
 *
 * this can't be a function since we use alloca()
 */
#define ARY2IOVEC(iov,iovcnt,expect,ary) \
do { \
	VALUE *cur; \
	struct iovec *tmp; \
	long n; \
	cur = RARRAY_PTR(ary); \
	n = RARRAY_LEN(ary); \
	if (n > IOV_MAX) \
		rb_raise(rb_eArgError, "array is larger than IOV_MAX"); \
	iov = tmp = alloca(sizeof(struct iovec) * n); \
	expect = 0; \
	iovcnt = n; \
	for (; --n >= 0; tmp++, cur++) { \
		Check_Type(*cur, T_STRING); \
		tmp->iov_base = RSTRING_PTR(*cur); \
		tmp->iov_len = RSTRING_LEN(*cur); \
		expect += tmp->iov_len; \
	} \
} while (0)

/*
 * Advances +a+ past the first +n+ bytes of its iovec list so that a
 * retry continues where the previous write stopped.
 *
 * Completely written iovecs are dropped from the front; a partially
 * written iovec is adjusted in place (base moved forward, length
 * reduced) and becomes the new first element.
 */
static void advance_vmsplice_args(struct vmsplice_args *a, long n)
{
	struct iovec *cur = a->iov;
	unsigned long consumed = 0;

	while (consumed < a->nr_segs && n != 0) {
		if (cur->iov_len > (size_t)n) {
			/* partially written: keep it at the front, trimmed */
			cur->iov_base = (char *)cur->iov_base + n;
			cur->iov_len -= n;
			break;
		}

		/* fully written: account for it and move to the next one */
		n -= cur->iov_len;
		consumed++;
		cur++;
	}

	/* setup to retry without the already-written iovecs */
	a->nr_segs -= consumed;
	a->iov = cur;
}

/*
 * call-seq:
 *   IO.vmsplice(fd, string_array) => integer
 *   IO.vmsplice(fd, string_array, flags) => integer
 *   IO.vmsplice(fd, string) => integer
 *   IO.vmsplice(fd, string, flags) => integer
 *
 * Transfers an array of strings into the pipe descriptor given by fd.
 * +fd+ must be the writable end of a pipe.
 *
 * This may allow the kernel to avoid data copies in some cases,
 * but is (probably) of limited usefulness in Ruby.  If you have
 * use cases or ideas for making this more useful for Ruby users,
 * please tell us at ruby.io.splice@librelist.com!
 *
 * Also consider the "sendfile" RubyGem or IO.copy_stream in Ruby 1.9
 * if you want to do zero-copy file transfers to pipes or sockets.  As
 * of Linux 2.6.33, sendfile(2) can copy to any output descriptor,
 * not just sockets.
 *
 * See manpage for full documentation:
 * http://kernel.org/doc/man-pages/online/pages/man2/vmsplice.2.html
 */
static VALUE my_vmsplice(int argc, VALUE * argv, VALUE self)
{
	long rv = 0;		/* total bytes written so far */
	ssize_t left;		/* bytes still to be written */
	struct vmsplice_args a;
	/*
	 * Backing storage for the single-String case.  It must live at
	 * function scope because a.iov keeps pointing at it for the whole
	 * retry loop below; declaring it inside the switch case would
	 * leave a.iov dangling once that block ends.
	 */
	struct iovec iov;
	VALUE fd, data, flags;

	rb_scan_args(argc, argv, "21", &fd, &data, &flags);

	switch (TYPE(data)) {
	case T_STRING:
		iov.iov_base = RSTRING_PTR(data);
		iov.iov_len = (size_t)(left = (ssize_t)RSTRING_LEN(data));
		a.iov = &iov;
		a.nr_segs = 1;
		break;
	case T_ARRAY:
		ARY2IOVEC(a.iov, a.nr_segs, left, data);
		break;
	default:
		rb_raise(rb_eTypeError, "wrong argument type %s "
		         "(expected a String or Array of strings)",
		         rb_obj_classname(data));
	}
	a.fd = my_fileno(fd);
	a.flags = NIL_P(flags) ? 0 : NUM2UINT(flags);

	/* keep writing until everything is written or an error occurs */
	for (;;) {
		long n = (long)nb_io_run(nogvl_vmsplice, &a, a.flags);

		if (n < 0) {
			if (errno == EAGAIN) {
				/* the caller asked for non-blocking: raise */
				if (a.flags & SPLICE_F_NONBLOCK)
					rb_sys_fail("vmsplice");
				else if (rb_io_wait_writable(a.fd))
					continue;
				/* fall through on error */
			}
			/*
			 * unlikely to hit this case, return the
			 * already written bytes, we'll let the next
			 * write (or close) fail instead
			 */
			if (rv > 0)
				break;
			rb_sys_fail("vmsplice");
		}

		rv += n;
		left -= n;
		if (left == 0)
			break;

		/* partial write: drop what was written and retry the rest */
		advance_vmsplice_args(&a, n);
	}

	return LONG2NUM(rv);
}

/*
 * call-seq:
 *   reader, writer = IO.pipe
 *   reader.pipe_size => integer
 *
 * Returns the pipe capacity of the underlying pipe in bytes.  The
 * default capacity is 65536 bytes since Linux 2.6.11, and 4096 bytes
 * in previous kernels.
 *
 * Since the pipe is a circular buffer in the same kernel, the size
 * of the reader is exactly the same as the size of the writer.
 *
 * This method is only exposed on Linux 2.6.35 or later.
 */
static VALUE pipe_size(VALUE self)
{
	int fd = my_fileno(self);
	int capacity = fcntl(fd, F_GETPIPE_SZ);

	if (capacity >= 0)
		return INT2NUM(capacity);

	rb_sys_fail("fcntl(F_GETPIPE_SZ)");
	return Qnil; /* not reached */
}

/*
 * call-seq:
 *   reader, writer = IO.pipe
 *   reader.pipe_size = integer
 *
 * Sets and returns the pipe capacity of the underlying pipe in bytes.
 *
 * This MUST be a power-of-two, or Errno::EINVAL will be raised.
 * Linux will silently increase this to be equal to the page size
 * (4096 bytes on most architectures) if the specified value is
 * less than the size of a page.
 *
 * For users without CAP_SYS_RESOURCE, this raises Errno::EPERM when
 * attempting to specify a value greater than the value in
 * /proc/sys/fs/pipe-max-size.
 *
 * Since the pipe is a circular buffer in the same kernel, the size
 * of the reader is exactly the same as the size of the writer.
 *
 * Raises Errno::EBUSY if the assigned value is less than
 * the currently filled portion of the pipe.
 *
 * This method is only exposed on Linux 2.6.35 or later.
 */
static VALUE set_pipe_size(VALUE self, VALUE size)
{
	int fd = my_fileno(self);
	int bytes = NUM2INT(size);

	if (fcntl(fd, F_SETPIPE_SZ, bytes) >= 0)
		return size;

	/* out of memory: run the Ruby GC and retry exactly once */
	if (errno == ENOMEM) {
		rb_gc();
		if (fcntl(fd, F_SETPIPE_SZ, bytes) >= 0)
			return size;
	}

	rb_sys_fail("fcntl(F_SETPIPE_SZ)");
	return Qnil; /* not reached */
}

/*
 * Returns non-zero when the kernel release string +release+ (for
 * example "2.6.35-rc3" or "3.10.0-generic") denotes a version that is
 * at least major.minor.patch.
 *
 * Up to three dot-separated leading numeric components are parsed and
 * compared as numbers, not as text, so "2.6.9" is older than "2.6.35"
 * and "10.0" is newer.  Missing components count as zero and any
 * suffix after the numeric part is ignored.
 */
static int kernel_at_least(const char *release,
                           unsigned long major,
                           unsigned long minor,
                           unsigned long patch)
{
	unsigned long want[3];
	unsigned long have[3] = { 0, 0, 0 };
	int i;

	want[0] = major;
	want[1] = minor;
	want[2] = patch;

	for (i = 0; i < 3; i++) {
		while (*release >= '0' && *release <= '9')
			have[i] = have[i] * 10 +
			          (unsigned long)(*release++ - '0');
		if (*release != '.')
			break;
		release++;
	}

	for (i = 0; i < 3; i++) {
		if (have[i] != want[i])
			return have[i] > want[i];
	}

	return 1; /* exactly the requested version */
}

void Init_io_splice_ext(void)
{
	VALUE mSplice = rb_define_module_under(rb_cIO, "Splice");
	struct utsname utsname;

	rb_define_singleton_method(rb_cIO, "splice", my_splice, -1);
	rb_define_singleton_method(rb_cIO, "trysplice", trysplice, -1);
	rb_define_singleton_method(rb_cIO, "tee", my_tee, -1);
	rb_define_singleton_method(rb_cIO, "trytee", trytee, -1);
	rb_define_singleton_method(rb_cIO, "vmsplice", my_vmsplice, -1);

	/*
	 * Attempt to move pages instead of copying.  This is only a hint
	 * and support for it was removed in Linux 2.6.21.  It will be
	 * re-added for FUSE filesystems only in Linux 2.6.35.
	 */
	rb_define_const(mSplice, "F_MOVE", UINT2NUM(SPLICE_F_MOVE));

	/*
	 * Do not block on pipe I/O.  This flag only affects the pipe(s)
	 * being spliced from/to and has no effect on the non-pipe
	 * descriptor (which requires non-blocking operation to be set
	 * explicitly).
	 *
	 * The non-blocking flag (O_NONBLOCK) on the pipe descriptors
	 * themselves are ignored by this family of functions, and
	 * using this flag is the only way to get non-blocking operation
	 * out of them.
	 */
	rb_define_const(mSplice, "F_NONBLOCK", UINT2NUM(SPLICE_F_NONBLOCK));

	/*
	 * Indicate that there may be more data coming into the outbound
	 * descriptor.  This can allow the kernel to avoid sending partial
	 * frames from sockets.  Currently only used with splice.
	 */
	rb_define_const(mSplice, "F_MORE", UINT2NUM(SPLICE_F_MORE));

	/*
	 * Only usable by vmsplice.  This flag is probably not useful in the
	 * context of Ruby applications which cannot control alignment.
	 */
	rb_define_const(mSplice, "F_GIFT", UINT2NUM(SPLICE_F_GIFT));

	/*
	 * The maximum size of an atomic write to a pipe
	 * POSIX requires this to be at least 512 bytes.
	 * Under Linux, this is 4096 bytes.
	 */
	rb_define_const(mSplice, "PIPE_BUF", UINT2NUM(PIPE_BUF));

	if (uname(&utsname) == -1)
		rb_sys_fail("uname");

	/*
	 * includes 2.6.35-rc[1-6]
	 * The release is compared numerically; a plain string comparison
	 * would treat "2.6.9" as newer than "2.6.35".
	 */
	if (kernel_at_least(utsname.release, 2, 6, 35)) {
		rb_define_method(rb_cIO, "pipe_size", pipe_size, 0);
		rb_define_method(rb_cIO, "pipe_size=", set_pipe_size, 1);

		/*
		 * fcntl() command constant used to return the size of a pipe.
		 * This constant is only defined when running Linux 2.6.35
		 * or later.  For convenience, use IO#pipe_size instead.
		 */
		rb_define_const(mSplice, "F_GETPIPE_SZ",
		                UINT2NUM(F_GETPIPE_SZ));

		/*
		 * fcntl() command constant used to set the size of a pipe.
		 * This constant is only defined when running Linux 2.6.35
		 * or later.  For convenience, use IO#pipe_size= instead.
		 */
		rb_define_const(mSplice, "F_SETPIPE_SZ",
		                UINT2NUM(F_SETPIPE_SZ));
	}

	sym_EAGAIN = ID2SYM(rb_intern("EAGAIN"));
}

git clone https://yhbt.net/ruby_io_splice.git