2 * Network Block Device - server
4 * Copyright 1996-1998 Pavel Machek, distribute under GPL
5 * <pavel@atrey.karlin.mff.cuni.cz>
6 * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7 * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
9 * Version 1.0 - hopefully 64-bit-clean
10 * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11 * Version 1.2 - autodetect size of block devices, thanx to Peter T. Breuer <ptb@it.uc3m.es>
12 * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13 * type, or don't have 64 bit file offsets by defining FS_32BIT
14 * in compile options for nbd-server *only*. This can be done
15 * with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16 * original autoconf input file, or I would make it a configure
17 * option.) Ken Yap <ken@nlc.net.au>.
18 * Version 1.6 - fix autodetection of block device size and really make 64 bit
19 * clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20 * Version 2.0 - Version synchronised with client
21 * Version 2.1 - Reap zombie client processes when they exit. Removed
22 * (uncommented) the _IO magic, it's no longer necessary. Wouter
23 * Verhelst <wouter@debian.org>
24 * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25 * Version 2.3 - Fixed code so that Large File Support works. This
26 * removes the FS_32BIT compile-time directive; define
27 * _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28 * using FS_32BIT. This will allow you to use files >2GB instead of
29 * having to use the -m option. Wouter Verhelst <wouter@debian.org>
30 * Version 2.4 - Added code to keep track of children, so that we can
31 * properly kill them from initscripts. Add a call to daemon(),
32 * so that processes don't think they have to wait for us, which is
33 * interesting for initscripts as well. Wouter Verhelst
35 * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36 * zero after fork()ing, resulting in nbd-server going berserk
37 * when it receives a signal with at least one child open. Wouter
38 * Verhelst <wouter@debian.org>
39 * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40 * rectified type of mainloop::size_host (sf.net bugs 814435 and
41 * 817385); close the PID file after writing to it, so that the
42 * daemon can actually be found. Wouter Verhelst
44 * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45 * correctly put in network endianness. Many types were corrected
46 * (size_t and off_t instead of int). <vspaceg@sourceforge.net>
47 * Version 2.6 - Some code cleanup.
48 * Version 2.7 - Better build system.
49 * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a
50 * lot more work, but this is a start. Wouter Verhelst
52 * 16/03/2010 - Add IPv6 support.
53 * Kitt Tientanopajai <kitt@kitty.in.th>
54 * Neutron Soutmun <neo.neutron@gmail.com>
55 * Suriya Soutmun <darksolar@gmail.com>
58 /* Includes LFS defines, which defines behaviours of some of the following
59 * headers, so must come before those */
61 #define _DEFAULT_SOURCE
62 #define _XOPEN_SOURCE 500 /* to get pread/pwrite */
64 #define _BSD_SOURCE /* to get DT_* macros on some platforms */
66 #define _DARWIN_C_SOURCE /* to get DT_* macros on OS X */
69 #include <sys/types.h>
70 #include <sys/socket.h>
72 #include <sys/select.h>
75 #ifdef HAVE_SYS_IOCTL_H
76 #include <sys/ioctl.h>
81 #include <sys/param.h>
85 #include <netinet/tcp.h>
86 #include <netinet/in.h>
96 #include <linux/falloc.h>
101 #include <arpa/inet.h>
104 #ifdef HAVE_SYS_DIR_H
107 #ifdef HAVE_SYS_DIRENT_H
108 #include <sys/dirent.h>
115 #include <inttypes.h>
119 /* used in cliserv.h, so must come first */
120 #define MY_NAME "nbd_server"
122 #include "nbd-debug.h"
123 #include "netdb-compat.h"
125 #include "treefiles.h"
126 #include "nbd-helper.h"
129 #include <sdp_inet.h>
132 #if HAVE_FSCTL_SET_ZERO_DATA
134 /* don't include <windows.h> to avoid redefining eg the ERROR macro */
138 #include <winioctl.h>
141 /** Default position of the config file */
143 #define SYSCONFDIR "/etc"
145 #define CFILE SYSCONFDIR "/nbd-server/config"
148 #include <gnutls/gnutls.h>
149 #include <gnutls/x509.h>
152 #ifndef HAVE_G_MEMDUP2
153 /* Our uses of g_memdup2 below are safe from g_memdup's 32-bit overflow */
154 #define g_memdup2 g_memdup
158 * Shorten error handling and regular function return sequences
159 * automatically freeing dynamically allocated resources
/**
 * Scope-exit cleanup helpers.
 *
 * _cleanup_(x) attaches the compiler's `cleanup` attribute to a local
 * variable, naming `x` as the function to run on that variable.
 * _cleanup_g_free_ is the same attribute bound to g_freep().
 * DEFINE_TRIVIAL_CLEANUP_FUNC(type, func) generates a callback named
 * `func##p` that takes a `type *`; it is instantiated here for GKeyFile*
 * (giving g_key_file_freep) and gchar ** (giving g_strfreevp).
 *
 * NOTE(review): the bodies of g_freep() and of the generated callbacks are
 * not visible in this excerpt -- confirm whether they tolerate NULL.
 */
161 #define _cleanup_(x) __attribute__((__cleanup__(x)))
162 static inline void g_freep(void *p) {
165 #define _cleanup_g_free_ _cleanup_(g_freep)
166 #define DEFINE_TRIVIAL_CLEANUP_FUNC(type, func) \
167 static inline void func##p(type *p) { \
171 DEFINE_TRIVIAL_CLEANUP_FUNC(GKeyFile*, g_key_file_free)
172 DEFINE_TRIVIAL_CLEANUP_FUNC(gchar **, g_strfreev)
174 /** Where our config file actually is */
175 gchar* config_file_pos;
180 /* Whether we should avoid daemonizing the main process */
183 /* Whether we should avoid forking into child processes */
187 * The highest value a variable of type off_t can reach. This is a signed
188 * integer, so set all bits except for the leftmost one.
/**
 * Largest value representable in off_t (every bit set except the sign bit).
 *
 * The bit pattern is built in uintmax_t and converted to off_t afterwards:
 * shifting a signed 1 into the sign bit, as `(off_t)1 << (bits - 1)` would,
 * is undefined behaviour in C. The expansion is fully parenthesised so the
 * macro can be used inside any larger expression.
 */
#define OFFT_MAX ((off_t)((((uintmax_t)1) << (sizeof(off_t)*8-1)) - 1))
191 #define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
192 #define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
195 #define F_OLDSTYLE 1 /**< Allow oldstyle (port-based) exports */
196 #define F_LIST 2 /**< Allow clients to list the exports on a server */
197 #define F_NO_ZEROES 4 /**< Do not send zeros to client */
198 #define F_DUAL_LISTEN 8 /**< Listen on both TCP and unix socket */
199 // also accepts F_FORCEDTLS (which is 16384)
200 GHashTable *children;
201 char pidfname[256]; /**< name of our PID file */
202 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
204 #define NEG_INIT (1 << 0)
205 #define NEG_OLD (1 << 1)
206 #define NEG_MODERN (1 << 2)
209 * If we want what the system really has set we'd have to read
210 * /proc/sys/fs/pipe-max-size, but for now 1mb should be enough.
212 #define MAX_PIPE_SIZE (1 * 1024 * 1024)
218 /* Our thread pool */
219 GThreadPool *tpool = NULL;
221 /* A work package for the thread pool functions */
/**
 * One unit of work handed to the thread pool functions.
 * Only some members are visible in this excerpt.
 */
222 struct work_package {
/* the NBD request this package refers to */
224 struct nbd_request* req;
226 void* data; /**< for write requests */
229 static volatile sig_atomic_t is_sigchld_caught; /**< Flag set by
234 static volatile sig_atomic_t is_sigterm_caught; /**< Flag set by
239 static volatile sig_atomic_t is_sighup_caught; /**< Flag set by SIGHUP
244 GArray* modernsocks; /**< Sockets for the modern handler. Not used
245 if a client was only specified on the
246 command line; only port used if
247 oldstyle is set to false (and then the
248 command-line client isn't used, gna gna).
249 This may be more than one socket on
250 systems that don't support serving IPv4
251 and IPv6 from the same socket (like,
253 GArray* childsocks; /**< parent-side sockets for communication with children */
254 int commsocket; /**< child-side socket for communication with parent */
255 static sem_t file_wait_sem;
257 bool logged_oversized=false; /**< whether we logged oversized requests already */
260 * Type of configuration file values
263 PARAM_INT, /**< This parameter is an integer */
264 PARAM_INT64, /**< This parameter is a 64-bit integer */
265 PARAM_STRING, /**< This parameter is a string */
266 PARAM_BOOL, /**< This parameter is a boolean */
270 * Configuration file values
/**
 * Members of one configuration-parameter table entry: the parameter name,
 * whether it is required, its PARAM_TYPE, a pointer to the variable that
 * receives the parsed value, and the flag mask used for boolean parameters.
 * The tables built from these entries are visible in parse_cfile().
 */
273 gchar *paramname; /**< Name of the parameter, as it appears in
275 gboolean required; /**< Whether this is a required (as opposed to
276 optional) parameter */
277 PARAM_TYPE ptype; /**< Type of the parameter. */
278 gpointer target; /**< Pointer to where the data of this
279 parameter should be written. If ptype is
280 PARAM_BOOL, the data is or'ed rather than
282 gint flagval; /**< Flag mask for this parameter in case ptype
287 * Configuration file values of the "generic" section
/**
 * Values read from the "generic" section of the configuration file.
 * parse_cfile() fills a temporary copy of this structure and copies it back
 * to the caller with memcpy(), so the string members are shared pointers,
 * not deep copies.
 */
289 struct generic_conf {
290 gchar *user; /**< user we run the server as */
291 gchar *group; /**< group we run the server as */
292 gchar *modernaddr; /**< address of the modern socket */
293 gchar *modernport; /**< port of the modern socket */
294 gchar *unixsock; /**< file name of the unix domain socket */
295 gchar *certfile; /**< certificate file */
296 gchar *keyfile; /**< key file */
297 gchar *cacertfile; /**< CA certificate file */
298 gchar *tlsprio; /**< TLS priority string */
299 gint flags; /**< global flags */
300 gint threads; /**< maximum number of parallel threads we want to run */
/**
 * Send data over a GnuTLS session with gnutls_record_send().
 *
 * A negative result that gnutls_error_is_fatal() reports as non-fatal
 * produces the "issue while sending data" message; a second path (its
 * condition is not visible in this excerpt) produces "could not send data".
 * The message buffer `m` is declared with _cleanup_g_free_, so it is
 * released through g_freep() when it goes out of scope.
 *
 * @param s the GnuTLS session to write to
 * @param buf the data to send
 * @param len the number of bytes in buf
 * NOTE(review): what is done with `m`, any retry loop and the return value
 * are not visible here -- confirm against the full function.
 */
304 static int writeit_tls(gnutls_session_t s, void *buf, size_t len) {
305 _cleanup_g_free_ char *m = NULL;
309 if ((res = gnutls_record_send(s, buf, len)) < 0 && !gnutls_error_is_fatal(res)) {
310 m = g_strdup_printf("issue while sending data: %s", gnutls_strerror(res));
313 m = g_strdup_printf("could not send data: %s", gnutls_strerror(res));
/**
 * Receive data from a GnuTLS session with gnutls_record_recv().
 *
 * A negative result that gnutls_error_is_fatal() reports as non-fatal
 * produces the "issue while receiving data" message; a second path (its
 * condition is not visible in this excerpt) produces "could not receive
 * data". The message buffer `m` is declared with _cleanup_g_free_, so it is
 * released through g_freep() when it goes out of scope.
 *
 * @param s the GnuTLS session to read from
 * @param buf the buffer to receive into
 * @param len the number of bytes requested
 * NOTE(review): what is done with `m`, any retry loop and the return value
 * are not visible here -- confirm against the full function.
 */
324 static int readit_tls(gnutls_session_t s, void *buf, size_t len) {
325 _cleanup_g_free_ char *m = NULL;
329 if((res = gnutls_record_recv(s, buf, len)) < 0 && !gnutls_error_is_fatal(res)) {
330 m = g_strdup_printf("issue while receiving data: %s", gnutls_strerror(res));
333 m = g_strdup_printf("could not receive data: %s", gnutls_strerror(res));
/**
 * TLS read callback for a CLIENT: forwards to readit_tls() using the
 * gnutls_session_t that client->tls_session points to.
 */
344 static int socket_read_tls(CLIENT* client, void *buf, size_t len) {
345 return readit_tls(*((gnutls_session_t*)client->tls_session), buf, len);
/**
 * TLS write callback for a CLIENT: forwards to writeit_tls() using the
 * gnutls_session_t that client->tls_session points to.
 */
348 static int socket_write_tls(CLIENT* client, void *buf, size_t len) {
349 return writeit_tls(*((gnutls_session_t*)client->tls_session), buf, len);
351 #endif // HAVE_GNUTLS
/**
 * Plain (non-TLS) read callback for a CLIENT: forwards to readit() on the
 * client's network descriptor and returns its result unchanged.
 */
353 static int socket_read_notls(CLIENT* client, void *buf, size_t len) {
354 return readit(client->net, buf, len);
/**
 * Plain (non-TLS) write callback for a CLIENT: forwards to writeit() on the
 * client's network descriptor and returns its result unchanged.
 */
357 static int socket_write_notls(CLIENT* client, void *buf, size_t len) {
358 return writeit(client->net, buf, len);
/**
 * Read from a client through its socket_read callback.
 *
 * The callback must be set (asserted). When it returns a negative value the
 * client's socket_closed callback, which must then also be set, is invoked.
 * Nothing is returned to the caller, so failure is only observable through
 * whatever socket_closed does.
 *
 * @param client the client to read from
 * @param buf the buffer to read into
 * @param len the number of bytes to read
 */
361 static void socket_read(CLIENT* client, void *buf, size_t len) {
362 g_assert(client->socket_read != NULL);
363 if(client->socket_read(client, buf, len)<0) {
364 g_assert(client->socket_closed != NULL);
365 client->socket_closed(client);
370 * Consume data from a socket that we don't want
372 * @param c the client to read from
373 * @param len the number of bytes to consume
374 * @param buf a buffer
375 * @param bufsiz the size of the buffer
/**
 * Read and discard data from a client, using buf as scratch space.
 * Each socket_read() call asks for at most bufsiz bytes: the chunk length is
 * the smaller of the remaining len and bufsiz.
 * NOTE(review): the enclosing loop and the update of len are not visible in
 * this excerpt.
 */
377 static inline void consume(CLIENT* c, size_t len, void * buf, size_t bufsiz) {
380 curlen = (len>bufsiz)?bufsiz:len;
381 socket_read(c, buf, curlen);
387 * Consume a length field and corresponding payload that we don't want
389 * @param c the client to read from
/**
 * Read a length field from the client, then discard that many bytes of
 * payload through consume() using a local scratch buffer.
 * NOTE(review): the declarations of len and buf, and any byte-order
 * conversion of len between the two calls, are not visible in this excerpt.
 */
391 static inline void consume_len(CLIENT* c) {
395 socket_read(c, &len, sizeof(len));
397 consume(c, len, buf, sizeof(buf));
/**
 * Write to a client through its socket_write callback.
 *
 * The callback must be set (asserted). When it returns a negative value the
 * client's socket_closed callback, which must then also be set, is invoked.
 *
 * @param client the client to write to
 * @param buf the data to write
 * @param len the number of bytes to write
 */
400 static void socket_write(CLIENT* client, void *buf, size_t len) {
401 g_assert(client->socket_write != NULL);
402 if(client->socket_write(client, buf, len)<0) {
403 g_assert(client->socket_closed != NULL);
404 client->socket_closed(client);
/**
 * socket_closed callback for the negotiation phase: reports the failure
 * through err(), whose "%m" presumably expands to the current errno text.
 * The client argument is not used in the visible code.
 */
408 static inline void socket_closed_negotiate(CLIENT* client) {
409 err("Negotiation failed: %m");
/**
 * Release the transaction-log resources held by a client.
 *
 * Closes the transaction-log descriptor if one is open and marks it closed
 * (-1). If a log semaphore is open, closes it, marks it SEM_FAILED and
 * unlinks it by the name stored in client->semname. Both fields are reset,
 * so a second call skips the work already done.
 *
 * @param client the client whose transaction log is being torn down
 */
412 static void cleanup_transactionlog(CLIENT *client) {
414 if (client->transactionlogfd != -1) {
415 close(client->transactionlogfd);
416 client->transactionlogfd = -1;
418 if (client->logsem != SEM_FAILED) {
419 sem_close(client->logsem);
420 client->logsem = SEM_FAILED;
421 sem_unlink(client->semname);
/**
 * Acquire the client's transaction-log semaphore.
 * NOTE(review): the sem_wait() result is ignored, so an interrupted wait
 * (EINTR) would go unnoticed -- confirm this is acceptable.
 */
425 static void lock_logsem(CLIENT *client) {
426 sem_wait(client->logsem);
/** Release the client's transaction-log semaphore (counterpart of lock_logsem()). */
428 static void unlock_logsem(CLIENT *client) {
429 sem_post(client->logsem);
433 * Run a command. This is used for the ``prerun'' and ``postrun'' config file
436 * @param command the command to be ran. Read from the config file
437 * @param file the file name we're about to export
/**
 * Build the command line for a prerun/postrun hook.
 *
 * When command is non-NULL and non-empty it is used as a printf-style format
 * with file as its only argument; the resulting string is released through
 * _cleanup_g_free_ when the function returns.
 *
 * NOTE(review): command is used directly as a format string, so any
 * conversion other than a single %s in the configured value is undefined
 * behaviour -- confirm the configuration file is trusted.
 * NOTE(review): how cmd is executed and what is returned are not visible in
 * this excerpt.
 */
439 int do_run(gchar* command, gchar* file) {
440 _cleanup_g_free_ gchar* cmd = NULL;
443 if(command && *command) {
444 cmd = g_strdup_printf(command, file);
/**
 * Tear down a client after its transmission phase.
 *
 * Steps, in order: free the global thread pool (immediate=FALSE, wait=TRUE,
 * i.e. pending work is processed and waited for), run the export's postrun
 * command, clean up the transaction log if one is open, remove the diff file
 * for copy-on-write exports, and drop the client's reference on its SERVER.
 *
 * @param client the client being finalized
 */
450 static inline void finalize_client(CLIENT* client) {
451 g_thread_pool_free(tpool, FALSE, TRUE);
452 do_run(client->server->postrun, client->exportname);
453 if(client->transactionlogfd != -1)
454 cleanup_transactionlog(client);
456 if(client->server->flags & F_COPYONWRITE) {
457 unlink(client->difffilename);
459 serve_dec_ref(client->server);
/**
 * socket_closed callback for the transmission phase: finalizes the client
 * and reports the dropped connection through err().
 * errno is captured before finalize_client() runs because the cleanup calls
 * may overwrite it.
 * NOTE(review): the statement that restores errno from saved_errno before
 * the "%m" message is not visible in this excerpt -- confirm it exists.
 */
462 static inline void socket_closed_transmission(CLIENT* client) {
463 int saved_errno = errno;
464 finalize_client(client);
466 err("Connection dropped: %m");
471 * Splice data between a pipe and a file descriptor
473 * @param fd_in The fd to splice from.
474 * @param off_in The fd_in offset to splice from.
475 * @param fd_out The fd to splice to.
476 * @param off_out The fd_out offset to splice to.
477 * @param len The length to splice.
/**
 * Move len bytes between a pipe and a file descriptor with splice(2), using
 * SPLICE_F_MOVE. A result of zero or less is reported through
 * err("Splice failed: %m"), so an unexpected end of input counts as failure.
 * NOTE(review): the loop that handles partial transfers is not visible in
 * this excerpt.
 */
479 static inline void spliceit(int fd_in, loff_t *off_in, int fd_out,
480 loff_t *off_out, size_t len)
484 if ((ret = splice(fd_in, off_in, fd_out, off_out, len,
485 SPLICE_F_MOVE)) <= 0)
486 err("Splice failed: %m");
493 * Print out a message about how to use nbd-server. Split out to a separate
494 * function so that we can call it from multiple places
497 printf("This is nbd-server version " VERSION "\n");
498 printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections] [-V] [-n] [-d]\n"
499 "\t-r|--read-only\t\tread only\n"
500 "\t-m|--multi-file\t\tmultiple file\n"
501 "\t-c|--copy-on-write\tcopy on write\n"
502 "\t-C|--config-file\tspecify an alternate configuration file\n"
503 "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
504 "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
505 "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
506 "\t-M|--max-connection\tspecify the maximum number of opened connections\n"
507 "\t-V|--version\t\toutput the version and exit\n"
508 "\t-n|--nodaemon\t\tdo not daemonize main process\n"
509 "\t-d|--dont-fork\t\tdo not fork (implies --nodaemon)\n\n"
510 "\tif port is set to 0, stdin is used (for running from inetd).\n"
511 "\tif file_to_export contains '%%s', it is substituted with the IP\n"
512 "\t\taddress of the machine trying to connect\n"
513 "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
514 printf("Using configuration file %s\n", CFILE);
515 printf("For help, or when encountering bugs, please contact %s\n", PACKAGE_BUGREPORT);
518 /* Dumps a config file section of the given SERVER*, and exits. */
/**
 * Print a configuration-file section describing one export to stdout.
 *
 * exportname and listenaddr are always printed; readonly, multifile,
 * treefiles and copyonwrite are printed only when the matching flag is set;
 * filesize and authfile only when non-zero / non-NULL.
 *
 * @param serve the export to describe
 * @param section_header the name placed between the square brackets
 * NOTE(review): listenaddr is passed to "%s" without a NULL check; cmdline()
 * does not visibly set it -- confirm it cannot be NULL here.
 */
519 void dump_section(SERVER* serve, gchar* section_header) {
520 printf("[%s]\n", section_header);
521 printf("\texportname = %s\n", serve->exportname);
522 printf("\tlistenaddr = %s\n", serve->listenaddr);
523 if(serve->flags & F_READONLY) {
524 printf("\treadonly = true\n");
526 if(serve->flags & F_MULTIFILE) {
527 printf("\tmultifile = true\n");
529 if(serve->flags & F_TREEFILES) {
530 printf("\ttreefiles = true\n");
532 if(serve->flags & F_COPYONWRITE) {
533 printf("\tcopyonwrite = true\n");
535 if(serve->expected_size) {
536 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
538 if(serve->authname) {
539 printf("\tauthfile = %s\n", serve->authname);
545 * Parse the command line.
547 * @param argc the argc argument to main()
548 * @param argv the argv argument to main()
/**
 * Build a SERVER description from the command line.
 *
 * A zeroed SERVER is allocated and referenced, with default_authname as its
 * authorization file and VIRT_IPLIT as its virtualization style. Arguments
 * are then read with getopt_long(); the leading '-' in the option string
 * makes non-option arguments arrive in order, counted by nonspecial:
 *   - first: "[addr:]port" or "addr@port", stored in genconf->modernaddr
 *     and genconf->modernport;
 *   - second: the export file name, which must be an absolute path;
 *   - third: the export size, optionally suffixed with k/K/m/M.
 * Options set F_READONLY, F_MULTIFILE and F_COPYONWRITE, the PID file name,
 * the configuration file position, the authorization file and the
 * connection limit. With an output section name given, the result is
 * printed through dump_section().
 *
 * @param argc the argc argument to main()
 * @param argv the argv argument to main()
 * @param genconf receives the listen address and port parsed from the first
 *        positional argument
 *
 * NOTE(review): strncpy(pidfname, optarg, 256) leaves pidfname without a
 * terminating NUL when optarg has 256 or more characters -- confirm the
 * buffer is terminated on a line not visible here.
 * NOTE(review): last = strlen(optarg)-1 underflows for an empty size
 * argument -- confirm that case is rejected.
 * NOTE(review): the option string accepts 'w', which has no entry among the
 * visible long options.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
550 SERVER* cmdline(int argc, char *argv[], struct generic_conf *genconf) {
554 struct option long_options[] = {
555 {"read-only", no_argument, NULL, 'r'},
556 {"multi-file", no_argument, NULL, 'm'},
557 {"copy-on-write", no_argument, NULL, 'c'},
558 {"nodaemon", no_argument, NULL, 'n'},
559 {"dont-fork", no_argument, NULL, 'd'},
560 {"authorize-file", required_argument, NULL, 'l'},
561 {"config-file", required_argument, NULL, 'C'},
562 {"pid-file", required_argument, NULL, 'p'},
563 {"output-config", required_argument, NULL, 'o'},
564 {"max-connection", required_argument, NULL, 'M'},
565 {"version", no_argument, NULL, 'V'},
572 bool do_output=false;
573 gchar* section_header="";
579 serve=serve_inc_ref((SERVER*)g_new0(SERVER, 1));
580 serve->authname = g_strdup(default_authname);
581 serve->virtstyle=VIRT_IPLIT;
582 while((c=getopt_long(argc, argv, "-C:cwndl:mo:rp:M:V", long_options, &i))>=0) {
585 /* non-option argument */
586 switch(nonspecial++) {
/* A single ':' in the argument means "addr:port"; strchr and strrchr agree
 * when there is at most one colon. */
588 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
589 addr_port=g_strsplit(optarg, ":", 2);
591 /* Check for "@" - maybe user using this separator
594 g_strfreev(addr_port);
595 addr_port=g_strsplit(optarg, "@", 2);
598 addr_port=g_strsplit(optarg, "@", 2);
602 genconf->modernport=g_strdup(addr_port[1]);
603 genconf->modernaddr=g_strdup(addr_port[0]);
605 g_free(genconf->modernaddr);
606 genconf->modernaddr=NULL;
607 genconf->modernport=g_strdup(addr_port[0]);
609 g_strfreev(addr_port);
612 serve->exportname = g_strdup(optarg);
613 if(serve->exportname[0] != '/') {
614 fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
619 last=strlen(optarg)-1;
621 if (suffix == 'k' || suffix == 'K' ||
622 suffix == 'm' || suffix == 'M')
624 es = (off_t)atoll(optarg);
632 serve->expected_size = es;
637 serve->flags |= F_READONLY;
640 serve->flags |= F_MULTIFILE;
644 section_header = g_strdup(optarg);
647 strncpy(pidfname, optarg, 256);
651 serve->flags |=F_COPYONWRITE;
661 g_free(config_file_pos);
662 config_file_pos=g_strdup(optarg);
665 g_free(serve->authname);
666 serve->authname=g_strdup(optarg);
669 serve->max_connections = strtol(optarg, NULL, 0);
672 printf("This is nbd-server version " VERSION "\n");
681 /* What's left: the port to export, the name of the to be exported
682 * file, and, optionally, the size of the file, in that order. */
684 serve=serve_dec_ref(serve);
686 serve->servename = "";
690 g_critical("Need a complete configuration on the command line to output a config file section!");
693 dump_section(serve, section_header);
698 /* forward definition of parse_cfile */
699 GArray* parse_cfile(gchar* f, struct generic_conf *genconf, bool expect_generic, GError** e);
701 #ifdef HAVE_STRUCT_DIRENT_D_TYPE
702 #define NBD_D_TYPE de->d_type
710 * Parse config file snippets in a directory. Uses readdir() and friends
711 * to find files and open them, then passes them on to parse_cfile
712 * with have_global set false
/**
 * Parse every "*.conf" snippet found in a directory.
 *
 * Each directory entry is turned into a full path with g_build_filename()
 * and, if it is a regular file whose name ends in ".conf", handed to
 * parse_cfile() with expect_generic set to false. The SERVER* entries of
 * each parsed file are appended to one combined array.
 *
 * @param dir the directory to scan
 * @param genconf generic configuration passed through to parse_cfile()
 * @param e set to NBDS_ERR_CFILE_DIR_UNKNOWN when the directory cannot be
 *        opened and to NBDS_ERR_CFILE_READDIR_ERR when reading it fails
 * NOTE(review): the return statements and the release of fname and dirh are
 * not visible in this excerpt.
 */
714 GArray* do_cfile_dir(gchar* dir, struct generic_conf *const genconf, GError** e) {
715 DIR* dirh = opendir(dir);
718 GArray* retval = NULL;
723 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_DIR_UNKNOWN, "Invalid directory specified: %s", strerror(errno));
727 while((de = readdir(dirh))) {
728 int saved_errno=errno;
729 fname = g_build_filename(dir, de->d_name, NULL);
732 /* Filesystem doesn't return type of
733 * file through readdir, or struct dirent
734 * doesn't have d_type. Run stat() on the file
736 if(stat(fname, &stbuf)) {
740 if (!S_ISREG(stbuf.st_mode)) {
744 /* Skip unless the name ends with '.conf' */
/* NOTE(review): for a d_name shorter than 5 characters the pointer below is
 * computed before the start of the string -- confirm short names are
 * filtered out first. */
745 if(strcmp((de->d_name + strlen(de->d_name) - 5), ".conf")) {
748 tmp = parse_cfile(fname, genconf, false, e);
754 retval = g_array_new(FALSE, TRUE, sizeof(SERVER*));
755 retval = g_array_append_vals(retval, tmp->data, tmp->len);
756 g_array_free(tmp, TRUE);
764 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_READDIR_ERR, "Error trying to read directory: %s", strerror(errno));
767 g_array_free(retval, TRUE);
776 * To be called by GArray clearing function.
777 * @param server pointer to server element
/**
 * Element clear function for arrays of SERVER*: drops one reference on the
 * SERVER the element points to. parse_cfile() installs it with
 * g_array_set_clear_func().
 */
779 static void serve_clear_element(SERVER **server) {
780 serve_dec_ref(*server);
784 * Parse the config file.
786 * @param f the name of the config file
788 * @param genconf a pointer to generic configuration which will get
789 * updated with parsed values. If NULL, then parsed generic
790 * configuration values are safely and silently discarded.
792 * @param e a GError. Error code can be any of the following:
793 * NBDS_ERR_CFILE_NOTFOUND, NBDS_ERR_CFILE_MISSING_GENERIC,
794 * NBDS_ERR_CFILE_VALUE_INVALID, NBDS_ERR_CFILE_VALUE_UNSUPPORTED
795 * or NBDS_ERR_CFILE_NO_EXPORTS. @see NBDS_ERRS.
797 * @param expect_generic if true, we expect a configuration file that
798 * contains a [generic] section. If false, we don't.
800 * @return a GArray of SERVER* pointers. If the config file is empty or does not
801 * exist, returns an empty GArray; if the config file contains an
802 * error, returns NULL, and e is set appropriately
/**
 * Parse one configuration file into an array of SERVER* exports.
 *
 * Two parameter tables drive the parsing: one for export groups, whose
 * targets are fields of the scratch SERVER `s`, and one for the [generic]
 * group, whose targets are fields of the scratch copy `genconftmp`. For
 * every group each table entry is read with the GKeyFile getter matching
 * its type; boolean entries set or clear their flag mask in the target.
 * After a group is read, `virtstyle` is decoded, unsupported or mutually
 * exclusive flag combinations are rejected, and a copy of `s` is appended
 * to the result. Exports found through the include directory are appended
 * via do_cfile_dir(). On success the scratch generic configuration is
 * copied back into *genconf.
 *
 * @param f the name of the config file
 * @param genconf generic configuration, used as defaults and updated
 * @param expect_generic whether the file must start with a [generic] group
 * @param e receives the error on failure
 * NOTE(review): many statements (case labels, returns, closing braces) are
 * not visible in this excerpt; the notes below flag what should be checked
 * against the full function.
 */
804 GArray* parse_cfile(gchar* f, struct generic_conf *const genconf, bool expect_generic, GError** e) {
805 const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
806 const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
809 gchar *virtstyle=NULL;
811 { "exportname", TRUE, PARAM_STRING, &(s.exportname), 0 },
812 { "authfile", FALSE, PARAM_STRING, &(s.authname), 0 },
813 { "filesize", FALSE, PARAM_OFFT, &(s.expected_size), 0 },
814 { "virtstyle", FALSE, PARAM_STRING, &(virtstyle), 0 },
815 { "prerun", FALSE, PARAM_STRING, &(s.prerun), 0 },
816 { "postrun", FALSE, PARAM_STRING, &(s.postrun), 0 },
817 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog), 0 },
818 { "cowdir", FALSE, PARAM_STRING, &(s.cowdir), 0 },
819 { "readonly", FALSE, PARAM_BOOL, &(s.flags), F_READONLY },
820 { "multifile", FALSE, PARAM_BOOL, &(s.flags), F_MULTIFILE },
821 { "treefiles", FALSE, PARAM_BOOL, &(s.flags), F_TREEFILES },
822 { "copyonwrite", FALSE, PARAM_BOOL, &(s.flags), F_COPYONWRITE },
823 { "waitfile", FALSE, PARAM_BOOL, &(s.flags), F_WAIT },
824 { "sparse_cow", FALSE, PARAM_BOOL, &(s.flags), F_SPARSE },
825 { "sdp", FALSE, PARAM_BOOL, &(s.flags), F_SDP },
826 { "sync", FALSE, PARAM_BOOL, &(s.flags), F_SYNC },
827 { "flush", FALSE, PARAM_BOOL, &(s.flags), F_FLUSH },
828 { "fua", FALSE, PARAM_BOOL, &(s.flags), F_FUA },
829 { "rotational", FALSE, PARAM_BOOL, &(s.flags), F_ROTATIONAL },
830 { "temporary", FALSE, PARAM_BOOL, &(s.flags), F_TEMPORARY },
831 { "trim", FALSE, PARAM_BOOL, &(s.flags), F_TRIM },
832 { "datalog", FALSE, PARAM_BOOL, &(s.flags), F_DATALOG },
833 { "listenaddr", FALSE, PARAM_STRING, &(s.listenaddr), 0 },
834 { "maxconnections", FALSE, PARAM_INT, &(s.max_connections), 0 },
835 { "force_tls", FALSE, PARAM_BOOL, &(s.flags), F_FORCEDTLS },
836 { "splice", FALSE, PARAM_BOOL, &(s.flags), F_SPLICE},
838 const int lp_size=sizeof(lp)/sizeof(PARAM);
839 struct generic_conf genconftmp;
841 { "user", FALSE, PARAM_STRING, &(genconftmp.user), 0 },
842 { "group", FALSE, PARAM_STRING, &(genconftmp.group), 0 },
843 { "oldstyle", FALSE, PARAM_BOOL, &(genconftmp.flags), F_OLDSTYLE }, // only left here so we can issue an appropriate error message when the option is used
844 { "listenaddr", FALSE, PARAM_STRING, &(genconftmp.modernaddr), 0 },
845 { "port", FALSE, PARAM_STRING, &(genconftmp.modernport), 0 },
846 { "includedir", FALSE, PARAM_STRING, &cfdir, 0 },
847 { "allowlist", FALSE, PARAM_BOOL, &(genconftmp.flags), F_LIST },
848 { "unixsock", FALSE, PARAM_STRING, &(genconftmp.unixsock), 0 },
849 { "duallisten", FALSE, PARAM_BOOL, &(genconftmp.flags), F_DUAL_LISTEN }, // Used to listen on both TCP and unix socket
850 { "max_threads", FALSE, PARAM_INT, &(genconftmp.threads), 0 },
851 { "force_tls", FALSE, PARAM_BOOL, &(genconftmp.flags), F_FORCEDTLS },
852 { "certfile", FALSE, PARAM_STRING, &(genconftmp.certfile), 0 },
853 { "keyfile", FALSE, PARAM_STRING, &(genconftmp.keyfile), 0 },
854 { "cacertfile", FALSE, PARAM_STRING, &(genconftmp.cacertfile), 0 },
855 { "tlsprio", FALSE, PARAM_STRING, &(genconftmp.tlsprio), 0 },
858 int p_size=sizeof(gp)/sizeof(PARAM);
/* cfile is released by g_key_file_freep() when it goes out of scope. */
859 _cleanup_(g_key_file_freep) GKeyFile *cfile = NULL;
860 g_autoptr(GError) err = NULL;
861 const char *err_msg=NULL;
868 _cleanup_g_free_ gchar* startgroup = NULL;
872 memset(&genconftmp, 0, sizeof(struct generic_conf));
/* The default TLS priority is a string literal, whereas a value read from
 * the file comes from g_key_file_get_string(); callers must not assume
 * tlsprio can be freed. */
874 genconftmp.tlsprio = "NORMAL:-VERS-TLS-ALL:+VERS-TLS1.2:%SERVER_PRECEDENCE";
877 /* Use the passed configuration values as defaults. The
878 * parsing algorithm below updates all parameter targets
879 * found from configuration files. */
/* NOTE(review): the parameter documentation allows genconf to be NULL; the
 * guard around this copy is not visible in this excerpt -- confirm. */
880 memcpy(&genconftmp, genconf, sizeof(struct generic_conf));
883 cfile = g_key_file_new();
884 retval = g_array_new(FALSE, TRUE, sizeof(SERVER*));
/* Freeing the array with free_segment=TRUE drops one reference per element. */
886 g_array_set_clear_func(retval, (GDestroyNotify)serve_clear_element);
888 if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
889 G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
890 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_NOTFOUND, "Could not open config file %s: %s",
/* NOTE(review): cfile also carries _cleanup_(g_key_file_freep). If that
 * callback frees a non-NULL cfile on scope exit, this explicit free makes
 * the failure path release it twice -- confirm, and if so drop this call or
 * set cfile to NULL afterwards. */
892 g_key_file_free(cfile);
895 startgroup = g_key_file_get_start_group(cfile);
896 if((!startgroup || strcmp(startgroup, "generic")) && expect_generic) {
897 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
900 groups = g_key_file_get_groups(cfile, NULL);
901 for(i=0;groups[i];i++) {
/* Start every group from a zeroed scratch SERVER. */
902 memset(&s, '\0', sizeof(SERVER));
904 /* After the [generic] group or when we're parsing an include
905 * directory, start parsing exports */
906 if(i==1 || !expect_generic) {
910 for(j=0;j<p_size;j++) {
911 assert(p[j].target != NULL);
912 assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL||p[j].ptype==PARAM_INT64);
915 ival = g_key_file_get_integer(cfile,
920 *((gint*)p[j].target) = ival;
924 i64val = g_key_file_get_int64(cfile,
929 *((gint64*)p[j].target) = i64val;
933 sval = g_key_file_get_string(cfile,
938 *((gchar**)p[j].target) = sval;
942 bval = g_key_file_get_boolean(cfile,
944 p[j].paramname, &err);
/* Booleans are stored as a single bit: set when true, cleared when false. */
947 *((gint*)p[j].target) |= p[j].flagval;
949 *((gint*)p[j].target) &= ~(p[j].flagval);
955 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
957 /* Ignore not-found error for optional values */
961 err_msg = MISSING_REQUIRED_ERROR;
964 err_msg = DEFAULT_ERROR;
966 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
967 g_array_free(retval, TRUE);
/* NOTE(review): strncmp() only compares a prefix, so a value such as
 * "nonexistent" is accepted as "none". Also confirm that virtstyle is
 * checked for NULL on a line not visible here. */
972 if(!strncmp(virtstyle, "none", 4)) {
973 s.virtstyle=VIRT_NONE;
974 } else if(!strncmp(virtstyle, "ipliteral", 9)) {
975 s.virtstyle=VIRT_IPLIT;
976 } else if(!strncmp(virtstyle, "iphash", 6)) {
977 s.virtstyle=VIRT_IPHASH;
978 } else if(!strncmp(virtstyle, "cidrhash", 8)) {
979 s.virtstyle=VIRT_CIDR;
/* "cidrhash" is 8 characters; at least two more are required, and the
 * prefix length is parsed from the text right after the keyword. */
980 if(strlen(virtstyle)<10) {
981 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
982 g_array_free(retval, TRUE);
985 s.cidrlen=strtol(virtstyle+8, NULL, 0);
987 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
988 g_array_free(retval, TRUE);
992 s.virtstyle=VIRT_IPLIT;
994 if(genconftmp.flags & F_OLDSTYLE) {
995 g_message("Since 3.10, the oldstyle protocol is no longer supported. Please migrate to the newstyle protocol.");
996 g_message("Exiting.");
1000 if (s.flags & F_SPLICE) {
1001 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without splice support, yet group %s uses it", groups[i]);
1002 g_array_free(retval, TRUE);
1006 /* We can't mix copyonwrite and splice. */
1007 if ((s.flags & F_COPYONWRITE) && (s.flags & F_SPLICE)) {
1008 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_SPLICE,
1009 "Cannot mix copyonwrite with splice for an export in group %s",
1011 g_array_free(retval, TRUE);
1014 if ((s.flags & F_COPYONWRITE) && (s.flags & F_WAIT)) {
1015 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_WAIT,
1016 "Cannot mix copyonwrite with waitfile for an export in group %s",
1018 g_array_free(retval, TRUE);
1021 /* We can't mix datalog and splice. */
1022 if ((s.flags & F_DATALOG) && (s.flags & F_SPLICE)) {
1023 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_SPLICE,
1024 "Cannot mix datalog with splice for an export in group %s",
1026 g_array_free(retval, TRUE);
1029 /* Don't need to free this, it's not our string */
1031 /* Don't append values for the [generic] group */
1032 if(i>0 || !expect_generic) {
/* servename aliases the entry in groups[]; it is not a private copy. */
1033 s.servename = groups[i];
/* The scratch SERVER is duplicated bytewise, so the copy shares the string
 * pointers stored in s. */
1035 SERVER *srv = serve_inc_ref(g_memdup2(&s, sizeof(SERVER)));
1036 g_array_append_val(retval, srv);
1039 if(s.flags & F_SDP) {
1040 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
1041 g_array_free(retval, TRUE);
1047 GArray* extra = do_cfile_dir(cfdir, &genconftmp, e);
1049 retval = g_array_append_vals(retval, extra->data, extra->len);
1051 g_array_free(extra, TRUE);
1054 g_array_free(retval, TRUE);
1059 if(i==1 && expect_generic) {
1060 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_NO_EXPORTS, "The config file does not specify any exports");
1064 /* Return the updated generic configuration through the
1065 * pointer parameter. */
1066 memcpy(genconf, &genconftmp, sizeof(struct generic_conf));
1073 * Handle SIGCHLD by setting atomically a flag which will be evaluated in the
1074 * main loop of the root server process. This allows us to separate the signal
1075 * catching from the actual task triggered by SIGCHLD and hence processing in the
1076 * interrupt context is kept as minimal as possible.
1078 * @param s the signal we're handling (must be SIGCHLD, or something
1079 * is severely wrong)
/**
 * SIGCHLD handler: only records that the signal arrived by setting the
 * volatile sig_atomic_t flag is_sigchld_caught. The signal number is unused.
 */
1081 static void sigchld_handler(const int s G_GNUC_UNUSED) {
1082 is_sigchld_caught = 1;
1086 * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
1088 * @param key the key
1089 * @param value the value corresponding to the above key
1090 * @param user_data a pointer which we always set to 1, so that we know what
/**
 * Hash-table iteration callback that sends SIGTERM to one child process.
 * The signature matches what g_hash_table_foreach() expects.
 * NOTE(review): how `pid` is derived from key/value is not visible in this
 * excerpt, and the kill() result is ignored.
 */
1093 void killchild(gpointer key, gpointer value, gpointer user_data) {
1096 kill(*pid, SIGTERM);
1100 * Handle SIGTERM by setting atomically a flag which will be evaluated in the
1101 * main loop of the root server process. This allows us to separate the signal
1102 * catching from the actual task triggered by SIGTERM and hence processing in the
1103 * interrupt context is kept as minimal as possible.
1105 * @param s the signal we're handling (must be SIGTERM, or something
1106 * is severely wrong).
/**
 * SIGTERM handler: only records that the signal arrived by setting the
 * volatile sig_atomic_t flag is_sigterm_caught. The signal number is unused.
 */
1108 static void sigterm_handler(const int s G_GNUC_UNUSED) {
1109 is_sigterm_caught = 1;
1113 * Handle SIGHUP by setting atomically a flag which will be evaluated in
1114 * the main loop of the root server process. This allows us to separate
1115 * the signal catching from the actual task triggered by SIGHUP and hence
1116 * processing in the interrupt context is kept as minimal as possible.
1118 * @param s the signal we're handling (must be SIGHUP, or something
1119 * is severely wrong).
/**
 * SIGHUP handler: only records that the signal arrived by setting the
 * volatile sig_atomic_t flag is_sighup_caught. The signal number is unused.
 */
1121 static void sighup_handler(const int s G_GNUC_UNUSED) {
1122 is_sighup_caught = 1;
/**
 * SIGUSR1 handler: logs the event and posts file_wait_sem, releasing
 * whatever is blocked waiting on that semaphore.
 * NOTE(review): unlike the other handlers this one calls msg() from signal
 * context; sem_post() is async-signal-safe, but confirm that msg() is.
 */
1125 static void sigusr1_handler(const int s G_GNUC_UNUSED) {
1126 msg(LOG_INFO, "Got SIGUSR1");
1127 sem_post(&file_wait_sem);
1131 * Get the file handle and offset, given an export offset.
1133 * @param client The client we're serving for
1134 * @param a The offset to get corresponding file/offset for
1135 * @param fhandle [out] File descriptor
1136 * @param foffset [out] Offset into fhandle
1137 * @param maxbytes [out] Tells how many bytes can be read/written
1138 * from fhandle starting at foffset (0 if there is no limit)
1139 * @return 0 on success, -1 on failure
/**
 * Translate an export offset into a file descriptor and an offset within it.
 *
 * For F_TREEFILES exports the offset is split on TREEPAGESIZE boundaries and
 * the backing file is opened with open_treefile(). Otherwise the client's
 * export array of FILE_INFO entries is binary-searched for the last file
 * whose startoff is not greater than a.
 *
 * @param client the client we're serving for
 * @param a the export offset to translate
 * @param fhandle [out] file descriptor holding offset a
 * @param foffset [out] offset of a inside fhandle
 * @param maxbytes [out] bytes available in fhandle from foffset
 * NOTE(review): the return statements, the negative-offset check and the
 * updates of start/end inside the search are not visible in this excerpt.
 */
1141 int get_filepos(CLIENT *client, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1142 GArray * const export = client->export;
1144 /* Negative offset not allowed */
1148 /* Open separate file for treefiles */
1149 if (client->server->flags & F_TREEFILES) {
1150 *foffset = a % TREEPAGESIZE;
1151 *maxbytes = (( 1 + (a/TREEPAGESIZE) ) * TREEPAGESIZE) - a; // start position of next block
/* Read-only exports open the tree file O_RDONLY, all others O_RDWR. */
1152 *fhandle = open_treefile(client->exportname, ((client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR), client->exportsize,a, &client->lock);
1156 /* Binary search for last file with starting offset <= a */
1159 int end = export->len - 1;
1160 while( start <= end ) {
1161 int mid = (start + end) / 2;
1162 fi = g_array_index(export, FILE_INFO, mid);
1163 if( fi.startoff < a ) {
1165 } else if( fi.startoff > a ) {
1173 /* end should never go negative, since first startoff is 0 and a >= 0 */
1176 fi = g_array_index(export, FILE_INFO, end);
1177 *fhandle = fi.fhandle;
1178 *foffset = a - fi.startoff;
/* When another file follows, the readable span ends where that file starts. */
1180 if( end+1 < export->len ) {
1181 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1182 *maxbytes = fi_next.startoff - a;
1189 * Write an amount of bytes at a given offset to the right file. This
1190 * abstracts the write-side of the multiple file option.
1192 * @param a The offset where the write should start
1193 * @param buf The buffer to write from
1194 * @param len The length of buf
1195 * @param client The client we're serving for
1196 * @param fua Flag to indicate 'Force Unit Access'
1197 * @return The number of bytes actually written, or -1 in case of an error
1199 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
/* Performs one pwrite() of (part of) buf at export offset `a`. The target
 * descriptor, the offset inside it and the byte limit come from
 * get_filepos(); a maxbytes of 0 means "no limit", so the limit test below
 * only applies to a non-zero maxbytes. */
1205 if(get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1207 if(maxbytes && len > maxbytes)
1210 DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
/* Positional write: the descriptor's own file position is not involved. */
1212 retval = pwrite(fhandle, buf, len, foffset);
/* Extra work is done when the export is configured with F_SYNC. */
1213 if(client->server->flags & F_SYNC) {
1217 /* This is where we would do the following
1218 * #ifdef USE_SYNC_FILE_RANGE
1219 * However, we don't, for the reasons set out below
1220 * by Christoph Hellwig <hch@infradead.org>
1223 * fdatasync is equivalent to fsync except that it does not flush
1224 * non-essential metadata (basically just timestamps in practice), but it
1225 * does flush metadata required to find the data again, e.g. allocation
1226 * information and extent maps. sync_file_range does nothing but flush
1227 * out pagecache content - it means you basically won't get your data
1228 * back in case of a crash if you either:
1230 * a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1231 * b) are using a sparse file on a filesystem
1232 * c) are using a fallocate-preallocated file on a filesystem
1233 * d) use any file on a COW filesystem like btrfs
1235 * e.g. it only does anything useful for you if you do not have a volatile
1236 * write cache, and either use a raw block device node, or just overwrite
1237 * an already fully allocated (and not preallocated) file on a non-COW
1241 * What we should do is open a second FD with O_DSYNC set, then write to
1242 * that when appropriate. However, with a Linux client, every REQ_FUA
1243 * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1248 sync_file_range(fhandle, foffset, len,
1249 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1250 SYNC_FILE_RANGE_WAIT_AFTER);
/* NOTE(review): given the explanation above, the sync_file_range() call is
 * presumably compiled out — confirm against the surrounding preprocessor
 * directives. */
1255 /* close file pointer in case of treefiles */
1256 if (client->server->flags & F_TREEFILES) {
1263 * Call rawexpwrite repeatedly until all data has been written.
1265 * @param a The offset where the write should start
1266 * @param buf The buffer to write from
1267 * @param len The length of buf
1268 * @param client The client we're serving for
1269 * @param fua Flag to indicate 'Force Unit Access'
1270 * @return 0 on success, nonzero on failure
1272 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
/* Repeats rawexpwrite() for as long as bytes remain (len > 0) and the last
 * call made progress (returned > 0). */
1275 while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
/* Non-zero (failure) when the last call failed or some bytes are left. */
1280 return (ret < 0 || len != 0);
/**
 * Prepare a simple (non-structured) reply header for a request.
 *
 * @param rep the reply to fill in
 * @param req the request being answered; its cookie is echoed in the reply
 **/
1283 static void setup_reply(struct nbd_reply* rep, struct nbd_request* req) {
/* The magic goes out in network byte order. */
1284 rep->magic = htonl(NBD_REPLY_MAGIC);
/* The cookie is copied verbatim, with no byte-order conversion. */
1286 rep->cookie = req->cookie;
1289 static void log_reply(CLIENT *client, struct nbd_reply *prply) {
1290 if (client->transactionlogfd != -1) {
1291 lock_logsem(client);
1292 writeit(client->transactionlogfd, prply, sizeof(*prply));
1293 unlock_logsem(client);
1297 static void log_structured_reply(CLIENT *client, struct nbd_structured_reply *prply) {
1298 if (client->transactionlogfd != -1) {
1299 lock_logsem(client);
1300 writeit(client->transactionlogfd, prply, sizeof(*prply));
1301 unlock_logsem(client);
1305 void send_structured_chunk(CLIENT *client, struct nbd_request *req, uint16_t flags, uint16_t type, uint32_t length, int bufcount, void *buf[], size_t buflen[]) {
1306 struct nbd_structured_reply rep;
1307 rep.magic = htonl(NBD_STRUCTURED_REPLY_MAGIC);
1308 rep.flags = htons(flags);
1309 rep.type = htons(type);
1310 rep.cookie = req->cookie;
1311 rep.paylen = htonl(length);
1312 pthread_mutex_lock(&(client->lock));
1313 socket_write(client, &rep, sizeof rep);
1314 for(int i=0; i<bufcount; i++) {
1315 socket_write(client, buf[i], buflen[i]);
1317 pthread_mutex_unlock(&(client->lock));
1318 log_structured_reply(client, &rep);
1321 void send_structured_chunk_v(CLIENT *client, struct nbd_request *req, uint16_t flags, uint16_t type, uint32_t length, int bufcount, ...) {
1322 struct nbd_structured_reply rep;
1324 rep.magic = htonl(NBD_STRUCTURED_REPLY_MAGIC);
1325 rep.flags = htons(flags);
1326 rep.type = htons(type);
1327 rep.cookie = req->cookie;
1328 rep.paylen = htonl(length);
1329 va_start(ap, bufcount);
1330 pthread_mutex_lock(&(client->lock));
1331 socket_write(client, &rep, sizeof rep);
1332 for(int i=0; i<bufcount; i++) {
1333 void *buf = va_arg(ap, void*);
1334 size_t size = va_arg(ap, size_t);
1335 socket_write(client, buf, size);
1337 pthread_mutex_unlock(&(client->lock));
1338 log_structured_reply(client, &rep);
1343 * Find the location to write the data for the next chunk to.
1344 * Assumes checks on memory sizes etc have already been done.
1346 * @param ctx the context we're working with; its current_offset field
1347 * gives the offset into the request and its current_len field the
1348 * length of this chunk.
1350 char * find_read_buf(READ_CTX *ctx) {
/* Non-structured replies, and structured replies with ctx->df set, read
 * straight into the existing ctx->buf at the current offset. */
1351 if(!(ctx->is_structured) || ctx->df) {
1352 return ctx->buf + ctx->current_offset;
/* Otherwise a fresh buffer of current_len bytes is allocated for this chunk
 * and stored in ctx->buf, replacing whatever pointer was there.
 * NOTE(review): where that buffer is released is not visible here —
 * confirm who frees it. */
1354 ctx->buf = malloc(ctx->current_len);
1356 err("Could not allocate memory for request");
/**
 * Report a successfully read piece of data to the client where the reply
 * mode calls for it.
 *
 * @param client the client we're serving
 * @param ctx the read context (request, buffer, current offset)
 * @param len_read number of bytes that were just read into ctx->buf
 **/
1361 void confirm_read(CLIENT *client, READ_CTX *ctx, size_t len_read) {
/* Only structured replies without ctx->df send a chunk at this point. */
1362 if(ctx->is_structured && !(ctx->df)) {
/* Absolute export offset of this piece, converted with htonll() for the wire. */
1363 uint64_t offset = htonll(ctx->req->from + (uint64_t)(ctx->current_offset));
/* The chunk payload is the 8-byte offset followed by the data, hence the
 * announced length of len_read + 8. */
1364 send_structured_chunk_v(client, ctx->req, 0, NBD_REPLY_TYPE_OFFSET_DATA, len_read + 8, 2, &offset, sizeof offset, ctx->buf, (size_t)len_read);
/**
 * Finish a read request by sending the final reply data.
 *
 * For structured replies this can send an offset-data chunk, an error chunk
 * (NBD_REPLY_TYPE_ERROR or NBD_REPLY_TYPE_ERROR_OFFSET) carrying the
 * NBD_REPLY_FLAG_DONE flag, or an empty NBD_REPLY_TYPE_NONE chunk with
 * NBD_REPLY_FLAG_DONE. For simple replies it sends an nbd_reply header and
 * ctx->buf.
 *
 * @param client the client we're serving
 * @param ctx the read context
 * @param error error code to report; 0 means success
 * @param errmsg optional error message appended to an error chunk
 * @param msglen length of errmsg
 * @param with_offset whether err_offset is meaningful
 * @param err_offset offset, relative to the request start, tied to the error
 **/
1369 void complete_read(CLIENT *client, READ_CTX *ctx, uint32_t error, char *errmsg, uint16_t msglen, bool with_offset, uint64_t err_offset) {
1371 uint64_t offset = 0;
1372 if(ctx->is_structured) {
1374 uint32_t len = ctx->req->len;
1375 if(error != 0 && with_offset) {
1378 if(error == 0 || with_offset) {
1379 offset = htonll(ctx->req->from);
/* NOTE(review): err_offset is a uint64_t, but send_structured_chunk_v()
 * fetches every size argument with va_arg(ap, size_t); the types only match
 * where size_t is 64 bits wide — confirm for 32-bit builds. */
1380 send_structured_chunk_v(client, ctx->req, 0, NBD_REPLY_TYPE_OFFSET_DATA, len + 8, 2, &offset, sizeof offset, ctx->buf, err_offset);
1385 struct nbd_structured_error_payload pl;
1393 offset += err_offset;
1394 type = NBD_REPLY_TYPE_ERROR_OFFSET;
1396 type = NBD_REPLY_TYPE_ERROR;
/* The error chunk is assembled as a list of pieces; total_size tracks the
 * payload length announced in the chunk header. */
1399 bufsize[0] = sizeof pl;
1400 total_size = bufsize[0];
1402 buf[payloads] = errmsg;
1403 bufsize[payloads++] = msglen;
1404 total_size += msglen;
1407 buf[payloads] = &offset;
1408 bufsize[payloads++] = sizeof offset;
1409 total_size += sizeof offset;
1411 send_structured_chunk(client, ctx->req, NBD_REPLY_FLAG_DONE, type, total_size, payloads, buf, bufsize);
/* Empty terminating chunk: no payload pieces, DONE flag set. */
1414 send_structured_chunk_v(client, ctx->req, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, 0, 0);
1416 struct nbd_reply rep;
1417 setup_reply(&rep, ctx->req);
/* The simple reply is logged first, then header and data are written under
 * client->lock. */
1421 log_reply(client, &rep);
1422 pthread_mutex_lock(&(client->lock));
1423 socket_write(client, &rep, sizeof rep);
1425 socket_write(client, ctx->buf, ctx->buflen);
1427 pthread_mutex_unlock(&(client->lock));
1433 * Read an amount of bytes at a given offset from the right file. This
1434 * abstracts the read-side of the multiple files option.
1436 * @param a The offset where the read should start
1437 * @param buf A buffer to read into
1438 * @param len The size of buf
1439 * @param client The client we're serving for
1440 * @return The number of bytes actually read, or -1 in case of an
1443 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
/* Performs one pread() into buf from export offset `a`. The descriptor, the
 * offset inside it and the byte limit come from get_filepos(); a maxbytes
 * of 0 means "no limit", so the limit test only applies to a non-zero
 * maxbytes. */
1449 if(get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1451 if(maxbytes && len > maxbytes)
1454 DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
/* Positional read: the descriptor's own file position is not involved. */
1456 retval = pread(fhandle, buf, len, foffset);
/* Treefiles exports get extra handling after the read. */
1457 if (client->server->flags & F_TREEFILES) {
1464 * Call rawexpread repeatedly until all data has been read.
1465 * @return 0 on success, nonzero on failure
1467 int rawexpread_fully(READ_CTX *ctx, CLIENT *client) {
/* Reads ctx->current_len bytes starting at ctx->req->from +
 * ctx->current_offset, advancing the context after every partial read. */
1472 while(ctx->current_len > 0) {
/* The destination depends on the reply mode; see find_read_buf(). */
1473 buf = find_read_buf(ctx);
1474 if((ret = rawexpread((off_t)ctx->req->from + (off_t)ctx->current_offset, buf, ctx->current_len, client)) <= 0) {
/* Hand the bytes just read to confirm_read(), then move on by `ret` bytes. */
1477 confirm_read(client, ctx, ret);
1478 ctx->current_offset += ret;
1479 ctx->current_len -= ret;
/* Non-zero (failure) when the last read failed or bytes are still missing. */
1481 return (ret < 0 || ctx->current_len != 0);
/*
 * Performs one splice() between `pipe` and the file backing export offset
 * `a`. The descriptor, the offset inside it and the byte limit come from
 * get_filepos(); a maxbytes of 0 means "no limit". With dir == SPLICE_IN
 * data moves from the file into the pipe, otherwise from the pipe into the
 * file.
 */
1485 int rawexpsplice(int pipe, off_t a, size_t len, CLIENT *client, int dir,
1493 if (get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1495 if (maxbytes && len > maxbytes)
1498 DEBUG("(SPLICE %s fd %d offset %llu len %u), ",
1499 (dir == SPLICE_IN) ? "from" : "to", fhandle,
1500 (unsigned long long)a, (unsigned)len);
1503 * SPLICE_F_MOVE doesn't actually work at the moment, but in the future
1504 * it might, so go ahead and use it.
1506 if (dir == SPLICE_IN) {
1507 retval = splice(fhandle, &foffset, pipe, NULL, len,
1510 retval = splice(pipe, NULL, fhandle, &foffset, len,
1512 if (client->server->flags & F_SYNC)
1517 if (client->server->flags & F_TREEFILES)
1523 * Splice an amount of bytes from the given offset from/into the right file
1524 * from/into the given pipe.
1525 * @param pipe The pipe we are using for this splice.
1526 * @param a The offset of the file we are operating on.
1527 * @param len The length of the splice.
1528 * @param client The client we're splicing for.
1529 * @param dir The direction we are doing the splice in.
1530 * @param fua Set if this is a write and we need to fua.
1531 * @return 0 on success, nonzero on failure.
1533 int expsplice(int pipe, off_t a, size_t len, CLIENT *client, int dir, int fua)
1538 (ret = rawexpsplice(pipe, a, len, client, dir, fua)) > 0) {
/* Non-zero (failure) when the last rawexpsplice() call failed or some of
 * the requested length was left unspliced. */
1542 return (ret < 0 || len != 0);
1544 #endif /* HAVE_SPLICE */
1547 * Read an amount of bytes at a given offset from the right file. This
1548 * abstracts the read-side of the copyonwrite stuff, and calls
1549 * rawexpread() with the right parameters to do the actual work.
1550 * @param ctx The read context: the read starts at ctx->req->from plus
1551 * ctx->current_offset and covers ctx->req->len bytes; the data ends up in
1552 * the buffer handed out by find_read_buf()
1553 * @param client The client we're going to read for
1554 * @return 0 on success, nonzero on failure
1556 int expread(READ_CTX *ctx, CLIENT *client) {
/* Serves a read either directly from the export or, page by page, from the
 * diff file for pages recorded in client->difmap. */
1557 off_t rdlen, offset;
1558 off_t mapcnt, mapl, maph, pagestart;
1559 off_t a = (off_t)ctx->current_offset + (off_t)ctx->req->from;
1560 size_t len = (size_t) ctx->req->len;
1563 DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
/* Fast path: no copy-on-write, and not in the "F_WAIT with no export yet"
 * state, so the diff map is not consulted at all. */
1565 if (!(client->server->flags & F_COPYONWRITE) && !((client->server->flags & F_WAIT) && (client->export == NULL)))
1566 return(rawexpread_fully(ctx, client));
/* First and last DIFFPAGESIZE-sized page touched by the request. */
1568 mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1570 for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1571 pagestart=mapcnt*DIFFPAGESIZE;
/* Bytes to take from this page: the remaining length, capped at what is
 * left of the page after `offset`. */
1573 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1574 len : (size_t)DIFFPAGESIZE-offset;
/* export_lock is only taken when F_COPYONWRITE is not set. */
1575 if (!(client->server->flags & F_COPYONWRITE))
1576 pthread_rwlock_rdlock(&client->export_lock);
1577 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1578 DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1579 (unsigned long)(client->difmap[mapcnt]));
1580 char *buf = find_read_buf(ctx);
/* The difmap entry is the page's index inside the diff file.
 * NOTE(review): difmap entries are u32 — confirm the multiplication by
 * DIFFPAGESIZE is carried out in a 64-bit type. */
1581 if (pread(client->difffile, buf, rdlen, client->difmap[mapcnt]*DIFFPAGESIZE+offset) != rdlen) {
1584 confirm_read(client, ctx, rdlen);
1585 } else { /* the block is not there */
1586 if ((client->server->flags & F_WAIT) && (client->export == NULL)){
1587 DEBUG("Page %llu is not here, and waiting for file\n",
1588 (unsigned long long)mapcnt);
1591 DEBUG("Page %llu is not here, we read the original one\n",
1592 (unsigned long long)mapcnt);
/* Limit the raw read to this page's share of the request. */
1593 ctx->current_len = rdlen;
1594 if(rawexpread_fully(ctx, client)) goto fail;
1597 if (!(client->server->flags & F_COPYONWRITE))
1598 pthread_rwlock_unlock(&client->export_lock);
1599 len-=rdlen; a+=rdlen;
1604 if (!(client->server->flags & F_COPYONWRITE))
1605 pthread_rwlock_unlock(&client->export_lock);
1612 * Write an amount of bytes at a given offset to the right file. This
1613 * abstracts the write-side of the copyonwrite option, and calls
1614 * rawexpwrite() with the right parameters to do the actual work.
1616 * @param a The offset where the write should start
1617 * @param buf The buffer to write from
1618 * @param len The length of buf
1619 * @param client The client we're going to write for.
1620 * @param fua Flag to indicate 'Force Unit Access'
1621 * @return 0 on success, nonzero on failure
1623 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
/* Writes either directly to the export or, page by page, into the diff
 * file, recording each newly stored page in client->difmap. */
1624 char pagebuf[DIFFPAGESIZE];
1625 off_t mapcnt,mapl,maph;
1630 DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
/* Fast path: no copy-on-write, and not in the "F_WAIT with no export yet"
 * state, so the data goes straight to the export. */
1633 if (!(client->server->flags & F_COPYONWRITE) && !((client->server->flags & F_WAIT) && (client->export == NULL)))
1634 return(rawexpwrite_fully(a, buf, len, client, fua));
/* First and last DIFFPAGESIZE-sized page touched by the request. */
1636 mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1638 for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1639 pagestart=mapcnt*DIFFPAGESIZE ;
1640 offset=a-pagestart ;
/* Bytes that go into this page: the remaining length, capped at what is
 * left of the page after `offset`. */
1641 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1642 len : (size_t)DIFFPAGESIZE-offset;
/* export_lock is only taken when F_COPYONWRITE is not set. */
1644 if (!(client->server->flags & F_COPYONWRITE))
1645 pthread_rwlock_rdlock(&client->export_lock);
1646 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1647 DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1648 (unsigned long)(client->difmap[mapcnt])) ;
1649 if (pwrite(client->difffile, buf, wrlen, client->difmap[mapcnt]*DIFFPAGESIZE+offset) != wrlen) goto fail;
1650 } else { /* the block is not there */
/* Pick the page's slot in the diff file: with F_SPARSE the page keeps its
 * export page number, otherwise it takes the next free slot and
 * difffilelen grows by one. */
1651 client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1652 DEBUG("Page %llu is not here, we put it at %lu\n",
1653 (unsigned long long)mapcnt,
1654 (unsigned long)(client->difmap[mapcnt]));
/* A partial-page write needs the rest of the page from the original export. */
1655 if ((offset != 0) || (wrlen != DIFFPAGESIZE)){
1656 if ((client->server->flags & F_WAIT) && (client->export == NULL)){
1657 DEBUG("error: we can write only whole page while waiting for file\n");
1662 char *ptr = pagebuf;
1663 while(rdlen > 0 && (ret = rawexpread(pagestart, ptr, rdlen, client)) > 0) {
1668 if(ret < 0 ) goto fail;
/* Overlay the new data onto the page image, then store the whole page.
 * NOTE(review): write() uses the descriptor's current position; the
 * positioning step is not visible here — confirm it precedes this call. */
1670 memcpy(pagebuf+offset,buf,wrlen) ;
1671 if (write(client->difffile, pagebuf, DIFFPAGESIZE) != DIFFPAGESIZE)
1674 if (!(client->server->flags & F_COPYONWRITE))
1675 pthread_rwlock_unlock(&client->export_lock);
1676 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
/* With F_SYNC the diff file is flushed with fsync(). */
1678 if (client->server->flags & F_SYNC) {
1679 fsync(client->difffile);
1681 /* open question: would it be cheaper to do multiple sync_file_ranges?
1682 as we iterate through the above?
1684 fdatasync(client->difffile);
1688 if (!(client->server->flags & F_COPYONWRITE))
1689 pthread_rwlock_unlock(&client->export_lock);
1695 * Write an amount of zeroes at a given offset to the right file.
1696 * This routine could be optimised by not calling expwrite. However,
1697 * this is by far the simplest way to do it.
1699 * @param req the request
1700 * @param client The client we're going to write for.
1701 * @return 0 on success, nonzero on failure
1703 int expwrite_zeroes(struct nbd_request* req, CLIENT* client, int fua) {
/* Writes zeroes over the range described by req (req->from, req->len) by
 * pushing a zero-filled scratch buffer through expwrite(); fua is passed on
 * unchanged. */
1704 off_t a = req->from;
1705 size_t len = req->len;
/* Size of the scratch buffer: 64 MiB. */
1706 size_t maxsize = 64LL*1024LL*1024LL;
1707 /* use calloc() as sadly MAP_ANON is apparently not POSIX standard */
/* NOTE(review): the full maxsize is allocated here whatever req->len is —
 * confirm that is intended for small requests. */
1708 char *buf = calloc (1, maxsize);
1714 ret = expwrite(a, buf, l, client, fua);
1726 * Flush data to a client
1728 * @param client The client we're going to write for.
1729 * @return 0 on success, nonzero on failure
1731 int expflush(CLIENT *client) {
/* Flushes whatever currently receives this client's writes. Which
 * descriptor(s) are synced depends on the export's flags. */
/* Copy-on-write exports: fsync() the diff file. */
1734 if (client->server->flags & F_COPYONWRITE) {
1735 return fsync(client->difffile);
/* F_WAIT exports: likewise fsync() the diff file. */
1738 if (client->server->flags & F_WAIT) {
1739 return fsync(client->difffile);
1742 if (client->server->flags & F_TREEFILES ) {
1743 // all we can do is force sync the entire filesystem containing the tree
1744 if (client->server->flags & F_READONLY)
/* Otherwise every file listed in client->export is fsync()ed in turn. */
1750 for (i = 0; i < client->export->len; i++) {
1751 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1752 if (fsync(fi.fhandle) < 0)
/**
 * Try to deallocate ("punch a hole" in) a byte range of a file or device.
 *
 * Several platform mechanisms appear below: fallocate() with
 * FALLOC_FL_PUNCH_HOLE, the BLKDISCARD ioctl, and FSCTL_SET_ZERO_DATA.
 * The function returns nothing; problems are only reported through DEBUG().
 *
 * @param fd the file descriptor to operate on
 * @param off start of the range, in bytes
 * @param len length of the range, in bytes
 **/
1759 void punch_hole(int fd, off_t off, off_t len) {
1760 DEBUG("Request to punch a hole in fd=%d, starting from %llu, length %llu\n", fd, (unsigned long long)off, (unsigned long long)len);
1762 // fallocate -- files, Linux
/* FALLOC_FL_KEEP_SIZE is passed along with FALLOC_FL_PUNCH_HOLE. The call
 * is retried while errno is EINTR. */
1765 if(fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len) == 0)
1767 } while(errno == EINTR);
1769 // ioctl(BLKDISCARD) -- block devices, Linux
/* BLKDISCARD takes a {start, length} pair; also retried on EINTR. */
1771 uint64_t range[2] = {off, len};
1773 if(ioctl(fd, BLKDISCARD, range) == 0)
1775 } while(errno == EINTR);
1778 #if HAVE_FSCTL_SET_ZERO_DATA
/* FSCTL_SET_ZERO_DATA is given the range as [FileOffset, BeyondFinalZero). */
1779 FILE_ZERO_DATA_INFORMATION zerodata;
1780 zerodata.FileOffset.QuadPart = off;
1781 zerodata.BeyondFinalZero.QuadPart = off + len;
1782 HANDLE w32handle = (HANDLE)_get_osfhandle(fd);
1784 DeviceIoControl(w32handle, FSCTL_SET_ZERO_DATA, &zerodata, sizeof(zerodata), NULL, 0, &bytesret, NULL);
1788 DEBUG("punching holes failed: %s", strerror(errno));
1790 DEBUG("punching holes not supported on this platform\n");
/**
 * Send a reply to an option request during negotiation: a packed header
 * followed by optional data.
 *
 * @param client the client to send to
 * @param opt the option being replied to
 * @param reply_type the reply type code
 * @param datasize length of data; callers in this file pass -1 together
 * with a NUL-terminated string, in which case strlen() supplies the length
 * (presumably guarded by a check on a line not shown here)
 * @param data the reply payload, if any
 **/
1794 static void send_reply(CLIENT* client, uint32_t opt, uint32_t reply_type, ssize_t datasize, void* data) {
1798 uint32_t reply_type;
/* Packed so the struct can be written to the socket as-is. */
1800 } __attribute__ ((packed)) header = {
1801 htonll(0x3e889045565a9LL),
1807 datasize = strlen((char*)data);
/* The length travels in network byte order. */
1808 header.datasize = htonl(datasize);
1810 socket_write(client, &header, sizeof(header));
1812 socket_write(client, data, datasize);
1817 * Find the name of the file we have to serve. This will use g_strdup_printf
1818 * to put the IP address of the client inside a filename containing
1819 * "%s" (in the form as specified by the "virtstyle" option). That name
1820 * is then written to client->exportname.
1822 * @param net A socket connected to an nbd client
1823 * @param client information about the client. The IP address in human-readable
1824 * format will be written to a new char* buffer, the address of which will be
1825 * stored in client->clientname.
1826 * @return: 0 - OK, -1 - failed.
1828 int set_peername(int net, CLIENT *client) {
/* Determines the peer's address string, derives client->exportname from
 * the server's exportname according to the configured virtstyle, and stores
 * the peer string in client->clientname. */
1829 struct sockaddr_storage netaddr;
1830 struct sockaddr* addr = (struct sockaddr*)&netaddr;
1831 socklen_t addrinlen = sizeof( struct sockaddr_storage );
1832 struct addrinfo hints;
1833 struct addrinfo *ai = NULL;
1834 char peername[NI_MAXHOST];
1835 char netname[NI_MAXHOST];
/* The local socket address is only used to learn the address family. */
1840 if (getsockname(net, addr, &addrinlen) < 0) {
1841 msg(LOG_INFO, "getsockname failed: %m");
/* Unix-domain sockets get the fixed peer name "unix". */
1845 if(netaddr.ss_family == AF_UNIX) {
1846 client->clientaddr.ss_family = AF_UNIX;
1847 strcpy(peername, "unix");
1849 if (getpeername(net, (struct sockaddr *) &(client->clientaddr), &addrinlen) < 0) {
1850 msg(LOG_INFO, "getpeername failed: %m");
/* NI_NUMERICHOST: the peer is rendered as a numeric address string. */
1853 if((e = getnameinfo((struct sockaddr *)&(client->clientaddr), addrinlen,
1854 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST))) {
1855 msg(LOG_INFO, "getnameinfo failed: %s", gai_strerror(e));
1859 memset(&hints, '\0', sizeof (hints));
1860 hints.ai_flags = AI_ADDRCONFIG;
1861 e = getaddrinfo(peername, NULL, &hints, &ai);
1864 msg(LOG_INFO, "getaddrinfo failed: %s", gai_strerror(e));
/* Strip the "::ffff:" prefix of an IPv4-mapped IPv6 address.
 * NOTE(review): the copy length is strlen(peername), i.e. 7 bytes more than
 * the remaining string plus its terminator; it stays inside the buffer but
 * strlen(peername + 7) + 1 looks like the intended length — confirm. */
1870 if(strncmp(peername, "::ffff:", 7) == 0) {
1871 memmove(peername, peername+7, strlen(peername));
1874 switch(client->server->virtstyle) {
1876 msg(LOG_DEBUG, "virtualization is off");
1877 client->exportname=g_strdup(client->server->exportname);
1880 msg(LOG_DEBUG, "virtstyle iphash");
1881 for(i=0;i<strlen(peername);i++) {
1882 if(peername[i]=='.') {
1888 msg(LOG_DEBUG, "virtstyle ipliteral");
/* NOTE(review): the configured exportname is used as the printf format
 * string here and in the cidr case below — confirm it is validated to
 * contain exactly one %s. */
1889 client->exportname=g_strdup_printf(client->server->exportname, peername);
1892 msg(LOG_DEBUG, "virtstyle cidr %d", client->server->cidrlen);
/* Work on a copy of the peer address so that it can be masked. */
1893 memcpy(&netaddr, &(client->clientaddr), addrinlen);
1895 if(client->clientaddr.ss_family == AF_UNIX) {
1896 tmp = g_strdup(peername);
1898 assert((ai->ai_family == AF_INET) || (ai->ai_family == AF_INET6));
1899 if(ai->ai_family == AF_INET) {
1901 } else if(ai->ai_family == AF_INET6) {
1904 g_assert_not_reached();
/* NOTE(review): for AF_INET/AF_INET6 sockaddrs sa_data typically begins
 * with the port field rather than the address — confirm that walking the
 * mask from this pointer hits the address bytes. */
1906 uint8_t* addrptr = (uint8_t*)(((struct sockaddr*)&netaddr)->sa_data);
/* One mask byte per 8 address bits; bits beyond cidrlen give masklen 0. */
1907 for(int i = 0; i < addrbits; i+=8) {
1908 int masklen = client->server->cidrlen - i;
1909 masklen = masklen > 0 ? masklen : 0;
1910 uint8_t mask = getmaskbyte(masklen);
/* The masked address is rendered numerically; the name substituted into
 * the export name is "<network>/<peer>". */
1914 getnameinfo((struct sockaddr *) &netaddr, addrinlen,
1915 netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1916 tmp=g_strdup_printf("%s/%s", netname, peername);
1920 client->exportname=g_strdup_printf(client->server->exportname, tmp);
1930 msg(LOG_INFO, "connect from %s, assigned file is %s",
1931 peername, client->exportname);
/* clientname receives a newly allocated copy of the peer string. */
1932 client->clientname=g_strdup(peername);
/**
 * Copy pages recorded in the diff file back into a target file.
 *
 * Walks all export pages; every page whose difmap entry is set is read from
 * the diff file, written to fhandle at the page's export offset, and its
 * difmap entry is reset to (u32)-1.
 *
 * @param client the client whose diff file and difmap are used
 * @param lock passed as true by wait_file() for its per-page-locking passes
 * and false for the pass it runs while already holding export_lock
 * @param fhandle descriptor of the file receiving the pages
 * @return wait_file() treats the result as a count of dirty pages and
 * repeats the call while it is greater than zero
 **/
1936 int commit_diff(CLIENT* client, bool lock, int fhandle){
/* Whole pages only: a trailing partial page is not counted. */
1938 int pagecount = client->exportsize/DIFFPAGESIZE;
/* Scratch buffer for one page. */
1940 char* buf = malloc(sizeof(char)*DIFFPAGESIZE);
1942 for (int i=0; i<pagecount; i++){
1943 offset = DIFFPAGESIZE*i;
1945 pthread_rwlock_wrlock(&client->export_lock);
/* (u32)-1 marks a page that is not present in the diff file. */
1946 if (client->difmap[i] != (u32)-1){
1948 DEBUG("flushing dirty page %d, offset %ld\n", i, offset);
1949 if (pread(client->difffile, buf, DIFFPAGESIZE, client->difmap[i]*DIFFPAGESIZE) != DIFFPAGESIZE) {
1950 msg(LOG_WARNING, "could not read while committing diff: %m");
1952 pthread_rwlock_unlock(&client->export_lock);
1956 if (pwrite(fhandle, buf, DIFFPAGESIZE, offset) != DIFFPAGESIZE) {
1957 msg(LOG_WARNING, "could not write while committing diff: %m");
1959 pthread_rwlock_unlock(&client->export_lock);
/* The page now lives in the target file; forget its diff-file copy. */
1963 client->difmap[i] = (u32)-1;
1966 pthread_rwlock_unlock(&client->export_lock);
/**
 * Thread body for exports configured with F_WAIT: waits until the export
 * file can be opened, copies the pages collected in the diff file into it,
 * and then installs it as the client's export.
 *
 * @param void_ptr the CLIENT this thread works for
 **/
1973 void* wait_file(void *void_ptr) {
1974 CLIENT* client = (CLIENT *)void_ptr;
/* NOTE(review): this holds open(2) flags although it is declared mode_t —
 * confirm the type is wide enough on all targets. */
1977 mode_t mode = O_RDWR;
/* Each post of file_wait_sem (done by the SIGUSR1 handler) triggers one
 * attempt to open the file.
 * NOTE(review): the loop condition treats descriptor 0 as "not opened" —
 * confirm that is intended. */
1983 while (fi.fhandle < 1){
1984 sem_wait(&file_wait_sem);
1985 msg(LOG_INFO, "checking for file %s", client->server->exportname);
1986 fi.fhandle = open(client->server->exportname, mode);
1989 msg(LOG_INFO, "File %s appeared, fd %d", client->server->exportname, fi.fhandle);
1991 // first time there may be lot of data so we lock only per page
1993 dirtycount = commit_diff(client, true, fi.fhandle);
1994 } while (dirtycount > 0);
1996 //last time we lock export for the whole time until we switch write destination
1997 pthread_rwlock_wrlock(&client->export_lock);
1999 dirtycount = commit_diff(client, false, fi.fhandle);
2000 } while (dirtycount > 0);
/* Publish a one-element export array while still holding the write lock. */
2002 export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
2003 g_array_append_val(export, fi);
2005 client->export = export;
2006 pthread_rwlock_unlock(&client->export_lock);
2007 msg(LOG_INFO, "Waiting for file ended, switching to exported file %s", client->server->exportname);
2013 * Set up client export array, which is an array of FILE_INFO.
2014 * Also, split a single exportfile into multiple ones, if that was asked.
2015 * @param client information on the client which we want to setup export for
2017 bool setupexport(CLIENT* client) {
/* Builds client->export and client->exportsize according to the export's
 * flags: treefiles and F_WAIT exports get no file array up front, all
 * other exports open one file, or a numbered series in multi-file mode. */
2019 off_t laststartoff = 0, lastsize = 0;
2020 int multifile = (client->server->flags & F_MULTIFILE);
2021 int treefile = (client->server->flags & F_TREEFILES);
/* Temporary files and file creation are both disabled in multi-file mode. */
2022 int temporary = (client->server->flags & F_TEMPORARY) && !multifile;
2023 int cancreate = (client->server->expected_size) && !multifile;
2025 if (treefile || (client->server->flags & F_WAIT)) {
2026 client->export = NULL; // this could be thousands of files so we open handles on demand although its slower
2027 client->exportsize = client->server->expected_size; // available space is not checked, as it could change during runtime anyway
/* F_WAIT exports start a thread running wait_file() for this client. */
2029 if(client->server->flags & F_WAIT){
2030 pthread_t wait_file_thread;
2031 if (pthread_create(&wait_file_thread, NULL, wait_file, client)){
2032 DEBUG("failed to create wait_file thread");
2038 client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
2040 /* If multi-file, open as many files as we can.
2041 * If not, open exactly one file.
2042 * Calculate file sizes as we go to get total size. */
2045 _cleanup_g_free_ gchar *tmpname = NULL;
2046 _cleanup_g_free_ gchar* error_string = NULL;
2050 /* if expected_size is specified, and this is the first file, we can create the file */
/* NOTE(review): this holds open(2) flags although it is declared mode_t. */
2051 mode_t mode = (client->server->flags & F_READONLY) ?
2052 O_RDONLY : (O_RDWR | (cancreate?O_CREAT:0));
/* Name patterns used below: "<exportname>.<i>-XXXXXX" goes to mkstemp(),
 * "<exportname>.<i>" carries the file index, and the plain exportname is
 * used as-is. */
2055 tmpname=g_strdup_printf("%s.%d-XXXXXX", client->exportname, i);
2056 DEBUG( "Opening %s\n", tmpname );
2057 fi.fhandle = mkstemp(tmpname);
2060 tmpname=g_strdup_printf("%s.%d", client->exportname, i);
2062 tmpname=g_strdup(client->exportname);
2064 DEBUG( "Opening %s\n", tmpname );
2065 fi.fhandle = open(tmpname, mode, 0600);
/* The retry only happens when mode is exactly O_RDWR, i.e. without O_CREAT. */
2066 if(fi.fhandle == -1 && mode == O_RDWR) {
2067 /* Try again because maybe media was read-only */
2068 fi.fhandle = open(tmpname, O_RDONLY);
2069 if(fi.fhandle != -1) {
2070 /* Opening the base file in copyonwrite mode is
2072 if(!(client->server->flags & F_COPYONWRITE)) {
2073 client->server->flags |= F_AUTOREADONLY;
2074 client->server->flags |= F_READONLY;
2079 if(fi.fhandle == -1) {
2080 if(multifile && i>0)
2082 error_string=g_strdup_printf(
2083 "Could not open exported file %s: %%m",
2085 err_nonfatal(error_string);
2090 unlink(tmpname); /* File will stick around whilst FD open */
/* Files are laid end to end: each starts where the previous one ended. */
2093 fi.startoff = laststartoff + lastsize;
2094 g_array_append_val(client->export, fi);
2096 /* Starting offset and size of this file will be used to
2097 * calculate starting offset of next file */
2098 laststartoff = fi.startoff;
2099 lastsize = size_autodetect(fi.fhandle);
2101 /* If we created the file, it will be length zero */
2102 if (!lastsize && cancreate) {
2104 if(ftruncate (fi.fhandle, client->server->expected_size)<0) {
2105 err_nonfatal("Could not expand file: %m");
2108 lastsize = client->server->expected_size;
2109 break; /* don't look for any more files */
2112 if(!multifile || temporary)
2116 /* Set export size to total calculated size */
2117 client->exportsize = laststartoff + lastsize;
2119 /* Export size may be overridden */
2120 if(client->server->expected_size) {
2121 /* desired size must be <= total calculated size */
2122 if(client->server->expected_size > client->exportsize) {
2123 err_nonfatal("Size of exported file is too big\n");
2127 client->exportsize = client->server->expected_size;
2131 msg(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
2133 msg(LOG_INFO, "Total number of files: %d", i);
/* Rounded-up count of TREEPAGESIZE-sized pieces covering the export. */
2136 msg(LOG_INFO, "Total number of (potential) files: %" PRId64, (client->exportsize+TREEPAGESIZE-1)/TREEPAGESIZE);
/**
 * Create the diff file and the in-memory page map (difmap) for a client.
 *
 * The diff file is placed in the server's cowdir when one is configured,
 * otherwise in the directory of the export. Every difmap entry is
 * initialised to (u32)-1.
 *
 * @param client the client to prepare; exportname, clientname and
 * exportsize are read here
 **/
2141 bool copyonwrite_prepare(CLIENT* client) {
2143 _cleanup_g_free_ gchar* dir = NULL;
2144 _cleanup_g_free_ gchar* export_base = NULL;
2145 if (client->server->cowdir != NULL) {
2146 dir = g_strdup(client->server->cowdir);
/* NOTE(review): POSIX allows dirname() and basename() to modify the string
 * they are given; both are applied directly to client->exportname here —
 * confirm a private copy is not needed. */
2148 dir = g_strdup(dirname(client->exportname));
2150 export_base = g_strdup(basename(client->exportname));
2151 client->difffilename = g_strdup_printf("%s/%s-%s-%d.diff",dir,export_base,client->clientname,
2153 msg(LOG_INFO, "About to create map and diff file %s", client->difffilename) ;
/* O_TRUNC: any existing file of that name is emptied. */
2154 client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
2155 if (client->difffile<0) {
2156 err("Could not create diff file (%m)");
/* One u32 entry per whole DIFFPAGESIZE page.
 * NOTE(review): the division truncates, so an export whose size is not a
 * multiple of DIFFPAGESIZE has no entry for its final partial page, while
 * expread()/expwrite() index difmap by (a+len-1)/DIFFPAGESIZE — confirm. */
2159 if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL) {
2160 err("Could not allocate memory");
2163 for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1;
2168 void send_export_info(CLIENT* client, SERVER* server, bool maybe_zeroes) {
2169 uint64_t size_host = htonll((u64)(client->exportsize));
2170 uint16_t flags = NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_WRITE_ZEROES;
2172 socket_write(client, &size_host, 8);
2173 if (server->flags & F_READONLY)
2174 flags |= NBD_FLAG_READ_ONLY;
2175 if (server->flags & F_FLUSH)
2176 flags |= NBD_FLAG_SEND_FLUSH;
2177 if (server->flags & F_FUA)
2178 flags |= NBD_FLAG_SEND_FUA;
2179 if (server->flags & F_ROTATIONAL)
2180 flags |= NBD_FLAG_ROTATIONAL;
2181 if (server->flags & F_TRIM)
2182 flags |= NBD_FLAG_SEND_TRIM;
2183 if (!(server->flags & F_COPYONWRITE))
2184 flags |= NBD_FLAG_CAN_MULTI_CONN;
2185 if (client->clientflags & F_STRUCTURED)
2186 flags |= NBD_FLAG_SEND_DF;
2187 flags = htons(flags);
2188 socket_write(client, &flags, sizeof(flags));
2189 if (!(glob_flags & F_NO_ZEROES) && maybe_zeroes) {
2191 memset(zeros, '\0', sizeof(zeros));
2192 socket_write(client, zeros, 124);
2197 * Setup the transaction log
2199 * The function does all things required for the transaction log:
2200 * - Create a new log file.
2201 * - allocate the posix semaphore for synchronization.
2202 * - Report if a log file already exists.
2203 * - If needed add a header to the log.
2205 * If something goes wrong, logging is disabled.
2207 * @param client the CLIENT structure with .server and .net members set
2210 static void setup_transactionlog(CLIENT *client) {
/* On any failure below the descriptor is closed (if open) and
 * client->transactionlogfd is left at -1, which disables logging. */
2214 /* 1) create the file */
2215 if((client->transactionlogfd =
2216 open(client->server->transactionlog,
2218 S_IRUSR | S_IWUSR)) ==
2220 msg(LOG_INFO, "Could not open transactionlog %s, moving on without it",
2221 client->server->transactionlog);
2225 /* 2) If needed, write flags */
/* With F_DATALOG a marker record shaped like an nbd_request is written
 * first; all of its fields are stored in network byte order. */
2226 if (client->server->flags & F_DATALOG) {
2227 struct nbd_request req;
2230 req.magic = htonl(NBD_TRACELOG_MAGIC);
2231 req.type = htonl(NBD_TRACELOG_SET_DATALOG);
2233 req.from = htonll(NBD_TRACELOG_FROM_MAGIC);
2234 req.len = htonl(TRUE);
2236 ret = writeit(client->transactionlogfd, &req, sizeof(struct nbd_request));
2238 msg(LOG_INFO, "Could not write to transactionlog %s, moving on without it",
2239 client->server->transactionlog);
2240 close(client->transactionlogfd);
2241 client->transactionlogfd = -1;
2246 /* 3) Allocate the semaphore used for locking */
/* The semaphore name is derived from the log file's device and inode
 * numbers, so it is determined by the file itself rather than by the
 * process that opens it. */
2247 ret = fstat(client->transactionlogfd, &fdinfo);
2249 msg(LOG_INFO, "Could not stat transactionlog %s, moving on without it",
2250 client->server->transactionlog);
2251 close(client->transactionlogfd);
2252 client->transactionlogfd = -1;
2255 snprintf(client->semname, sizeof(client->semname), "/nbd-server-%llx-%llx",
2256 (unsigned long long)fdinfo.st_dev,
2257 (unsigned long long)fdinfo.st_ino);
/* O_CREAT without O_EXCL: an existing semaphore of that name is reused;
 * a newly created one starts at 1. */
2258 client->logsem = sem_open(client->semname, O_CREAT, 0600, 1);
2259 if (client->logsem == SEM_FAILED) {
2260 msg(LOG_INFO, "Could not allocate semaphore for transactionlog %s, moving on without it",
2261 client->server->transactionlog);
2262 close(client->transactionlogfd);
2263 client->transactionlogfd = -1;
2268 * Commit to exporting the chosen export
2270 * When a client sends NBD_OPT_EXPORT_NAME or NBD_OPT_GO, we need to do
2271 * a number of things (verify whether the client is allowed access, try
2272 * to open files, etc etc) before we're ready to actually serve the
2275 * This function does all those things.
2277 * @param client the CLIENT structure with .server and .net members set
2279 * @return true if the client is allowed access to the export, false
2282 static bool commit_client(CLIENT* client, SERVER* server) {
/* Takes a reference on the chosen export and resets the per-client state
 * before any checks are made. */
2286 client->server = serve_inc_ref(server);
2287 client->exportsize = OFFT_MAX;
2288 client->transactionlogfd = -1;
2289 if(pthread_mutex_init(&(client->lock), NULL)) {
2290 msg(LOG_ERR, "Unable to initialize mutex");
2293 if (pthread_rwlock_init(&client->export_lock, NULL)){
2294 msg(LOG_ERR, "Unable to initialize write lock");
2297 /* Check whether we exceeded the maximum number of allowed
2298 * clients already */
/* The export name is sent over commsocket as a length followed by the name
 * bytes; a one-byte verdict is read back. */
2302 len = strlen(client->server->servename);
2303 writeit(commsocket, &len, sizeof len);
2304 writeit(commsocket, client->server->servename, len);
2305 readit(commsocket, &acl, 1);
2310 msg(LOG_ERR, "Connection not allowed (too many clients)");
2313 msg(LOG_ERR, "Connection not allowed (unknown by parent?!?)");
2317 /* Check whether the client is listed in the authfile */
2318 if (set_peername(client->net, client)) {
2319 msg(LOG_ERR, "Failed to set peername");
2323 if (!authorized_client(client)) {
2324 msg(LOG_INFO, "Client '%s' is not authorized to access",
2325 client->clientname);
2329 /* Set up the transactionlog, if we need one */
2330 if (client->server->transactionlog && (client->transactionlogfd == -1))
2331 setup_transactionlog(client);
2333 /* Run any pre scripts that we may need */
2334 if (do_run(client->server->prerun, client->exportname)) {
2335 msg(LOG_INFO, "Client '%s' not allowed access by prerun script",
2336 client->clientname);
2339 client->socket_closed = socket_closed_transmission;
2340 if(!setupexport(client)) {
/* Both copy-on-write and wait-for-file exports need the diff file and map.
 * NOTE(review): with F_COPYONWRITE and F_WAIT both set,
 * copyonwrite_prepare() appears to run twice — confirm that combination is
 * ruled out elsewhere. */
2344 if (client->server->flags & F_COPYONWRITE) {
2345 if(!copyonwrite_prepare(client)) {
2350 if (client->server->flags & F_WAIT) {
2351 if(!copyonwrite_prepare(client)) {
2356 setmysockopt(client->net);
/**
 * Handle NBD_OPT_EXPORT_NAME.
 *
 * Reads a 32-bit, network-order name length followed by the name itself,
 * then walks `servers` looking for an export whose servename matches.
 * Exports flagged F_FORCEDTLS are skipped while the client has no TLS
 * session. On a match the client's feature flags are recorded, the
 * client is committed to the export and the export info is sent.
 *
 * Names longer than 4096 bytes take a dedicated branch whose body is not
 * visible here. If the loop ends without a match, err() is called.
 */
2361 static CLIENT* handle_export_name(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2366 socket_read(client, &namelen, sizeof(namelen));
2367 namelen = ntohl(namelen);
2368 if(namelen > 4096) {
/* One extra byte is allocated beyond the name, presumably for a
 * terminating NUL -- TODO confirm. */
2372 name = malloc(namelen+1);
2374 socket_read(client, name, namelen);
2378 for(i=0; i<servers->len; i++) {
2379 SERVER* serve = (g_array_index(servers, SERVER*, i));
2380 // hide exports that are TLS-only if we haven't negotiated TLS
2382 if ((serve->flags & F_FORCEDTLS) && !client->tls_session) {
2385 if(!strcmp(serve->servename, name)) {
2386 client->clientfeats = cflags;
2388 if(!commit_client(client, serve)) {
2391 send_export_info(client, serve, true);
2396 err("Negotiation failed/8a: Requested export not found, or is TLS-only and client did not negotiate TLS");
2399 static void handle_list(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2403 char *ptr = buf + sizeof(len);
2405 socket_read(client, &len, sizeof(len));
2408 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_LIST with nonzero data length is not a valid request");
2410 if(!(glob_flags & F_LIST)) {
2411 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "Listing of exports denied by server configuration");
2412 err_nonfatal("Client tried disallowed list option");
2415 for(i=0; i<servers->len; i++) {
2416 SERVER* serve = (g_array_index(servers, SERVER*, i));
2417 // Hide TLS-only exports if we haven't negotiated TLS yet
2418 if(!client->tls_session && (serve->flags & F_FORCEDTLS)) {
2421 len = htonl(strlen(serve->servename));
2422 memcpy(buf, &len, sizeof(len));
2423 strncpy(ptr, serve->servename, sizeof(buf) - sizeof(len));
2424 send_reply(client, opt, NBD_REP_SERVER, strlen(serve->servename)+sizeof(len), buf);
2426 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
/**
 * Certificate verification callback for GnuTLS (installed by
 * handle_starttls() via gnutls_certificate_set_verify_function()).
 *
 * Verifies the peer with gnutls_certificate_verify_peers2(), then
 * imports the first certificate of the peer's list (DER format) and
 * compares its activation and expiration times against the current
 * time. The visible failure path logs an error and returns
 * GNUTLS_E_CERTIFICATE_ERROR; success is logged at LOG_INFO.
 */
2430 static int verify_cert(gnutls_session_t session) {
2432 unsigned int status, cert_list_size;
2433 const gnutls_datum_t *cert_list;
2434 gnutls_x509_crt_t cert;
2435 time_t now = time(NULL);
2437 ret = gnutls_certificate_verify_peers2(session, &status);
2438 if(ret < 0 || status != 0 || gnutls_certificate_type_get(session) !=
2443 if(gnutls_x509_crt_init(&cert) < 0) {
2447 cert_list = gnutls_certificate_get_peers(session, &cert_list_size);
2448 if(cert_list == NULL) {
/* Only the first entry of the peer's certificate list is examined. */
2451 if(gnutls_x509_crt_import(cert, &cert_list[0], GNUTLS_X509_FMT_DER) < 0) {
2454 if(gnutls_x509_crt_get_activation_time(cert) > now) {
2457 if(gnutls_x509_crt_get_expiration_time(cert) < now) {
2460 // TODO: check CRLs and/or OCSP etc. Patches welcome.
2461 msg(LOG_INFO, "client certificate verification successful");
2464 msg(LOG_ERR, "E: client certificate verification failed");
2465 return GNUTLS_E_CERTIFICATE_ERROR;
/**
 * Handle NBD_OPT_STARTTLS: upgrade the connection to TLS.
 *
 * A request carrying a payload is answered with NBD_REP_ERR_INVALID
 * after the payload has been consumed. Otherwise NBD_REP_ACK is sent,
 * credentials are loaded from genconf (cacertfile, certfile, keyfile,
 * tlsprio), a server session is created on client->net with a client
 * certificate requested, and the handshake is retried while it fails
 * with a non-fatal error. On success the session is stored in the
 * client and its read/write callbacks are switched to the TLS variants.
 *
 * The local check_rv() macro sets retval to NULL and jumps to the exit
 * label whenever the wrapped call returns a negative value.
 *
 * @return the client on success, NULL on failure (retval starts out as
 *         client and is set to NULL by check_rv())
 */
2468 CLIENT* handle_starttls(CLIENT* client, int opt, GArray* servers, uint32_t cflags, struct generic_conf *genconf) {
2469 #define check_rv(c) if((c)<0) { retval = NULL; goto exit; }
2470 gnutls_certificate_credentials_t x509_cred;
2471 CLIENT* retval = client;
2472 gnutls_priority_t priority_cache;
2473 gnutls_session_t *session = g_new0(gnutls_session_t, 1);
2477 socket_read(client, &len, sizeof(len));
2478 if(G_UNLIKELY(len != 0)) {
/* NOTE(review): this is a 1 MiB scratch buffer on the stack -- confirm
 * the stack limit of the calling context allows it. */
2479 char buf[1024*1024];
2480 consume(client, len, buf, sizeof(buf));
2481 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "Sending a STARTTLS command with data is invalid");
2485 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2487 check_rv(gnutls_certificate_allocate_credentials(&x509_cred));
2488 gnutls_certificate_set_verify_function(x509_cred, verify_cert);
2489 check_rv(gnutls_certificate_set_x509_trust_file(x509_cred, genconf->cacertfile, GNUTLS_X509_FMT_PEM));
2490 check_rv(gnutls_certificate_set_x509_key_file(x509_cred, genconf->certfile, genconf->keyfile, GNUTLS_X509_FMT_PEM));
2491 check_rv(gnutls_priority_init(&priority_cache, genconf->tlsprio, NULL));
2492 check_rv(gnutls_init(session, GNUTLS_SERVER));
2493 check_rv(gnutls_priority_set(*session, priority_cache));
2494 check_rv(gnutls_credentials_set(*session, GNUTLS_CRD_CERTIFICATE, x509_cred));
2496 gnutls_certificate_server_set_request(*session, GNUTLS_CERT_REQUEST);
/* Newer GnuTLS takes the descriptor as an int; older versions need it
 * cast to a transport pointer. */
2497 #if GNUTLS_VERSION_NUMBER >= 0x030109
2498 gnutls_transport_set_int(*session, client->net);
2500 gnutls_transport_set_ptr(*session, (gnutls_transport_ptr_t) (intptr_t) client->net);
2503 ret = gnutls_handshake(*session);
2504 } while(ret < 0 && gnutls_error_is_fatal(ret) == 0);
2507 err_nonfatal(gnutls_strerror(ret));
2508 gnutls_bye(*session, GNUTLS_SHUT_RDWR);
2509 gnutls_deinit(*session);
2513 client->tls_session = session;
2514 client->socket_read = socket_read_tls;
2515 client->socket_write = socket_write_tls;
2518 if(retval == NULL && session != NULL) {
2521 /* export names cannot be chosen before NBD_OPT_STARTTLS and be retained */
2522 if(retval != NULL && retval->server != NULL) {
2523 retval->server = NULL;
2530 * Handle an NBD_OPT_STRUCTURED_REPLY message
/**
 * Handle NBD_OPT_STRUCTURED_REPLY.
 *
 * Reads the option length. Per the error texts below, a request with a
 * payload is rejected with NBD_REP_ERR_INVALID (the payload is consumed)
 * and so is a second request once F_STRUCTURED is already set.
 * Otherwise F_STRUCTURED is set in client->clientflags and NBD_REP_ACK
 * is sent.
 */
2532 static void handle_structured_reply(CLIENT *client, uint32_t opt, GArray *servers, uint32_t cflags) {
2536 socket_read(client, &len, sizeof(len));
2539 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_STRUCTURED_REPLY with nonzero data length is not a valid request");
2541 consume(client, len, buf, sizeof buf);
2544 if(client->clientflags & F_STRUCTURED) {
2545 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_STRUCTURED_REPLY has already been called");
2548 client->clientflags |= F_STRUCTURED;
2549 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2553 * Handle an NBD_OPT_INFO or NBD_OPT_GO request.
/**
 * Handle NBD_OPT_INFO and NBD_OPT_GO.
 *
 * Wire format read below: option length, name length, name, a 16-bit
 * count of information requests, then that many 16-bit request codes.
 * The name is looked up in `servers`; a matching export flagged
 * F_FORCEDTLS is refused with NBD_REP_ERR_TLS_REQD while no TLS session
 * exists. When no usable export is found the pending requests are
 * consumed and an error reply (default NBD_REP_ERR_UNKNOWN, "Export
 * unknown") is sent.
 *
 * For NBD_OPT_GO the client is committed to the export first. Each
 * NBD_INFO_EXPORT request is answered with an NBD_REP_INFO reply; other
 * request codes are ignored. The second, unconditional-looking
 * NBD_INFO_EXPORT block near the end is presumably guarded by
 * sent_export -- TODO confirm. NBD_REP_ACK ends the exchange.
 *
 * NOTE(review): the return statements are not visible in this listing.
 */
2555 static bool handle_info(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2556 uint32_t namelen, len;
2559 SERVER *server = NULL;
2560 uint16_t n_requests;
2563 bool sent_export = false;
2564 uint32_t reptype = NBD_REP_ERR_UNKNOWN;
2565 char *msg = "Export unknown";
2567 socket_read(client, &len, sizeof(len));
2569 socket_read(client, &namelen, sizeof(namelen));
2570 namelen = htonl(namelen);
/* NOTE(review): len is unsigned, so (len - 6) wraps to a huge value when
 * len < 6 -- confirm a short request cannot slip past this check. */
2571 if(namelen > (len - 6)) {
2572 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "An OPT_INFO request cannot be smaller than the length of the name + 6");
2573 consume(client, len - sizeof(namelen), buf, sizeof(buf));
2575 if(namelen > 4096) {
2576 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "The name for this OPT_INFO request is too long");
2577 consume(client, namelen, buf, sizeof(buf));
2580 name = malloc(namelen + 1);
2582 send_reply(client, opt, reptype, -1, "nbd server out of memory");
2586 socket_read(client, name, namelen);
2590 for(i=0; i<servers->len; i++) {
2591 SERVER *serve = (g_array_index(servers, SERVER*, i));
2592 if (!strcmp(serve->servename, name)) {
2593 if ((serve->flags & F_FORCEDTLS) && !client->tls_session) {
2594 reptype = NBD_REP_ERR_TLS_REQD;
2595 msg = "TLS is required for that export";
2602 socket_read(client, &n_requests, sizeof(n_requests));
2603 n_requests = ntohs(n_requests);
2605 consume(client, n_requests * sizeof(request), buf,
2607 send_reply(client, opt, reptype, -1, msg);
2610 if (opt == NBD_OPT_GO) {
2611 client->clientfeats = cflags;
2612 if(!commit_client(client, server)) {
2613 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "Access denied by server configuration");
2617 for(i=0; i<n_requests; i++) {
2618 socket_read(client, &request, sizeof(request));
2619 switch(ntohs(request)) {
2620 case NBD_INFO_EXPORT:
/* The 12-byte reply is the 2-byte info type followed by whatever
 * send_export_info() writes. */
2621 send_reply(client, opt, NBD_REP_INFO, 12, NULL);
2622 socket_write(client, &request, 2);
2623 send_export_info(client, server, false);
2627 // ignore all other options for now.
2632 request = htons(NBD_INFO_EXPORT);
2633 send_reply(client, opt, NBD_REP_INFO, 12, NULL);
2634 socket_write(client, &request, 2);
2635 send_export_info(client, server, false);
2637 send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2643 * Do the initial negotiation.
2645 * @param net The socket we're doing the negotiation over.
2646 * @param servers The array of known servers.
2647 * @param genconf the global options (needed for accessing TLS config data)
/**
 * Run the newstyle option negotiation on a freshly accepted socket.
 *
 * Allocates a zeroed CLIENT with plain (non-TLS) read/write callbacks,
 * sends INIT_PASSWD, the option magic and the server handshake flags,
 * and reads the client's flags. It then loops: read the option magic
 * and option code and dispatch to the matching handle_* function, until
 * the option is NBD_OPT_EXPORT_NAME or NBD_OPT_ABORT.
 *
 * While glob_flags has F_FORCEDTLS set and no TLS session exists, every
 * option other than NBD_OPT_STARTTLS is refused: most get
 * NBD_REP_ERR_TLS_REQD, while NBD_OPT_EXPORT_NAME and NBD_OPT_ABORT
 * take dedicated branches whose bodies are not visible here.
 *
 * NOTE(review): the return statements and the cleanup of `client` on
 * failure are not visible in this listing.
 */
2649 CLIENT* negotiate(int net, GArray* servers, struct generic_conf *genconf) {
2650 uint16_t smallflags = NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES;
2652 uint32_t cflags = 0;
2654 CLIENT* client = g_new0(CLIENT, 1);
2656 client->socket_read = socket_read_notls;
2657 client->socket_write = socket_write_notls;
2658 client->socket_closed = socket_closed_negotiate;
2659 client->transactionlogfd = -1;
2660 client->logsem = SEM_FAILED;
2662 assert(servers != NULL);
2663 socket_write(client, INIT_PASSWD, 8);
2664 magic = htonll(opts_magic);
2665 socket_write(client, &magic, sizeof(magic));
2667 smallflags = htons(smallflags);
2668 socket_write(client, &smallflags, sizeof(uint16_t));
2669 socket_read(client, &cflags, sizeof(cflags));
/* cflags was just read from the wire; htonl() is used here for the
 * network-to-host conversion. */
2670 cflags = htonl(cflags);
2671 if (cflags & NBD_FLAG_C_NO_ZEROES) {
2672 glob_flags |= F_NO_ZEROES;
2675 socket_read(client, &magic, sizeof(magic));
2676 magic = ntohll(magic);
2677 if(magic != opts_magic) {
2678 err_nonfatal("Negotiation failed/5a: magic mismatch");
2681 socket_read(client, &opt, sizeof(opt));
2683 if(client->tls_session == NULL
2684 && glob_flags & F_FORCEDTLS
2685 && opt != NBD_OPT_STARTTLS) {
2686 if(opt == NBD_OPT_EXPORT_NAME) {
2687 // can't send an error message for EXPORT_NAME,
2688 // so must do hard close
2691 if(opt == NBD_OPT_ABORT) {
2695 consume_len(client);
2696 send_reply(client, opt, NBD_REP_ERR_TLS_REQD, -1, "TLS is required on this server");
2700 case NBD_OPT_EXPORT_NAME:
2701 // NBD_OPT_EXPORT_NAME must be the last
2702 // selected option, so return from here
2703 // if that is chosen.
2704 if(handle_export_name(client, opt, servers, cflags) != NULL) {
2711 handle_list(client, opt, servers, cflags);
2716 case NBD_OPT_STARTTLS:
2718 consume_len(client);
2719 send_reply(client, opt, NBD_REP_ERR_PLATFORM, -1, "This nbd-server was compiled without TLS support");
2721 if(client->tls_session != NULL) {
2722 consume_len(client);
2723 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "Invalid STARTTLS request: TLS has already been negotiated!");
/* A missing key file in the configuration is treated as "TLS not
 * allowed". */
2726 if(genconf->keyfile == NULL) {
2727 consume_len(client);
2728 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "TLS not allowed on this server");
2731 if(handle_starttls(client, opt, servers, cflags, genconf) == NULL) {
2732 // can't recover from failed TLS negotiation.
2735 // once TLS has been negotiated, any state must be cleared
2736 client->clientflags = 0;
2741 if(handle_info(client, opt, servers, cflags) && opt == NBD_OPT_GO) {
2745 case NBD_OPT_STRUCTURED_REPLY:
2746 handle_structured_reply(client, opt, servers, cflags);
2749 consume_len(client);
2750 send_reply(client, opt, NBD_REP_ERR_UNSUP, -1, "The given option is unknown to this server implementation");
2753 } while((opt != NBD_OPT_EXPORT_NAME) && (opt != NBD_OPT_ABORT));
2754 if(opt == NBD_OPT_ABORT) {
2755 err_nonfatal("Session terminated by client");
2758 err_nonfatal("Weird things happened: reached end of negotiation without success");
/**
 * Translate a local error code into the value placed in a reply's error
 * field. The result is already in network byte order (htonl()).
 *
 * Only two of the returns are visible in this listing: 28 (ENOSPC) and
 * 22 (EINVAL). The mapping of the remaining codes is not shown.
 */
2764 static int nbd_errno(int errcode) {
2779 return htonl(28); // ENOSPC
2781 return htonl(22); // EINVAL
/**
 * Release the resources held by a work package: close both pipe ends
 * and free the data buffer and the request.
 *
 * A pipe descriptor is closed only when it is greater than zero, so the
 * value 0 is treated as "no pipe".
 */
2785 static void package_dispose(struct work_package* package) {
2786 if (package->pipefd[0] > 0)
2787 close(package->pipefd[0]);
2788 if (package->pipefd[1] > 0)
2789 close(package->pipefd[1]);
2790 g_free(package->data);
2791 g_free(package->req);
/**
 * Prepare a pipe for splicing `len` bytes.
 *
 * Visible checks: `len` is compared against MAX_PIPE_SIZE, and the pipe
 * is resized with fcntl(F_SETPIPE_SZ); a resulting size below
 * MAX_PIPE_SIZE takes a separate branch. Callers treat a non-zero
 * return as failure. The pipe creation and the bodies of both branches
 * are not visible in this listing.
 */
2795 static int mkpipe(int pipefd[2], size_t len)
2797 if (len > MAX_PIPE_SIZE)
2803 if (fcntl(pipefd[1], F_SETPIPE_SZ, MAX_PIPE_SIZE) < MAX_PIPE_SIZE) {
/**
 * Allocate a zero-initialised work package for one request of `client`.
 *
 * For NBD_CMD_WRITE requests storage for the payload is prepared: with
 * F_SPLICE a pipe is set up through mkpipe(), and a heap buffer of
 * req->len bytes is allocated when that fails; the second malloc() is
 * presumably the non-splice branch -- TODO confirm, the `else` line is
 * not visible.
 *
 * NOTE(review): the results of calloc()/malloc() are not checked in the
 * visible lines.
 */
2815 struct work_package* package_create(CLIENT* client, struct nbd_request* req) {
2816 struct work_package* rv = calloc(sizeof (struct work_package), 1);
2819 rv->client = client;
2824 if((req->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
2825 if (client->server->flags & F_SPLICE) {
2826 if (mkpipe(rv->pipefd, req->len))
2827 rv->data = malloc(req->len);
2829 rv->data = malloc(req->len);
/**
 * Serve a read request by splicing data from the export through a pipe
 * into the client socket, avoiding a userspace copy.
 *
 * The attempt is abandoned when a TLS session is active, when mkpipe()
 * fails, or when splicing from the export into the pipe fails. On
 * success the reply header and the payload are written to client->net
 * while holding client->lock. handle_read() treats a zero return as
 * "handled" and falls back to the normal read path otherwise.
 */
2837 static int handle_splice_read(CLIENT *client, struct nbd_request *req)
2839 struct nbd_reply rep;
2842 // splice doesn't work with TLS
2843 if (client->tls_session != NULL)
2846 if (mkpipe(pipefd, req->len))
2849 if (expsplice(pipefd[1], req->from, req->len, client, SPLICE_IN, 0)) {
2855 DEBUG("handling read request (splice)\n");
2856 setup_reply(&rep, req);
2857 log_reply(client, &rep);
/* Header and payload are written under the same lock. */
2858 pthread_mutex_lock(&(client->lock));
2859 writeit(client->net, &rep, sizeof(rep));
2860 spliceit(pipefd[0], NULL, client->net, NULL, req->len);
2861 pthread_mutex_unlock(&(client->lock));
2868 static void handle_normal_read(CLIENT *client, struct nbd_request *req)
2870 DEBUG("handling read request\n");
2871 char read_failed[] = "Read failed";
2872 _cleanup_g_free_ READ_CTX *ctx = g_new0(READ_CTX, 1);
2874 ctx->current_len = req->len;
2876 char *errmsg = NULL;
2877 uint16_t msglen = 0;
2878 if(client->clientflags & F_STRUCTURED) {
2879 ctx->is_structured = 1;
2881 ctx->is_structured = 0;
2883 if(req->type & NBD_CMD_FLAG_DF != 0) {
2886 if(ctx->is_structured && ctx->df && req->len > (1 << 20)) {
2887 /* standard requires a minimum of 64KiB; we are more generous
2888 * by allowing up to 1MiB as our largest unfragmented answer */
2889 const char too_long[] = "Request too long for unfragmented reply";
2890 struct nbd_structured_error_payload pl;
2891 pl.error = NBD_EOVERFLOW;
2892 pl.msglen = sizeof too_long;
2893 send_structured_chunk_v(client, req, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, 6 + pl.msglen, 2, &pl, sizeof pl, too_long, sizeof too_long);
2896 if(ctx->df || !(ctx->is_structured)) {
2897 ctx->buf = malloc(req->len);
2899 err("Could not allocate memory for request");
2901 ctx->buflen = req->len;
2903 if(expread(ctx, client)) {
2904 DEBUG("Read failed: %m");
2905 error = nbd_errno(errno);
2906 errmsg = read_failed;
2907 msglen = sizeof read_failed;
2909 complete_read(client, ctx, error, errmsg, msglen, false, 0);
/**
 * Serve a read request: when the export has F_SPLICE set,
 * handle_splice_read() is tried first and a zero result is tested for;
 * handle_normal_read() is the regular path.
 */
2912 static void handle_read(CLIENT* client, struct nbd_request* req)
2916 * If we have splice set we want to try that first, and if that fails
2917 * for whatever reason we fall through to ye olde read.
2919 if (client->server->flags & F_SPLICE)
2920 if (!handle_splice_read(client, req))
2923 handle_normal_read(client, req);
/**
 * Execute a write request from a work package and send the reply.
 *
 * Two storage paths are visible: splicing from the package's pipe into
 * the export (expsplice) and writing the buffered payload (expwrite).
 * Which one is taken is decided on lines not visible here. A failure of
 * either stores nbd_errno(errno) in the reply. The FUA flag of the
 * request is passed on to expwrite(). The reply is logged and then
 * written while holding client->lock.
 */
2926 static void handle_write(struct work_package *pkg)
2928 CLIENT *client = pkg->client;
2929 struct nbd_request *req = pkg->req;
2930 struct nbd_reply rep;
/* Normalise the flag test to 0 or 1. */
2931 int fua = !!(req->type & NBD_CMD_FLAG_FUA);
2933 DEBUG("handling write request\n");
2934 setup_reply(&rep, req);
2938 if (expsplice(pkg->pipefd[0], req->from, req->len, client,
2940 DEBUG("Splice failed: %m");
2941 rep.error = nbd_errno(errno);
2946 if(expwrite(req->from, pkg->data, req->len, client, fua)) {
2947 DEBUG("Write failed: %m");
2948 rep.error = nbd_errno(errno);
2951 log_reply(client, &rep);
2952 pthread_mutex_lock(&(client->lock));
2953 socket_write(client, &rep, sizeof rep);
2954 pthread_mutex_unlock(&(client->lock));
/**
 * Execute a flush request: call expflush() on the client's export,
 * record nbd_errno(errno) in the reply when it fails, then log the
 * reply and send it while holding client->lock.
 */
2957 static void handle_flush(CLIENT* client, struct nbd_request* req) {
2958 struct nbd_reply rep;
2959 DEBUG("handling flush request\n");
2960 setup_reply(&rep, req);
2961 if(expflush(client)) {
2962 DEBUG("Flush failed: %m");
2963 rep.error = nbd_errno(errno);
2965 log_reply(client, &rep);
2966 pthread_mutex_lock(&(client->lock));
2967 socket_write(client, &rep, sizeof rep);
2968 pthread_mutex_unlock(&(client->lock));
/**
 * Execute a trim request: call exptrim() for the requested range,
 * record nbd_errno(errno) in the reply when it fails, then log the
 * reply and send it while holding client->lock.
 */
2971 static void handle_trim(CLIENT* client, struct nbd_request* req) {
2972 struct nbd_reply rep;
2973 DEBUG("handling trim request\n");
2974 setup_reply(&rep, req);
2975 if(exptrim(req, client)) {
2976 DEBUG("Trim failed: %m");
2977 rep.error = nbd_errno(errno);
2979 log_reply(client, &rep);
2980 pthread_mutex_lock(&(client->lock));
2981 socket_write(client, &rep, sizeof rep);
2982 pthread_mutex_unlock(&(client->lock));
/**
 * Execute a write-zeroes request: call expwrite_zeroes() with the
 * request's FUA flag, record nbd_errno(errno) in the reply when it
 * fails, then log the reply and send it while holding client->lock.
 */
2985 static void handle_write_zeroes(CLIENT* client, struct nbd_request* req) {
2986 struct nbd_reply rep;
2987 DEBUG("handling write_zeroes request\n");
2988 int fua = !!(req->type & NBD_CMD_FLAG_FUA);
2989 setup_reply(&rep, req);
2990 if(expwrite_zeroes(req, client, fua)) {
2991 DEBUG("Write_zeroes failed: %m");
2992 rep.error = nbd_errno(errno);
2994 // For now, don't trim
2995 // TODO: handle this far more efficiently with reference to the
2996 // actual backing driver
2997 log_reply(client, &rep);
2998 pthread_mutex_lock(&(client->lock));
2999 socket_write(client, &rep, sizeof rep);
3000 pthread_mutex_unlock(&(client->lock));
/**
 * Tell whether a modifying request must be refused because the export
 * is read-only: the check matches when either F_READONLY or
 * F_AUTOREADONLY is set in the server flags. `req` is not used in the
 * visible lines.
 */
3004 static bool bad_write(CLIENT* client, struct nbd_request* req) {
3005 if ((client->server->flags & F_READONLY) ||
3006 (client->server->flags & F_AUTOREADONLY)) {
3007 DEBUG("[WRITE to READONLY!]");
/**
 * Tell whether a request reaches outside the export: the check matches
 * when the start offset lies beyond exportsize, or when start offset
 * plus length does.
 *
 * The start offset is tested first, so the sum in the second operand is
 * only evaluated for offsets that are within the export.
 */
3013 static bool bad_range(CLIENT* client, struct nbd_request* req) {
3014 if(req->from > client->exportsize ||
3015 req->from + req->len > client->exportsize) {
3016 DEBUG("[out of bounds!]");
/**
 * Thread-pool worker: execute one work package.
 *
 * The request type is split into the command (NBD_CMD_MASK_COMMAND) and
 * the flag bits. Unknown flags are logged as an error. Each command is
 * validated with bad_range() and, for modifying commands, bad_write()
 * before the matching handle_* function runs. The block near the end
 * builds a reply carrying nbd_errno(err) and sends it under the client
 * lock, presumably for requests that failed validation -- TODO confirm,
 * the jumps are not visible. The package is disposed of at the end.
 *
 * @param data      the struct work_package to process
 * @param user_data unused in the visible lines
 */
3022 static void handle_request(gpointer data, gpointer user_data) {
3023 struct work_package* package = (struct work_package*) data;
3024 uint32_t type = package->req->type & NBD_CMD_MASK_COMMAND;
3025 uint32_t flags = package->req->type & ~NBD_CMD_MASK_COMMAND;
3026 struct nbd_reply rep;
/* NOTE(review): NBD_CMD_FLAG_DF is not part of this accepted mask --
 * confirm whether DF reads are meant to be flagged as invalid here. */
3029 if(flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) {
3030 msg(LOG_ERR, "E: received invalid flag %d on command %d, ignoring", flags, type);
3036 if (bad_range(package->client, package->req)) {
3039 handle_read(package->client, package->req);
3042 if (bad_write(package->client, package->req)) {
3046 if (bad_range(package->client, package->req)) {
3050 handle_write(package);
3053 handle_flush(package->client, package->req);
3056 if (bad_write(package->client, package->req)) {
3060 if (bad_range(package->client, package->req)) {
3063 handle_trim(package->client, package->req);
3065 case NBD_CMD_WRITE_ZEROES:
3066 if (bad_write(package->client, package->req)) {
3070 if (bad_range(package->client, package->req)) {
3074 handle_write_zeroes(package->client, package->req);
3077 msg(LOG_ERR, "E: received unknown command %d of type, ignoring", package->req->type);
3082 setup_reply(&rep, package->req);
3083 rep.error = nbd_errno(err);
3084 log_reply(package->client, &rep);
3085 pthread_mutex_lock(&(package->client->lock));
3086 socket_write(package->client, &rep, sizeof rep);
3087 pthread_mutex_unlock(&(package->client->lock));
3089 package_dispose(package);
/**
 * Transmission-phase loop for one client.
 *
 * Each iteration allocates a request, reads it from the socket and,
 * when a transaction log is open, appends the raw request to the log
 * under the log semaphore. The fields are then converted to host byte
 * order, the magic is verified (err() on mismatch) and a work package
 * is created. For writes the payload is taken either by splicing from
 * the socket into the package's pipe (F_SPLICE set, length within
 * MAX_PIPE_SIZE, pipe available, no TLS) or by socket_read() into the
 * package buffer. NBD_CMD_DISC finalises the client and disposes of the
 * package; every other request is pushed to the thread pool.
 */
3092 static int mainloop_threaded(CLIENT* client) {
3093 struct nbd_request* req;
3094 struct work_package* pkg;
3095 int write_data = false;
3097 DEBUG("Entering request loop\n");
3099 req = calloc(sizeof (struct nbd_request), 1);
3101 socket_read(client, req, sizeof(struct nbd_request));
3103 if(client->transactionlogfd != -1) {
3104 lock_logsem(client);
/* The request is logged before byte-swapping, i.e. in wire format. */
3105 writeit(client->transactionlogfd, req, sizeof(struct nbd_request));
3106 if(((ntohl(req->type) & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) &&
3107 (client->server->flags & F_DATALOG) &&
3108 !(client->server->flags & F_SPLICE)) {
3112 unlock_logsem(client);
3116 req->from = ntohll(req->from);
3117 req->type = ntohl(req->type);
3118 req->len = ntohl(req->len);
/* magic is the one field left in network order, hence the htonl() on the
 * constant. */
3120 if(req->magic != htonl(NBD_REQUEST_MAGIC))
3121 err("Protocol error: not enough magic.");
3123 pkg = package_create(client, req);
3125 if((req->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
3127 if ((client->server->flags & F_SPLICE) &&
3128 (req->len <= MAX_PIPE_SIZE && pkg->pipefd[1] > 0) &&
3129 (client->tls_session == NULL))
3130 spliceit(client->net, NULL, pkg->pipefd[1],
3134 socket_read(client, pkg->data, req->len);
3137 writeit(client->transactionlogfd, pkg->data, req->len);
3138 unlock_logsem(client);
3142 if(req->type == NBD_CMD_DISC) {
3143 finalize_client(client);
3144 package_dispose(pkg);
3147 g_thread_pool_push(tpool, pkg, NULL);
3153 * @param data a pointer to pid_t which should be freed
/* Value-destroy callback of the `children` hash table (registered in
 * setup_servers() through g_hash_table_new_full()). The body is not
 * visible in this listing. */
3155 void destroy_pid_t(gpointer data) {
3159 static pid_t spawn_child(int* socket) {
3165 sigemptyset(&newset);
3166 sigaddset(&newset, SIGCHLD);
3167 sigaddset(&newset, SIGTERM);
3168 sigprocmask(SIG_BLOCK, &newset, &oldset);
3169 socketpair(AF_UNIX, SOCK_STREAM, 0, sockets);
3172 msg(LOG_ERR, "Could not fork (%s)", strerror(errno));
3177 if (pid > 0) { /* Parent */
3180 pidp = g_malloc(sizeof(pid_t));
3182 *socket = sockets[1];
3184 g_hash_table_insert(children, pidp, pidp);
3188 *socket = sockets[0];
3190 /* Child's signal disposition is reset to default. */
3191 signal(SIGCHLD, SIG_DFL);
3192 signal(SIGTERM, SIG_DFL);
3193 signal(SIGHUP, SIG_DFL);
3194 sigemptyset(&oldset);
3196 sigprocmask(SIG_SETMASK, &oldset, NULL);
/* Accept one pending connection on the listening socket `sock`. The
 * peer address is received into a sockaddr_storage. A failed accept()
 * is reported through err_nonfatal(). The return statements are not
 * visible in this listing. */
3201 socket_accept(const int sock)
3203 struct sockaddr_storage addrin;
3204 socklen_t addrinlen = sizeof(addrin);
3207 net = accept(sock, (struct sockaddr *) &addrin, &addrinlen);
3209 err_nonfatal("Failed to accept socket connection: %m");
/* Accept a connection on `sock` and serve it in a freshly spawned child.
 *
 * Parent side: logs the spawn and records the communication socket in
 * childsocks, or logs the failure to spawn.
 *
 * Child side: creates the request thread pool (genconf->threads
 * workers), switches the accepted socket to blocking mode when
 * O_NONBLOCK was set, runs negotiate(), releases the resources only the
 * root server needs (children table, listening sockets, server array)
 * and enters mainloop_threaded(). */
3216 handle_modern_connection(GArray *const servers, const int sock, struct generic_conf *genconf)
3220 CLIENT *client = NULL;
3224 net = socket_accept(sock);
3229 pid = spawn_child(&commsocket);
3232 msg(LOG_INFO, "Spawned a child process");
3233 g_array_append_val(childsocks, commsocket);
3236 msg(LOG_ERR, "Failed to spawn a child process");
3240 /* Child just continues. */
3242 tpool = g_thread_pool_new(handle_request, NULL, genconf->threads, FALSE, NULL);
3244 sock_flags_old = fcntl(net, F_GETFL, 0);
3245 if (sock_flags_old == -1) {
3246 msg(LOG_ERR, "Failed to get socket flags");
/* F_SETFL is only issued when clearing O_NONBLOCK actually changes the
 * flags. */
3250 sock_flags_new = sock_flags_old & ~O_NONBLOCK;
3251 if (sock_flags_new != sock_flags_old &&
3252 fcntl(net, F_SETFL, sock_flags_new) == -1) {
3253 msg(LOG_ERR, "Failed to set socket to blocking mode");
3257 client = negotiate(net, servers, genconf);
3259 msg(LOG_ERR, "Modern initial negotiation failed");
3266 /* Free all root server resources here, because we are
3267 * currently in the child process serving one specific
3268 * connection. These are not simply needed anymore. */
3269 g_hash_table_destroy(children);
3271 for (i = 0; i < modernsocks->len; i++) {
3272 close(g_array_index(modernsocks, int, i));
3274 g_array_free(modernsocks, TRUE);
3276 /* Now that we are in the child process after a
3277 * successful negotiation, we do not need the list of
3278 * servers anymore, get rid of it.*/
3279 g_array_free(servers, FALSE);
3282 msg(LOG_INFO, "Starting to serve");
3283 mainloop_threaded(client);
3295 static int handle_childname(GArray* servers, int socket)
3298 _cleanup_g_free_ char *buf = NULL;
3301 while(rt < sizeof(len)) {
3302 switch((r = read(socket, &len, sizeof len))) {
3306 err_nonfatal("Error reading from acl socket: %m");
3313 if (len >= ULONG_MAX - 1) {
3314 err_nonfatal("Value out of range");
3317 buf = g_malloc0(len + 1);
3318 readit(socket, buf, len);
3320 for(i=0; i<servers->len; i++) {
3321 SERVER* srv = g_array_index(servers, SERVER*, i);
3322 if(strcmp(srv->servename, buf) == 0) {
3323 if(srv->max_connections == 0 || srv->max_connections > srv->numclients) {
3324 writeit(socket, "Y", 1);
3327 writeit(socket, "N", 1);
3332 writeit(socket, "X", 1);
3338 * Return the index of the server whose servename matches the given
3341 * @param servename a string to match
3342 * @param servers an array of servers
3343 * @return the first index of the server whose servename matches the
3344 * given name or -1 if one cannot be found
/* Linear search of `servers` for an entry whose servename equals
 * `servename` (compared with strcmp()). */
3346 static int get_index_by_servename(const gchar *const servename,
3347 const GArray *const servers) {
3350 for (i = 0; i < servers->len; ++i) {
3351 const SERVER* server = g_array_index(servers, SERVER*, i);
3353 if (strcmp(servename, server->servename) == 0)
3361 * Parse configuration files and add servers to the array if they don't
3362 * already exist there. The existence is tested by comparing
3363 * servenames. A server is appended to the array only if its servename
3364 * is unique among all other servers.
3366 * @param servers an array of servers
3367 * @param genconf a pointer to generic configuration
3368 * @return the number of new servers appended to the array, or -1 in
/* Re-parse the configuration and append exports that are not known yet.
 *
 * parse_cfile() produces a fresh server array; the thread pool size is
 * updated from genconf->threads when a pool exists. Every parsed server
 * that has a servename for which get_index_by_servename() returns -1
 * gets its reference count raised and is appended to `servers`. The
 * result is the growth of `servers`; the temporary array is freed
 * afterwards. */
3371 static int append_new_servers(GArray *const servers, struct generic_conf *genconf, GError **const gerror) {
3373 GArray *new_servers;
3374 const int old_len = servers->len;
3377 new_servers = parse_cfile(config_file_pos, genconf, true, gerror);
3378 if(tpool) g_thread_pool_set_max_threads(tpool, genconf->threads, NULL);
3382 for(i = 0; i < new_servers->len; ++i) {
3383 SERVER *new_server = g_array_index(new_servers, SERVER*, i);
3385 if (new_server->servename
3386 && -1 == get_index_by_servename(new_server->servename,
/* The appended array shares the SERVER object, so it needs its own
 * reference. */
3388 serve_inc_ref(new_server);
3389 g_array_append_val(servers, new_server);
3393 retval = servers->len - old_len;
3395 g_array_free(new_servers, TRUE);
/* Main loop of the root server process (declared G_GNUC_NORETURN).
 *
 * Setup: builds the master fd_set from the listening sockets in
 * modernsocks and blocks SIGCHLD, SIGHUP and SIGTERM, remembering the
 * original mask for pselect().
 *
 * Each iteration: reacts to the flags set by the signal handlers
 * (SIGTERM: signal all children; SIGCHLD: reap exited children and drop
 * them from the `children` table; SIGHUP: append newly configured
 * exports), then waits in pselect() on the listening sockets plus the
 * per-child communication sockets. Readable listening sockets are
 * passed to handle_modern_connection(), readable child sockets to
 * handle_childname(); a child socket is removed from childsocks when
 * handle_childname() returns a negative value. */
3400 void serveloop(GArray* servers, struct generic_conf *genconf) G_GNUC_NORETURN;
3402 * Loop through the available servers, and serve them. Never returns.
3404 void serveloop(GArray* servers, struct generic_conf *genconf) {
3409 sigset_t blocking_mask;
3410 sigset_t original_mask;
3413 * Set up the master fd_set. The set of descriptors we need
3414 * to select() for never changes anyway and it buys us a *lot*
3415 * of time to only build this once. However, if we ever choose
3416 * to not fork() for clients anymore, we may have to revisit
3421 for(i=0;i<modernsocks->len;i++) {
3422 int sock = g_array_index(modernsocks, int, i);
3423 FD_SET(sock, &mset);
3424 mmax=sock>mmax?sock:mmax;
3427 /* Construct a signal mask which is used to make signal testing and
3428 * receiving an atomic operation to ensure no signal is received between
3429 * tests and blocking pselect(). */
3430 if (sigemptyset(&blocking_mask) == -1)
3431 err("failed to initialize blocking_mask: %m");
3433 if (sigaddset(&blocking_mask, SIGCHLD) == -1)
3434 err("failed to add SIGCHLD to blocking_mask: %m");
3436 if (sigaddset(&blocking_mask, SIGHUP) == -1)
3437 err("failed to add SIGHUP to blocking_mask: %m");
3439 if (sigaddset(&blocking_mask, SIGTERM) == -1)
3440 err("failed to add SIGTERM to blocking_mask: %m");
3442 if (sigprocmask(SIG_BLOCK, &blocking_mask, &original_mask) == -1)
3443 err("failed to block signals: %m");
3446 if (is_sigterm_caught) {
3447 is_sigterm_caught = 0;
3449 g_hash_table_foreach(children, killchild, NULL);
3455 if (is_sigchld_caught) {
3460 is_sigchld_caught = 0;
/* WNOHANG: collect every child that has already exited without
 * blocking. */
3462 while ((pid=waitpid(-1, &status, WNOHANG)) > 0) {
3463 if (WIFEXITED(status)) {
3464 msg(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
3466 i = g_hash_table_lookup(children, &pid);
3468 msg(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
3470 DEBUG("Removing %d from the list of children", pid);
3471 g_hash_table_remove(children, &pid);
3476 /* SIGHUP causes the root server process to reconfigure
3477 * itself and add new export servers for each newly
3478 * found export configuration group, i.e. spawn new
3479 * server processes for each previously non-existent
3480 * export. This does not alter old runtime configuration
3481 * but just appends new exports. */
3482 if (is_sighup_caught) {
3484 GError *gerror = NULL;
3486 msg(LOG_INFO, "reconfiguration request received");
3487 is_sighup_caught = 0; /* Reset to allow catching
3490 n = append_new_servers(servers, genconf, &gerror);
3492 msg(LOG_ERR, "failed to append new servers: %s",
3495 for (i = servers->len - n; i < servers->len; ++i) {
3496 const SERVER *server = g_array_index(servers,
3499 msg(LOG_INFO, "reconfigured new server: %s",
3504 memcpy(&rset, &mset, sizeof(fd_set));
3506 for(i=0;i<childsocks->len;i++) {
3507 int sock = g_array_index(childsocks, int, i);
3508 FD_SET(sock, &rset);
3509 max=sock>max?sock:max;
3512 if (pselect(max + 1, &rset, NULL, NULL, NULL, &original_mask) > 0) {
3514 for(i=0; i < modernsocks->len; i++) {
3515 int sock = g_array_index(modernsocks, int, i);
3516 if(!FD_ISSET(sock, &rset)) {
3520 handle_modern_connection(servers, sock, genconf);
3522 for(i=0; i < childsocks->len; i++) {
3523 int sock = g_array_index(childsocks, int, i);
3525 if(FD_ISSET(sock, &rset)) {
3526 if(handle_childname(servers, sock) < 0) {
3528 g_array_remove_index(childsocks, i);
3537 * Set server socket options.
3539 * @param socket a socket descriptor of the server
3541 * @param gerror a pointer to an error object pointer used for reporting
3542 * errors. On error, if gerror is not NULL, *gerror is set and -1
3545 * @return 0 on success, -1 on error
/* Apply the server's standard options to a listening socket:
 * SO_REUSEADDR, SO_LINGER and SO_KEEPALIVE, in that order. Each
 * failing setsockopt() records its own NBDS_ERR_* code and a message in
 * *gerror. */
3547 int dosockopts(const int socket, GError **const gerror) {
3555 /* lose the pesky "Address already in use" error message */
3556 if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int)) == -1) {
3557 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_REUSEADDR,
3558 "failed to set socket option SO_REUSEADDR: %s",
3564 if (setsockopt(socket,SOL_SOCKET,SO_LINGER,&l,sizeof(l)) == -1) {
3565 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_LINGER,
3566 "failed to set socket option SO_LINGER: %s",
3570 if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(int)) == -1) {
3571 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_KEEPALIVE,
3572 "failed to set socket option SO_KEEPALIVE: %s",
/* Create a listening UNIX-domain stream socket bound to `sockname` and
 * append it to modernsocks.
 *
 * Failures of socket(), bind() and listen() are reported through
 * *gerror. Callers treat a return of -1 as failure. The cleanup
 * condition at the end tests for a negative retval with an open
 * socket. */
3580 int open_unix(const gchar *const sockname, GError **const gerror) {
3581 struct sockaddr_un sa;
3585 memset(&sa, 0, sizeof(struct sockaddr_un));
3586 sa.sun_family = AF_UNIX;
/* strncpy() does not terminate on truncation, so the last byte is forced
 * to NUL on the next line; an over-long path is silently truncated. */
3587 strncpy(sa.sun_path, sockname, sizeof sa.sun_path);
3588 sa.sun_path[sizeof(sa.sun_path)-1] = '\0';
3589 sock = socket(AF_UNIX, SOCK_STREAM, 0);
3591 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SOCKET,
3592 "failed to open a unix socket: "
3593 "failed to create socket: %s",
3597 if(bind(sock, (struct sockaddr*)&sa, sizeof(struct sockaddr_un))<0) {
3598 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3599 "failed to open a unix socket: "
3600 "failed to bind to address %s: %s",
3601 sockname, strerror(errno));
3604 if(listen(sock, 10)<0) {
3605 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3606 "failed to open a unix socket: "
3607 "failed to start listening: %s",
3612 g_array_append_val(modernsocks, sock);
3614 if(retval<0 && sock >= 0) {
/* Open listening TCP sockets for the newstyle protocol.
 *
 * `addr` is a list of addresses separated by commas, spaces or tabs; an
 * empty or NULL value means "::, 0.0.0.0". `port` defaults to
 * NBD_DEFAULT_PORT. Every address is resolved with getaddrinfo()
 * (passive, stream, TCP, any family). For the resolved entries a socket
 * is created, configured through dosockopts(), bound and put into
 * listening state, then appended to modernsocks.
 *
 * A resolution failure is reported only for the last address and only
 * when no socket has been opened so far. The EADDRINUSE test on bind()
 * takes a separate branch once at least one socket is already open.
 * Other failures are reported through *gerror. Callers treat a return
 * of -1 as failure. */
3621 int open_modern(const gchar *const addr, const gchar *const port,
3622 GError **const gerror) {
3623 struct addrinfo hints;
3624 struct addrinfo* ai = NULL;
3625 struct addrinfo* ai_bak = NULL;
3630 _cleanup_(g_strfreevp) gchar** addrs;
3631 gchar const* l_addr = addr;
3633 if(!addr || strlen(addr) == 0) {
3634 l_addr = "::, 0.0.0.0";
3637 addrs = g_strsplit_set(l_addr, ", \t", -1);
3639 for(int i=0; addrs[i]!=NULL; i++) {
/* Splitting on a set of separators yields empty strings between adjacent
 * separators. */
3640 if(addrs[i][0] == '\0') {
3643 memset(&hints, '\0', sizeof(hints));
3644 hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
3645 hints.ai_socktype = SOCK_STREAM;
3646 hints.ai_family = AF_UNSPEC;
3647 hints.ai_protocol = IPPROTO_TCP;
3648 e = getaddrinfo(addrs[i], port ? port : NBD_DEFAULT_PORT, &hints, &ai);
3650 if(e != 0 && addrs[i+1] == NULL && modernsocks->len == 0) {
3651 g_set_error(gerror, NBDS_ERR, NBDS_ERR_GAI,
3652 "failed to open a modern socket: "
3653 "failed to get address info: %s",
3659 sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
3661 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SOCKET,
3662 "failed to open a modern socket: "
3663 "failed to create a socket: %s",
3668 if (dosockopts(sock, gerror) == -1) {
3669 g_prefix_error(gerror, "failed to open a modern socket: ");
3673 if(bind(sock, ai->ai_addr, ai->ai_addrlen)) {
3675 * Some systems will return multiple entries for the
3676 * same address when we ask it for something
3677 * AF_UNSPEC, even though the first entry will
3678 * listen to both protocols. Other systems will
3679 * return multiple entries too, but we actually
3680 * do need to open both.
3682 * Handle this by ignoring EADDRINUSE if we've
3683 * already got at least one socket open
3685 if(errno == EADDRINUSE && modernsocks->len > 0) {
3688 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3689 "failed to open a modern socket: "
3690 "failed to bind an address to a socket: %s",
3695 if(listen(sock, 10) <0) {
3696 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3697 "failed to open a modern socket: "
3698 "failed to start listening on a socket: %s",
3702 g_array_append_val(modernsocks, sock);
3707 freeaddrinfo(ai_bak);
3715 if (retval == -1 && sock >= 0) {
3719 freeaddrinfo(ai_bak);
3725 * Connect our servers.
/**
 * Open the listening sockets, create the table of child processes and
 * install the handlers for SIGCHLD, SIGTERM, SIGHUP and SIGUSR1.
 *
 * @param servers not referenced in the visible lines of this function.
 * @param modernaddr address list passed on to open_modern().
 * @param modernport port passed on to open_modern().
 * @param unixsock name passed on to open_unix(); NULL means that no unix
 *        socket is requested.
 * @param flags if F_DUAL_LISTEN is set, the TCP sockets are opened even
 *        though a unix socket was requested.
 */
3727 void setup_servers(GArray *const servers, const gchar *const modernaddr,
3728 const gchar *const modernport, const gchar* unixsock,
3729 const gint flags ) {
3730 struct sigaction sa;
3732 if(unixsock != NULL) {
3733 GError* gerror = NULL;
3734 if(open_unix(unixsock, &gerror) == -1) {
3735 msg(LOG_ERR, "failed to setup servers: %s",
3737 g_clear_error(&gerror);
/* TCP sockets are opened when no unix socket was requested, or when both
 * kinds were asked for explicitly through F_DUAL_LISTEN. */
3741 if (((flags & F_DUAL_LISTEN) != 0) || (unixsock == NULL)) {
3742 GError *gerror = NULL;
3743 if (open_modern(modernaddr, modernport, &gerror) == -1) {
3744 msg(LOG_ERR, "failed to setup servers: %s",
3746 g_clear_error(&gerror);
/* Keys are hashed and compared as pointed-to ints and have no destructor;
 * values are released with destroy_pid_t. */
3750 children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
/* SIGCHLD: SIGTERM is blocked while the handler runs. SA_RESTART makes
 * interrupted system calls restart instead of failing with EINTR. */
3752 sa.sa_handler = sigchld_handler;
3753 sigemptyset(&sa.sa_mask);
3754 sigaddset(&sa.sa_mask, SIGTERM);
3755 sa.sa_flags = SA_RESTART;
3756 if(sigaction(SIGCHLD, &sa, NULL) == -1)
3757 err("sigaction: %m");
/* SIGTERM: the mirror image, with SIGCHLD blocked while the handler runs. */
3759 sa.sa_handler = sigterm_handler;
3760 sigemptyset(&sa.sa_mask);
3761 sigaddset(&sa.sa_mask, SIGCHLD);
3762 sa.sa_flags = SA_RESTART;
3763 if(sigaction(SIGTERM, &sa, NULL) == -1)
3764 err("sigaction: %m");
/* SIGHUP: no additional signals are blocked during the handler. */
3766 sa.sa_handler = sighup_handler;
3767 sigemptyset(&sa.sa_mask);
3768 sa.sa_flags = SA_RESTART;
3769 if(sigaction(SIGHUP, &sa, NULL) == -1)
3770 err("sigaction: %m");
/* SIGUSR1: no additional signals are blocked during the handler. */
3772 sa.sa_handler = sigusr1_handler;
3773 sigemptyset(&sa.sa_mask);
3774 sa.sa_flags = SA_RESTART;
3775 if(sigaction(SIGUSR1, &sa, NULL) == -1)
3776 err("sigaction: %m");
3780 * Go daemon (unless we specified at compile time that we didn't want this)
3781 * @param serve the first server of our configuration. If its port is zero,
3782 * then do not daemonize, because we're doing inetd then. This parameter
3783 * is only used to create a PID file of the form
3784 * /var/run/nbd-server.<port>.pid; it's not modified in any way.
/* The daemonizing implementation is only compiled when NODAEMON is not
 * defined. Its visible lines write the PID of the calling process to a PID
 * file. */
3786 #if !defined(NODAEMON)
/* Default PID file location. The literal is shorter than 255 bytes, so
 * strncpy() also stores the terminating NUL and zero-pads the remainder;
 * NOTE(review): this requires pidfname to hold at least 255 bytes -- its
 * declaration is not visible here. */
3794 strncpy(pidfname, "/var/run/nbd-server.pid", 255);
3796 pidf=fopen(pidfname, "w");
/* The PID is written as a decimal number followed by a newline. */
3798 fprintf(pidf,"%d\n", (int)getpid());
/* NOTE(review): this diagnostic has no trailing newline. */
3802 fprintf(stderr, "Not fatal; continuing");
/* With NODAEMON defined, every daemonize(...) call expands to nothing. */
3806 #define daemonize(serve)
3807 #endif /* !defined(NODAEMON) */
3810 * Everything beyond this point (in the file) is run in non-daemon mode.
3811 * The stuff above daemonize() isn't.
3815 * Set up user-ID and/or group-ID
/**
 * Switch the process to the given group and user.
 *
 * The group is changed before the user, and the supplementary group list
 * is emptied before setuid() is called; once the user ID has been given
 * up, the process may no longer be allowed to change its groups.
 *
 * @param username name looked up with getpwnam(); its UID is passed to
 *        setuid().
 * @param groupname name looked up with getgrnam(); its GID is passed to
 *        setgid().
 *
 * NOTE(review): the conditions guarding the two lookups are not visible
 * in this excerpt.
 */
3817 void dousers(const gchar *const username, const gchar *const groupname) {
3822 gr = getgrnam(groupname);
/* Message for a group name that could not be resolved. */
3824 str = g_strdup_printf("Invalid group name: %s", groupname);
3827 if(setgid(gr->gr_gid)<0) {
3828 err("Could not set GID: %m");
3832 pw = getpwnam(username);
/* Message for a user name that could not be resolved. */
3834 str = g_strdup_printf("Invalid user name: %s", username);
/* Drop all supplementary groups while the process is still allowed to. */
3837 if (setgroups(0, NULL)<0) {
3838 err("Could not set groups: %m");
3840 if(setuid(pw->pw_uid)<0) {
3841 err("Could not set UID: %m");
/**
 * Forward a GLib log message to syslog().
 *
 * @param log_domain GLib log domain of the message; not referenced in the
 *        visible lines.
 * @param log_level GLib level of the message; NOTE(review): presumably the
 *        value the case labels below are matched against -- the switch
 *        statement itself is not visible here.
 * @param message text handed to syslog().
 */
3847 void glib_message_syslog_redirect(const gchar *log_domain,
3848 GLogLevelFlags log_level,
3849 const gchar *message,
/* syslog priority used unless one of the cases below replaces it. */
3852 int level=LOG_DEBUG;
/* The GLib levels are handled in four groups; the syslog priority assigned
 * to each group is not visible in this excerpt. */
3856 case G_LOG_FLAG_FATAL:
3857 case G_LOG_LEVEL_CRITICAL:
3858 case G_LOG_LEVEL_ERROR:
3861 case G_LOG_LEVEL_WARNING:
3864 case G_LOG_LEVEL_MESSAGE:
3865 case G_LOG_LEVEL_INFO:
3868 case G_LOG_LEVEL_DEBUG:
/* The message is passed as an argument to "%s", never as the format string,
 * so '%' characters in it are printed literally. */
3874 syslog(level, "%s", message);
3879 * Main entry point...
/**
 * Program entry point: read the command line and the configuration file,
 * open the listening sockets, switch user/group, and then either serve a
 * single connection on file descriptor 0 or enter the server loop.
 *
 * @param argc number of command line arguments, passed on to cmdline().
 * @param argv command line arguments, passed on to cmdline().
 */
3881 int main(int argc, char *argv[]) {
3885 struct generic_conf genconf;
3887 memset(&genconf, 0, sizeof(struct generic_conf));
/* Refuse to run when the compiler laid out struct nbd_request at any size
 * other than 28 bytes. */
3889 if (sizeof( struct nbd_request )!=28) {
3890 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
3891 exit(EXIT_FAILURE) ;
/* Global arrays of int elements, created without zero-termination and
 * without clearing of new elements. */
3894 modernsocks = g_array_new(FALSE, FALSE, sizeof(int));
3895 childsocks = g_array_new(FALSE, FALSE, sizeof(int));
/* Start from the compiled-in configuration file name CFILE. */
3898 config_file_pos = g_strdup(CFILE);
3899 serve=cmdline(argc, argv, &genconf);
/* Thread count in effect before the configuration file is parsed. */
3901 genconf.threads = 4;
3902 servers = parse_cfile(config_file_pos, &genconf, true, &gerr);
3904 /* Update global variables with parsed values. This will be
3905 * removed once we get rid of global configuration variables. */
3906 glob_flags |= genconf.flags;
/* The export returned by cmdline() joins the ones from the configuration
 * file. */
3909 g_array_append_val(servers, serve);
/* A parse error is only worth a warning when it is something other than
 * "configuration file not found". */
3912 if(!servers || !servers->len) {
3913 if(gerr && !(gerr->domain == NBDS_ERR
3914 && gerr->code == NBDS_ERR_CFILE_NOTFOUND)) {
3915 g_warning("Could not parse config file: %s", gerr->message);
3919 g_warning("Specifying an export on the command line no longer uses the oldstyle protocol.");
3922 if((!serve) && (!servers||!servers->len)) {
3924 g_message("No configured exports; quitting.");
/* The sockets are opened before dousers() changes the user and group. */
3930 setup_servers(servers, genconf.modernaddr, genconf.modernport,
3931 genconf.unixsock, genconf.flags);
3932 dousers(genconf.user, genconf.group);
/* GnuTLS setup: global initialisation and generation of Diffie-Hellman
 * parameters. NOTE(review): none of the return values of these calls is
 * checked in the visible lines. */
3935 gnutls_global_init();
3936 static gnutls_dh_params_t dh_params;
3937 gnutls_dh_params_init(&dh_params);
3938 gnutls_dh_params_generate2(dh_params,
3939 gnutls_sec_param_to_pk_bits(GNUTLS_PK_DH,
3940 // Renamed in GnuTLS 3.3
3941 #if GNUTLS_VERSION_NUMBER >= 0x030300
3942 GNUTLS_SEC_PARAM_MEDIUM
3944 GNUTLS_SEC_PARAM_NORMAL
/* A port of "0" selects inetd mode: the connection is negotiated on file
 * descriptor 0 and served by a pool of genconf.threads request threads. */
3949 if((genconf.modernport != NULL) && strcmp(genconf.modernport, "0")==0) {
3951 err("inetd mode requires syslog");
3953 CLIENT* client = negotiate(0, servers, &genconf);
3957 tpool = g_thread_pool_new(handle_request, NULL, genconf.threads, FALSE, NULL);
3958 mainloop_threaded(client);
/* NOTE(review): presumably reached only when the inetd branch above is not
 * taken -- the closing lines of that branch are not visible here. */
3962 serveloop(servers, &genconf);