Revert "Fix memory leak"
[platform/upstream/nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanx to Peter T. Breuer" <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (usefull for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61 #define _DEFAULT_SOURCE
62 #define _XOPEN_SOURCE 500 /* to get pread/pwrite */
63 #if NEED_BSD_SOURCE
64 #define _BSD_SOURCE /* to get DT_* macros on some platforms */
65 #endif
66 #define _DARWIN_C_SOURCE /* to get DT_* macros on OS X */
67
68 #include <assert.h>
69 #include <sys/types.h>
70 #include <sys/socket.h>
71 #include <sys/stat.h>
72 #include <sys/select.h>
73 #include <sys/wait.h>
74 #include <sys/un.h>
75 #ifdef HAVE_SYS_IOCTL_H
76 #include <sys/ioctl.h>
77 #endif
78 #ifdef HAVE_SYS_UIO_H
79 #include <sys/uio.h>
80 #endif
81 #include <sys/param.h>
82 #include <signal.h>
83 #include <errno.h>
84 #include <libgen.h>
85 #include <netinet/tcp.h>
86 #include <netinet/in.h>
87 #include <netdb.h>
88 #include <syslog.h>
89 #include <unistd.h>
90 #include <stdbool.h>
91 #include <stdio.h>
92 #include <stdlib.h>
93 #include <string.h>
94 #include <fcntl.h>
95 #if HAVE_FALLOC_PH
96 #include <linux/falloc.h>
97 #endif
98 #if HAVE_BLKDISCARD
99 #include <linux/fs.h>
100 #endif
101 #include <arpa/inet.h>
102 #include <strings.h>
103 #include <dirent.h>
104 #ifdef HAVE_SYS_DIR_H
105 #include <sys/dir.h>
106 #endif
107 #ifdef HAVE_SYS_DIRENT_H
108 #include <sys/dirent.h>
109 #endif
110 #include <getopt.h>
111 #include <pwd.h>
112 #include <grp.h>
113 #include <dirent.h>
114 #include <ctype.h>
115 #include <inttypes.h>
116
117 #include <glib.h>
118
119 /* used in cliserv.h, so must come first */
120 #define MY_NAME "nbd_server"
121 #include "cliserv.h"
122 #include "nbd-debug.h"
123 #include "netdb-compat.h"
124 #include "backend.h"
125 #include "treefiles.h"
126 #include "nbd-helper.h"
127
128 #ifdef WITH_SDP
129 #include <sdp_inet.h>
130 #endif
131
132 #if HAVE_FSCTL_SET_ZERO_DATA
133 #include <io.h>
134 /* don't include <windows.h> to avoid redefining eg the ERROR macro */
135 #define NOMINMAX 1
136 #include <windef.h>
137 #include <winbase.h>
138 #include <winioctl.h>
139 #endif
140
141 /** Default position of the config file */
142 #ifndef SYSCONFDIR
143 #define SYSCONFDIR "/etc"
144 #endif
145 #define CFILE SYSCONFDIR "/nbd-server/config"
146
147 #if HAVE_GNUTLS
148 #include <gnutls/gnutls.h>
149 #include <gnutls/x509.h>
150 #endif
151
152 #ifndef HAVE_G_MEMDUP2
153 /* Our uses of g_memdup2 below are safe from g_memdup's 32-bit overflow */
154 #define g_memdup2 g_memdup
155 #endif
156
157 /*
158  * Shorten error handling and regular function return sequences
159  * automatically freeing dynamically allocated resources
160  */
#define _cleanup_(x) __attribute__((__cleanup__(x)))
/*
 * Cleanup handler behind _cleanup_g_free_. The argument is the address of
 * the annotated pointer variable, so it is dereferenced once before the
 * stored pointer is handed to g_free().
 */
static inline void g_freep(void *p) {
	void **slot = (void **) p;

	g_free(*slot);
}
#define _cleanup_g_free_ _cleanup_(g_freep)
166 #define DEFINE_TRIVIAL_CLEANUP_FUNC(type, func) \
167         static inline void func##p(type *p) {   \
168                 if (*p)                         \
169                         func(*p);               \
170         }
171 DEFINE_TRIVIAL_CLEANUP_FUNC(GKeyFile*, g_key_file_free)
172 DEFINE_TRIVIAL_CLEANUP_FUNC(gchar **, g_strfreev)
173
174 /** Where our config file actually is */
175 gchar* config_file_pos;
176
177 /** global flags */
178 int glob_flags=0;
179
180 /* Whether we should avoid daemonizing the main process */
181 int nodaemon = 0;
182
183 /* Whether we should avoid forking into child processes */
184 int dontfork = 0;
185
/**
 * The highest value a variable of type off_t can reach. off_t is a signed
 * integer, so this is every bit set except for the leftmost (sign) bit.
 *
 * The value is built in unsigned arithmetic, because shifting a signed 1
 * into the sign bit is undefined behaviour, and the whole expansion is
 * parenthesized so it can be used safely inside any expression.
 **/
#define OFFT_MAX ((off_t)((1ULL << (sizeof(off_t) * 8 - 1)) - 1))
191 #define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
192 #define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
193
194 /** Global flags: */
195 #define F_OLDSTYLE 1      /**< Allow oldstyle (port-based) exports */
196 #define F_LIST 2          /**< Allow clients to list the exports on a server */
197 #define F_NO_ZEROES 4     /**< Do not send zeros to client */
198 #define F_DUAL_LISTEN 8   /**< Listen on both TCP and unix socket */
199 // also accepts F_FORCEDTLS (which is 16384)
200 GHashTable *children;
201 char pidfname[256]; /**< name of our PID file */
202 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
203
204 #define NEG_INIT        (1 << 0)
205 #define NEG_OLD         (1 << 1)
206 #define NEG_MODERN      (1 << 2)
207
208 /*
209  * If we want what the system really has set we'd have to read
210  * /proc/sys/fs/pipe-max-size, but for now 1mb should be enough.
211  */
212 #define MAX_PIPE_SIZE (1 * 1024 * 1024)
213 #define SPLICE_IN       0
214 #define SPLICE_OUT      1
215
216 #include <nbdsrv.h>
217
218 /* Our thread pool */
219 GThreadPool *tpool = NULL;
220
221 /* A work package for the thread pool functions */
222 struct work_package {
223         CLIENT* client;
224         struct nbd_request* req;
225         int pipefd[2];
226         void* data; /**< for write requests */
227 };
228
/** Flag set by the SIGCHLD handler to mark a child exit */
static volatile sig_atomic_t is_sigchld_caught;

/** Flag set by the SIGTERM handler to mark an exit request */
static volatile sig_atomic_t is_sigterm_caught;

/** Flag set by the SIGHUP handler to mark a reconfiguration request */
static volatile sig_atomic_t is_sighup_caught;
243
244 GArray* modernsocks;      /**< Sockets for the modern handler. Not used
245                                if a client was only specified on the
246                                command line; only port used if
247                                oldstyle is set to false (and then the
248                                command-line client isn't used, gna gna).
249                                This may be more than one socket on
250                                systems that don't support serving IPv4
251                                and IPv6 from the same socket (like,
252                                e.g., FreeBSD) */
253 GArray* childsocks;     /**< parent-side sockets for communication with children */
254 int commsocket;         /**< child-side socket for communication with parent */
255 static sem_t file_wait_sem;
256
257 bool logged_oversized=false;  /**< whether we logged oversized requests already */
258
/**
 * Type of a configuration file value
 **/
typedef enum {
	PARAM_INT,              /**< This parameter is an integer */
	PARAM_INT64,            /**< This parameter is a 64-bit integer */
	PARAM_STRING,           /**< This parameter is a string */
	PARAM_BOOL,             /**< This parameter is a boolean */
} PARAM_TYPE;
268
269 /**
270  * Configuration file values
271  **/
272 typedef struct {
273         gchar *paramname;       /**< Name of the parameter, as it appears in
274                                   the config file */
275         gboolean required;      /**< Whether this is a required (as opposed to
276                                   optional) parameter */
277         PARAM_TYPE ptype;       /**< Type of the parameter. */
278         gpointer target;        /**< Pointer to where the data of this
279                                   parameter should be written. If ptype is
280                                   PARAM_BOOL, the data is or'ed rather than
281                                   overwritten. */
282         gint flagval;           /**< Flag mask for this parameter in case ptype
283                                   is PARAM_BOOL. */
284 } PARAM;
285
286 /**
287  * Configuration file values of the "generic" section
288  **/
289 struct generic_conf {
290         gchar *user;            /**< user we run the server as    */
291         gchar *group;           /**< group we run running as      */
292         gchar *modernaddr;      /**< address of the modern socket */
293         gchar *modernport;      /**< port of the modern socket    */
294         gchar *unixsock;        /**< file name of the unix domain socket */
295         gchar *certfile;        /**< certificate file             */
296         gchar *keyfile;         /**< key file                     */
297         gchar *cacertfile;      /**< CA certificate file          */
298         gchar *tlsprio;         /**< TLS priority string          */
299         gint flags;             /**< global flags                 */
300         gint threads;           /**< maximum number of parallel threads we want to run */
301 };
302
303 #if HAVE_GNUTLS
304 static int writeit_tls(gnutls_session_t s, void *buf, size_t len) {
305         _cleanup_g_free_ char *m = NULL;
306         ssize_t res;
307         while(len > 0) {
308                 DEBUG("+");
309                 if ((res = gnutls_record_send(s, buf, len)) < 0 && !gnutls_error_is_fatal(res)) {
310                         m = g_strdup_printf("issue while sending data: %s", gnutls_strerror(res));
311                         err_nonfatal(m);
312                 } else if(res < 0) {
313                         m = g_strdup_printf("could not send data: %s", gnutls_strerror(res));
314                         err_nonfatal(m);
315                         return -1;
316                 } else {
317                         len -= res;
318                         buf += res;
319                 }
320         }
321         return 0;
322 }
323
324 static int readit_tls(gnutls_session_t s, void *buf, size_t len) {
325         _cleanup_g_free_ char *m = NULL;
326         ssize_t res;
327         while(len > 0) {
328                 DEBUG("*");
329                 if((res = gnutls_record_recv(s, buf, len)) < 0 && !gnutls_error_is_fatal(res)) {
330                         m = g_strdup_printf("issue while receiving data: %s", gnutls_strerror(res));
331                         err_nonfatal(m);
332                 } else if(res < 0) {
333                         m = g_strdup_printf("could not receive data: %s", gnutls_strerror(res));
334                         err_nonfatal(m);
335                         return -1;
336                 } else {
337                         len -= res;
338                         buf += res;
339                 }
340         }
341         return 0;
342 }
343
344 static int socket_read_tls(CLIENT* client, void *buf, size_t len) {
345         return readit_tls(*((gnutls_session_t*)client->tls_session), buf, len);
346 }
347
348 static int socket_write_tls(CLIENT* client, void *buf, size_t len) {
349         return writeit_tls(*((gnutls_session_t*)client->tls_session), buf, len);
350 }
351 #endif // HAVE_GNUTLS
352
353 static int socket_read_notls(CLIENT* client, void *buf, size_t len) {
354         return readit(client->net, buf, len);
355 }
356
357 static int socket_write_notls(CLIENT* client, void *buf, size_t len) {
358         return writeit(client->net, buf, len);
359 }
360
361 static void socket_read(CLIENT* client, void *buf, size_t len) {
362         g_assert(client->socket_read != NULL);
363         if(client->socket_read(client, buf, len)<0) {
364                 g_assert(client->socket_closed != NULL);
365                 client->socket_closed(client);
366         }
367 }
368
369 /**
370  * Consume data from a socket that we don't want
371  *
372  * @param c the client to read from
373  * @param len the number of bytes to consume
374  * @param buf a buffer
375  * @param bufsiz the size of the buffer
376  **/
377 static inline void consume(CLIENT* c, size_t len, void * buf, size_t bufsiz) {
378         size_t curlen;
379         while (len>0) {
380                 curlen = (len>bufsiz)?bufsiz:len;
381                 socket_read(c, buf, curlen);
382                 len -= curlen;
383         }
384 }
385
386 /**
387  * Consume a length field and corresponding payload that we don't want
388  *
389  * @param c the client to read from
390  **/
391 static inline void consume_len(CLIENT* c) {
392         uint32_t len;
393         char buf[1024];
394
395         socket_read(c, &len, sizeof(len));
396         len = ntohl(len);
397         consume(c, len, buf, sizeof(buf));
398 }
399
400 static void socket_write(CLIENT* client, void *buf, size_t len) {
401         g_assert(client->socket_write != NULL);
402         if(client->socket_write(client, buf, len)<0) {
403                 g_assert(client->socket_closed != NULL);
404                 client->socket_closed(client);
405         }
406 }
407
408 static inline void socket_closed_negotiate(CLIENT* client) {
409         err("Negotiation failed: %m");
410 }
411
412 static void cleanup_transactionlog(CLIENT *client) {
413
414         if (client->transactionlogfd != -1) {
415                 close(client->transactionlogfd);
416                 client->transactionlogfd = -1;
417         }
418         if (client->logsem != SEM_FAILED) {
419                 sem_close(client->logsem);
420                 client->logsem = SEM_FAILED;
421                 sem_unlink(client->semname);
422         }
423 }
424
425 static void lock_logsem(CLIENT *client) {
426         sem_wait(client->logsem);
427 }
428 static void unlock_logsem(CLIENT *client) {
429         sem_post(client->logsem);
430 }
431
432 /**
433  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
434  * options
435  *
436  * @param command the command to be ran. Read from the config file
437  * @param file the file name we're about to export
438  **/
439 int do_run(gchar* command, gchar* file) {
440         _cleanup_g_free_ gchar* cmd = NULL;
441         int retval=0;
442
443         if(command && *command) {
444                 cmd = g_strdup_printf(command, file);
445                 retval=system(cmd);
446         }
447         return retval;
448 }
449
450 static inline void finalize_client(CLIENT* client) {
451         g_thread_pool_free(tpool, FALSE, TRUE);
452         do_run(client->server->postrun, client->exportname);
453         if(client->transactionlogfd != -1)
454                 cleanup_transactionlog(client);
455
456         if(client->server->flags & F_COPYONWRITE) {
457                 unlink(client->difffilename);
458         }
459         serve_dec_ref(client->server);
460 }
461
462 static inline void socket_closed_transmission(CLIENT* client) {
463         int saved_errno = errno;
464         finalize_client(client);
465         errno = saved_errno;
466         err("Connection dropped: %m");
467 }
468
469 #ifdef HAVE_SPLICE
470 /**
471  * Splice data between a pipe and a file descriptor
472  *
473  * @param fd_in The fd to splice from.
474  * @param off_in The fd_in offset to splice from.
475  * @param fd_out The fd to splice to.
476  * @param off_out The fd_out offset to splice to.
477  * @param len The length to splice.
478  */
479 static inline void spliceit(int fd_in, loff_t *off_in, int fd_out,
480                             loff_t *off_out, size_t len)
481 {
482         ssize_t ret;
483         while (len > 0) {
484                 if ((ret = splice(fd_in, off_in, fd_out, off_out, len,
485                                   SPLICE_F_MOVE)) <= 0)
486                         err("Splice failed: %m");
487                 len -= ret;
488         }
489 }
490 #endif
491
492 /**
493  * Print out a message about how to use nbd-server. Split out to a separate
494  * function so that we can call it from multiple places
495  */
496 void usage() {
497         printf("This is nbd-server version " VERSION "\n");
498         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections] [-V] [-n] [-d]\n"
499                "\t-r|--read-only\t\tread only\n"
500                "\t-m|--multi-file\t\tmultiple file\n"
501                "\t-c|--copy-on-write\tcopy on write\n"
502                "\t-C|--config-file\tspecify an alternate configuration file\n"
503                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
504                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
505                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
506                "\t-M|--max-connection\tspecify the maximum number of opened connections\n"
507                "\t-V|--version\t\toutput the version and exit\n"
508                "\t-n|--nodaemon\t\tdo not daemonize main process\n"
509                "\t-d|--dont-fork\t\tdo not fork (implies --nodaemon)\n\n"
510                "\tif port is set to 0, stdin is used (for running from inetd).\n"
511                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
512                "\t\taddress of the machine trying to connect\n"
513                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
514         printf("Using configuration file %s\n", CFILE);
515         printf("For help, or when encountering bugs, please contact %s\n", PACKAGE_BUGREPORT);
516 }
517
518 /* Dumps a config file section of the given SERVER*, and exits. */
519 void dump_section(SERVER* serve, gchar* section_header) {
520         printf("[%s]\n", section_header);
521         printf("\texportname = %s\n", serve->exportname);
522         printf("\tlistenaddr = %s\n", serve->listenaddr);
523         if(serve->flags & F_READONLY) {
524                 printf("\treadonly = true\n");
525         }
526         if(serve->flags & F_MULTIFILE) {
527                 printf("\tmultifile = true\n");
528         }
529         if(serve->flags & F_TREEFILES) {
530                 printf("\ttreefiles = true\n");
531         }
532         if(serve->flags & F_COPYONWRITE) {
533                 printf("\tcopyonwrite = true\n");
534         }
535         if(serve->expected_size) {
536                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
537         }
538         if(serve->authname) {
539                 printf("\tauthfile = %s\n", serve->authname);
540         }
541         exit(EXIT_SUCCESS);
542 }
543
544 /**
545  * Parse the command line.
546  *
547  * @param argc the argc argument to main()
548  * @param argv the argv argument to main()
549  **/
550 SERVER* cmdline(int argc, char *argv[], struct generic_conf *genconf) {
551         int i=0;
552         int nonspecial=0;
553         int c;
554         struct option long_options[] = {
555                 {"read-only", no_argument, NULL, 'r'},
556                 {"multi-file", no_argument, NULL, 'm'},
557                 {"copy-on-write", no_argument, NULL, 'c'},
558                 {"nodaemon", no_argument, NULL, 'n'},
559                 {"dont-fork", no_argument, NULL, 'd'},
560                 {"authorize-file", required_argument, NULL, 'l'},
561                 {"config-file", required_argument, NULL, 'C'},
562                 {"pid-file", required_argument, NULL, 'p'},
563                 {"output-config", required_argument, NULL, 'o'},
564                 {"max-connection", required_argument, NULL, 'M'},
565                 {"version", no_argument, NULL, 'V'},
566                 {0,0,0,0}
567         };
568         SERVER *serve;
569         off_t es;
570         size_t last;
571         char suffix;
572         bool do_output=false;
573         gchar* section_header="";
574         gchar** addr_port;
575
576         if(argc==1) {
577                 return NULL;
578         }
579         serve=serve_inc_ref((SERVER*)g_new0(SERVER, 1));
580         serve->authname = g_strdup(default_authname);
581         serve->virtstyle=VIRT_IPLIT;
582         while((c=getopt_long(argc, argv, "-C:cwndl:mo:rp:M:V", long_options, &i))>=0) {
583                 switch (c) {
584                 case 1:
585                         /* non-option argument */
586                         switch(nonspecial++) {
587                         case 0:
588                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
589                                         addr_port=g_strsplit(optarg, ":", 2);
590
591                                         /* Check for "@" - maybe user using this separator
592                                                  for IPv4 address */
593                                         if(!addr_port[1]) {
594                                                 g_strfreev(addr_port);
595                                                 addr_port=g_strsplit(optarg, "@", 2);
596                                         }
597                                 } else {
598                                         addr_port=g_strsplit(optarg, "@", 2);
599                                 }
600
601                                 if(addr_port[1]) {
602                                         genconf->modernport=g_strdup(addr_port[1]);
603                                         genconf->modernaddr=g_strdup(addr_port[0]);
604                                 } else {
605                                         g_free(genconf->modernaddr);
606                                         genconf->modernaddr=NULL;
607                                         genconf->modernport=g_strdup(addr_port[0]);
608                                 }
609                                 g_strfreev(addr_port);
610                                 break;
611                         case 1:
612                                 serve->exportname = g_strdup(optarg);
613                                 if(serve->exportname[0] != '/') {
614                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
615                                         exit(EXIT_FAILURE);
616                                 }
617                                 break;
618                         case 2:
619                                 last=strlen(optarg)-1;
620                                 suffix=optarg[last];
621                                 if (suffix == 'k' || suffix == 'K' ||
622                                     suffix == 'm' || suffix == 'M')
623                                         optarg[last] = '\0';
624                                 es = (off_t)atoll(optarg);
625                                 switch (suffix) {
626                                         case 'm':
627                                         case 'M':  es <<= 10;
628                                         case 'k':
629                                         case 'K':  es <<= 10;
630                                         default :  break;
631                                 }
632                                 serve->expected_size = es;
633                                 break;
634                         }
635                         break;
636                 case 'r':
637                         serve->flags |= F_READONLY;
638                         break;
639                 case 'm':
640                         serve->flags |= F_MULTIFILE;
641                         break;
642                 case 'o':
643                         do_output = true;
644                         section_header = g_strdup(optarg);
645                         break;
646                 case 'p':
647                         strncpy(pidfname, optarg, 256);
648                         pidfname[255]='\0';
649                         break;
650                 case 'c':
651                         serve->flags |=F_COPYONWRITE;
652                         break;
653                 case 'n':
654                         nodaemon = 1;
655                         break;
656                 case 'd':
657                         dontfork = 1;
658                         nodaemon = 1;
659                         break;
660                 case 'C':
661                         g_free(config_file_pos);
662                         config_file_pos=g_strdup(optarg);
663                         break;
664                 case 'l':
665                         g_free(serve->authname);
666                         serve->authname=g_strdup(optarg);
667                         break;
668                 case 'M':
669                         serve->max_connections = strtol(optarg, NULL, 0);
670                         break;
671                 case 'V':
672                         printf("This is nbd-server version " VERSION "\n");
673                         exit(EXIT_SUCCESS);
674                         break;
675                 default:
676                         usage();
677                         exit(EXIT_FAILURE);
678                         break;
679                 }
680         }
681         /* What's left: the port to export, the name of the to be exported
682          * file, and, optionally, the size of the file, in that order. */
683         if(nonspecial<2) {
684                 serve=serve_dec_ref(serve);
685         } else {
686                 serve->servename = "";
687         }
688         if(do_output) {
689                 if(!serve) {
690                         g_critical("Need a complete configuration on the command line to output a config file section!");
691                         exit(EXIT_FAILURE);
692                 }
693                 dump_section(serve, section_header);
694         }
695         return serve;
696 }
697
698 /* forward definition of parse_cfile */
699 GArray* parse_cfile(gchar* f, struct generic_conf *genconf, bool expect_generic, GError** e);
700
701 #ifdef HAVE_STRUCT_DIRENT_D_TYPE
702 #define NBD_D_TYPE de->d_type
703 #else
704 #define NBD_D_TYPE 0
705 #define DT_UNKNOWN 0
706 #define DT_REG 1
707 #endif
708
709 /**
710  * Parse config file snippets in a directory. Uses readdir() and friends
711  * to find files and open them, then passes them on to parse_cfile
712  * with have_global set false
713  **/
714 GArray* do_cfile_dir(gchar* dir, struct generic_conf *const genconf, GError** e) {
715         DIR* dirh = opendir(dir);
716         struct dirent* de;
717         gchar* fname;
718         GArray* retval = NULL;
719         GArray* tmp;
720         struct stat stbuf;
721
722         if(!dirh) {
723                 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_DIR_UNKNOWN, "Invalid directory specified: %s", strerror(errno));
724                 return NULL;
725         }
726         errno=0;
727         while((de = readdir(dirh))) {
728                 int saved_errno=errno;
729                 fname = g_build_filename(dir, de->d_name, NULL);
730                 switch(NBD_D_TYPE) {
731                         case DT_UNKNOWN:
732                                 /* Filesystem doesn't return type of
733                                  * file through readdir, or struct dirent
734                                  * doesn't have d_type. Run stat() on the file
735                                  * instead */
736                                 if(stat(fname, &stbuf)) {
737                                         perror("stat");
738                                         goto err_out;
739                                 }
740                                 if (!S_ISREG(stbuf.st_mode)) {
741                                         goto next;
742                                 }
743                         case DT_REG:
744                                 /* Skip unless the name ends with '.conf' */
745                                 if(strcmp((de->d_name + strlen(de->d_name) - 5), ".conf")) {
746                                         goto next;
747                                 }
748                                 tmp = parse_cfile(fname, genconf, false, e);
749                                 errno=saved_errno;
750                                 if(*e) {
751                                         goto err_out;
752                                 }
753                                 if(!retval)
754                                         retval = g_array_new(FALSE, TRUE, sizeof(SERVER*));
755                                 retval = g_array_append_vals(retval, tmp->data, tmp->len);
756                                 g_array_free(tmp, TRUE);
757                         default:
758                                 break;
759                 }
760         next:
761                 g_free(fname);
762         }
763         if(errno) {
764                 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_READDIR_ERR, "Error trying to read directory: %s", strerror(errno));
765         err_out:
766                 if(retval)
767                         g_array_free(retval, TRUE);
768                 retval = NULL;
769         }
770         if(dirh)
771                 closedir(dirh);
772         return retval;
773 }
774
775 /**
776  * To be called by GArray clearing function.
777  * @param server pointer to server element
778  */
779 static void serve_clear_element(SERVER **server) {
780         serve_dec_ref(*server);
781 }
782
783 /**
784  * Parse the config file.
785  *
786  * @param f the name of the config file
787  *
788  * @param genconf a pointer to generic configuration which will get
789  *        updated with parsed values. If NULL, then parsed generic
790  *        configuration values are safely and silently discarded.
791  *
792  * @param e a GError. Error code can be any of the following:
793  *        NBDS_ERR_CFILE_NOTFOUND, NBDS_ERR_CFILE_MISSING_GENERIC,
794  *        NBDS_ERR_CFILE_VALUE_INVALID, NBDS_ERR_CFILE_VALUE_UNSUPPORTED
795  *        or NBDS_ERR_CFILE_NO_EXPORTS. @see NBDS_ERRS.
796  *
797  * @param expect_generic if true, we expect a configuration file that
798  *        contains a [generic] section. If false, we don't.
799  *
800  * @return a GArray of SERVER* pointers. If the config file is empty or does not
801  *      exist, returns an empty GArray; if the config file contains an
802  *      error, returns NULL, and e is set appropriately
803  **/
804 GArray* parse_cfile(gchar* f, struct generic_conf *const genconf, bool expect_generic, GError** e) {
805         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
806         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
807         gchar* cfdir = NULL;
808         SERVER s;
809         gchar *virtstyle=NULL;
810         PARAM lp[] = {
811                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
812                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
813                 { "filesize",   FALSE,  PARAM_OFFT,     &(s.expected_size),     0 },
814                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
815                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
816                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
817                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
818                 { "cowdir",     FALSE,  PARAM_STRING,   &(s.cowdir),            0 },
819                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
820                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
821                 { "treefiles",  FALSE,  PARAM_BOOL,     &(s.flags),             F_TREEFILES },
822                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
823                 { "waitfile",   FALSE,  PARAM_BOOL,     &(s.flags),             F_WAIT },
824                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
825                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
826                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
827                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
828                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
829                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
830                 { "temporary",  FALSE,  PARAM_BOOL,     &(s.flags),             F_TEMPORARY },
831                 { "trim",       FALSE,  PARAM_BOOL,     &(s.flags),             F_TRIM },
832                 { "datalog",    FALSE,  PARAM_BOOL,     &(s.flags),             F_DATALOG },
833                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
834                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
835                 { "force_tls",  FALSE,  PARAM_BOOL,     &(s.flags),             F_FORCEDTLS },
836                 { "splice",     FALSE,  PARAM_BOOL,     &(s.flags),             F_SPLICE},
837         };
838         const int lp_size=sizeof(lp)/sizeof(PARAM);
839         struct generic_conf genconftmp;
840         PARAM gp[] = {
841                 { "user",       FALSE, PARAM_STRING,    &(genconftmp.user),       0 },
842                 { "group",      FALSE, PARAM_STRING,    &(genconftmp.group),      0 },
843                 { "oldstyle",   FALSE, PARAM_BOOL,      &(genconftmp.flags),      F_OLDSTYLE }, // only left here so we can issue an appropriate error message when the option is used
844                 { "listenaddr", FALSE, PARAM_STRING,    &(genconftmp.modernaddr), 0 },
845                 { "port",       FALSE, PARAM_STRING,    &(genconftmp.modernport), 0 },
846                 { "includedir", FALSE, PARAM_STRING,    &cfdir,                   0 },
847                 { "allowlist",  FALSE, PARAM_BOOL,      &(genconftmp.flags),      F_LIST },
848                 { "unixsock",   FALSE, PARAM_STRING,    &(genconftmp.unixsock),   0 },
849                 { "duallisten", FALSE, PARAM_BOOL,      &(genconftmp.flags),      F_DUAL_LISTEN }, // Used to listen on both TCP and unix socket
850                 { "max_threads", FALSE, PARAM_INT,      &(genconftmp.threads),    0 },
851                 { "force_tls", FALSE, PARAM_BOOL,       &(genconftmp.flags),      F_FORCEDTLS },
852                 { "certfile",   FALSE, PARAM_STRING,    &(genconftmp.certfile),   0 },
853                 { "keyfile",    FALSE, PARAM_STRING,    &(genconftmp.keyfile),    0 },
854                 { "cacertfile", FALSE, PARAM_STRING,    &(genconftmp.cacertfile), 0 },
855                 { "tlsprio",    FALSE,  PARAM_STRING,   &(genconftmp.tlsprio),    0 },
856         };
857         PARAM* p=gp;
858         int p_size=sizeof(gp)/sizeof(PARAM);
859         _cleanup_(g_key_file_freep) GKeyFile *cfile = NULL;
860         g_autoptr(GError) err = NULL;
861         const char *err_msg=NULL;
862         GArray *retval=NULL;
863         gchar **groups;
864         gboolean bval;
865         gint ival;
866         gint64 i64val;
867         gchar* sval;
868         _cleanup_g_free_ gchar* startgroup = NULL;
869         gint i;
870         gint j;
871
872         memset(&genconftmp, 0, sizeof(struct generic_conf));
873
874         genconftmp.tlsprio = "NORMAL:-VERS-TLS-ALL:+VERS-TLS1.2:%SERVER_PRECEDENCE";
875
876         if (genconf) {
877                 /* Use the passed configuration values as defaults. The
878                  * parsing algorithm below updates all parameter targets
879                  * found from configuration files. */
880                 memcpy(&genconftmp, genconf, sizeof(struct generic_conf));
881         }
882
883         cfile = g_key_file_new();
884         retval = g_array_new(FALSE, TRUE, sizeof(SERVER*));
885         if(expect_generic) {
886                 g_array_set_clear_func(retval, (GDestroyNotify)serve_clear_element);
887         }
888         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
889                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
890                 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_NOTFOUND, "Could not open config file %s: %s",
891                                 f, err->message);
892                 g_key_file_free(cfile);
893                 return retval;
894         }
895         startgroup = g_key_file_get_start_group(cfile);
896         if((!startgroup || strcmp(startgroup, "generic")) && expect_generic) {
897                 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
898                 return NULL;
899         }
900         groups = g_key_file_get_groups(cfile, NULL);
901         for(i=0;groups[i];i++) {
902                 memset(&s, '\0', sizeof(SERVER));
903
904                 /* After the [generic] group or when we're parsing an include
905                  * directory, start parsing exports */
906                 if(i==1 || !expect_generic) {
907                         p=lp;
908                         p_size=lp_size;
909                 }
910                 for(j=0;j<p_size;j++) {
911                         assert(p[j].target != NULL);
912                         assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL||p[j].ptype==PARAM_INT64);
913                         switch(p[j].ptype) {
914                                 case PARAM_INT:
915                                         ival = g_key_file_get_integer(cfile,
916                                                                 groups[i],
917                                                                 p[j].paramname,
918                                                                 &err);
919                                         if(!err) {
920                                                 *((gint*)p[j].target) = ival;
921                                         }
922                                         break;
923                                 case PARAM_INT64:
924                                         i64val = g_key_file_get_int64(cfile,
925                                                                 groups[i],
926                                                                 p[j].paramname,
927                                                                 &err);
928                                         if(!err) {
929                                                 *((gint64*)p[j].target) = i64val;
930                                         }
931                                         break;
932                                 case PARAM_STRING:
933                                         sval = g_key_file_get_string(cfile,
934                                                                 groups[i],
935                                                                 p[j].paramname,
936                                                                 &err);
937                                         if(!err) {
938                                                 *((gchar**)p[j].target) = sval;
939                                         }
940                                         break;
941                                 case PARAM_BOOL:
942                                         bval = g_key_file_get_boolean(cfile,
943                                                         groups[i],
944                                                         p[j].paramname, &err);
945                                         if(!err) {
946                                                 if(bval) {
947                                                         *((gint*)p[j].target) |= p[j].flagval;
948                                                 } else {
949                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
950                                                 }
951                                         }
952                                         break;
953                         }
954                         if(err) {
955                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
956                                         if(!p[j].required) {
957                                                 /* Ignore not-found error for optional values */
958                                                 g_clear_error(&err);
959                                                 continue;
960                                         } else {
961                                                 err_msg = MISSING_REQUIRED_ERROR;
962                                         }
963                                 } else {
964                                         err_msg = DEFAULT_ERROR;
965                                 }
966                                 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
967                                 g_array_free(retval, TRUE);
968                                 return NULL;
969                         }
970                 }
971                 if(virtstyle) {
972                         if(!strncmp(virtstyle, "none", 4)) {
973                                 s.virtstyle=VIRT_NONE;
974                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
975                                 s.virtstyle=VIRT_IPLIT;
976                         } else if(!strncmp(virtstyle, "iphash", 6)) {
977                                 s.virtstyle=VIRT_IPHASH;
978                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
979                                 s.virtstyle=VIRT_CIDR;
980                                 if(strlen(virtstyle)<10) {
981                                         g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
982                                         g_array_free(retval, TRUE);
983                                         return NULL;
984                                 }
985                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
986                         } else {
987                                 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
988                                 g_array_free(retval, TRUE);
989                                 return NULL;
990                         }
991                 } else {
992                         s.virtstyle=VIRT_IPLIT;
993                 }
994                 if(genconftmp.flags & F_OLDSTYLE) {
995                         g_message("Since 3.10, the oldstyle protocol is no longer supported. Please migrate to the newstyle protocol.");
996                         g_message("Exiting.");
997                         return NULL;
998                 }
999 #ifndef HAVE_SPLICE
1000                 if (s.flags & F_SPLICE) {
1001                         g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without splice support, yet group %s uses it", groups[i]);
1002                         g_array_free(retval, TRUE);
1003                         return NULL;
1004                 }
1005 #endif
1006                 /* We can't mix copyonwrite and splice. */
1007                 if ((s.flags & F_COPYONWRITE) && (s.flags & F_SPLICE)) {
1008                         g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_SPLICE,
1009                                     "Cannot mix copyonwrite with splice for an export in group %s",
1010                                     groups[i]);
1011                         g_array_free(retval, TRUE);
1012                         return NULL;
1013                 }
1014                 if ((s.flags & F_COPYONWRITE) && (s.flags & F_WAIT)) {
1015                         g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_WAIT,
1016                                     "Cannot mix copyonwrite with waitfile for an export in group %s",
1017                                     groups[i]);
1018                         g_array_free(retval, TRUE);
1019                         return NULL;
1020                 }
1021                 /* We can't mix datalog and splice. */
1022                 if ((s.flags & F_DATALOG) && (s.flags & F_SPLICE)) {
1023                         g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_INVALID_SPLICE,
1024                                     "Cannot mix datalog with splice for an export in group %s",
1025                                     groups[i]);
1026                         g_array_free(retval, TRUE);
1027                         return NULL;
1028                 }
1029                 /* Don't need to free this, it's not our string */
1030                 virtstyle=NULL;
1031                 /* Don't append values for the [generic] group */
1032                 if(i>0 || !expect_generic) {
1033                         s.servename = groups[i];
1034
1035                         SERVER *srv = serve_inc_ref(g_memdup2(&s, sizeof(SERVER)));
1036                         g_array_append_val(retval, srv);
1037                 }
1038 #ifndef WITH_SDP
1039                 if(s.flags & F_SDP) {
1040                         g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
1041                         g_array_free(retval, TRUE);
1042                         return NULL;
1043                 }
1044 #endif
1045         }
1046         if(cfdir) {
1047                 GArray* extra = do_cfile_dir(cfdir, &genconftmp, e);
1048                 if(extra) {
1049                         retval = g_array_append_vals(retval, extra->data, extra->len);
1050                         i+=extra->len;
1051                         g_array_free(extra, TRUE);
1052                 } else {
1053                         if(*e) {
1054                                 g_array_free(retval, TRUE);
1055                                 return NULL;
1056                         }
1057                 }
1058         }
1059         if(i==1 && expect_generic) {
1060                 g_set_error(e, NBDS_ERR, NBDS_ERR_CFILE_NO_EXPORTS, "The config file does not specify any exports");
1061         }
1062
1063         if (genconf) {
1064                 /* Return the updated generic configuration through the
1065                  * pointer parameter. */
1066                 memcpy(genconf, &genconftmp, sizeof(struct generic_conf));
1067         }
1068
1069         return retval;
1070 }
1071
1072 /**
1073  * Handle SIGCHLD by setting atomically a flag which will be evaluated in the
1074  * main loop of the root server process. This allows us to separate the signal
1075  * catching from th actual task triggered by SIGCHLD and hence processing in the
1076  * interrupt context is kept as minimial as possible.
1077  *
1078  * @param s the signal we're handling (must be SIGCHLD, or something
1079  * is severely wrong)
1080  **/
1081 static void sigchld_handler(const int s G_GNUC_UNUSED) {
1082         is_sigchld_caught = 1;
1083 }
1084
1085 /**
1086  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
1087  *
1088  * @param key the key
1089  * @param value the value corresponding to the above key
1090  * @param user_data a pointer which we always set to 1, so that we know what
1091  * will happen next.
1092  **/
1093 void killchild(gpointer key, gpointer value, gpointer user_data) {
1094         pid_t *pid=value;
1095
1096         kill(*pid, SIGTERM);
1097 }
1098
1099 /**
1100  * Handle SIGTERM by setting atomically a flag which will be evaluated in the
1101  * main loop of the root server process. This allows us to separate the signal
1102  * catching from th actual task triggered by SIGTERM and hence processing in the
1103  * interrupt context is kept as minimial as possible.
1104  *
1105  * @param s the signal we're handling (must be SIGTERM, or something
1106  * is severely wrong).
1107  **/
1108 static void sigterm_handler(const int s G_GNUC_UNUSED) {
1109         is_sigterm_caught = 1;
1110 }
1111
1112 /**
1113  * Handle SIGHUP by setting atomically a flag which will be evaluated in
1114  * the main loop of the root server process. This allows us to separate
1115  * the signal catching from th actual task triggered by SIGHUP and hence
1116  * processing in the interrupt context is kept as minimial as possible.
1117  *
1118  * @param s the signal we're handling (must be SIGHUP, or something
1119  * is severely wrong).
1120  **/
1121 static void sighup_handler(const int s G_GNUC_UNUSED) {
1122         is_sighup_caught = 1;
1123 }
1124
1125 static void sigusr1_handler(const int s G_GNUC_UNUSED) {
1126         msg(LOG_INFO, "Got SIGUSR1");
1127         sem_post(&file_wait_sem);
1128 }
1129
1130 /**
1131  * Get the file handle and offset, given an export offset.
1132  *
1133  * @param client The client we're serving for
1134  * @param a The offset to get corresponding file/offset for
1135  * @param fhandle [out] File descriptor
1136  * @param foffset [out] Offset into fhandle
1137  * @param maxbytes [out] Tells how many bytes can be read/written
1138  * from fhandle starting at foffset (0 if there is no limit)
1139  * @return 0 on success, -1 on failure
1140  **/
1141 int get_filepos(CLIENT *client, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1142         GArray * const export = client->export;
1143
1144         /* Negative offset not allowed */
1145         if(a < 0)
1146                 return -1;
1147
1148         /* Open separate file for treefiles */
1149         if (client->server->flags & F_TREEFILES) {
1150                 *foffset = a % TREEPAGESIZE;
1151                 *maxbytes = (( 1 + (a/TREEPAGESIZE) ) * TREEPAGESIZE) - a; // start position of next block
1152                 *fhandle = open_treefile(client->exportname, ((client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR), client->exportsize,a, &client->lock);
1153                 return 0;
1154         }
1155
1156         /* Binary search for last file with starting offset <= a */
1157         FILE_INFO fi;
1158         int start = 0;
1159         int end = export->len - 1;
1160         while( start <= end ) {
1161                 int mid = (start + end) / 2;
1162                 fi = g_array_index(export, FILE_INFO, mid);
1163                 if( fi.startoff < a ) {
1164                         start = mid + 1;
1165                 } else if( fi.startoff > a ) {
1166                         end = mid - 1;
1167                 } else {
1168                         start = end = mid;
1169                         break;
1170                 }
1171         }
1172
1173         /* end should never go negative, since first startoff is 0 and a >= 0 */
1174         assert(end >= 0);
1175
1176         fi = g_array_index(export, FILE_INFO, end);
1177         *fhandle = fi.fhandle;
1178         *foffset = a - fi.startoff;
1179         *maxbytes = 0;
1180         if( end+1 < export->len ) {
1181                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1182                 *maxbytes = fi_next.startoff - a;
1183         }
1184
1185         return 0;
1186 }
1187
1188 /**
1189  * Write an amount of bytes at a given offset to the right file. This
1190  * abstracts the write-side of the multiple file option.
1191  *
1192  * @param a The offset where the write should start
1193  * @param buf The buffer to write from
1194  * @param len The length of buf
1195  * @param client The client we're serving for
1196  * @param fua Flag to indicate 'Force Unit Access'
1197  * @return The number of bytes actually written, or -1 in case of an error
1198  **/
1199 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1200         int fhandle;
1201         off_t foffset;
1202         size_t maxbytes;
1203         ssize_t retval;
1204
1205         if(get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1206                 return -1;
1207         if(maxbytes && len > maxbytes)
1208                 len = maxbytes;
1209
1210         DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1211
1212         retval = pwrite(fhandle, buf, len, foffset);
1213         if(client->server->flags & F_SYNC) {
1214                 fsync(fhandle);
1215         } else if (fua) {
1216
1217           /* This is where we would do the following
1218            *   #ifdef USE_SYNC_FILE_RANGE
1219            * However, we don't, for the reasons set out below
1220            * by Christoph Hellwig <hch@infradead.org>
1221            *
1222            * [BEGINS]
1223            * fdatasync is equivalent to fsync except that it does not flush
1224            * non-essential metadata (basically just timestamps in practice), but it
1225            * does flush metadata requried to find the data again, e.g. allocation
1226            * information and extent maps.  sync_file_range does nothing but flush
1227            * out pagecache content - it means you basically won't get your data
1228            * back in case of a crash if you either:
1229            *
1230            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1231            *  b) are using a sparse file on a filesystem
1232            *  c) are using a fallocate-preallocated file on a filesystem
1233            *  d) use any file on a COW filesystem like btrfs
1234            *
1235            * e.g. it only does anything useful for you if you do not have a volatile
1236            * write cache, and either use a raw block device node, or just overwrite
1237            * an already fully allocated (and not preallocated) file on a non-COW
1238            * filesystem.
1239            * [ENDS]
1240            *
1241            * What we should do is open a second FD with O_DSYNC set, then write to
1242            * that when appropriate. However, with a Linux client, every REQ_FUA
1243            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1244            * problems.
1245            *
1246            */
1247 #if 0
1248                 sync_file_range(fhandle, foffset, len,
1249                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1250                                 SYNC_FILE_RANGE_WAIT_AFTER);
1251 #else
1252                 fdatasync(fhandle);
1253 #endif
1254         }
1255         /* close file pointer in case of treefiles */
1256         if (client->server->flags & F_TREEFILES) {
1257                 close(fhandle);
1258         }
1259         return retval;
1260 }
1261
1262 /**
1263  * Call rawexpwrite repeatedly until all data has been written.
1264  *
1265  * @param a The offset where the write should start
1266  * @param buf The buffer to write from
1267  * @param len The length of buf
1268  * @param client The client we're serving for
1269  * @param fua Flag to indicate 'Force Unit Access'
1270  * @return 0 on success, nonzero on failure
1271  **/
1272 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1273         ssize_t ret=0;
1274
1275         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1276                 a += ret;
1277                 buf += ret;
1278                 len -= ret;
1279         }
1280         return (ret < 0 || len != 0);
1281 }
1282
1283 static void setup_reply(struct nbd_reply* rep, struct nbd_request* req) {
1284         rep->magic = htonl(NBD_REPLY_MAGIC);
1285         rep->error = 0;
1286         rep->cookie = req->cookie;
1287 }
1288
1289 static void log_reply(CLIENT *client, struct nbd_reply *prply) {
1290         if (client->transactionlogfd != -1) {
1291                 lock_logsem(client);
1292                 writeit(client->transactionlogfd, prply, sizeof(*prply));
1293                 unlock_logsem(client);
1294         }
1295 }
1296
1297 static void log_structured_reply(CLIENT *client, struct nbd_structured_reply *prply) {
1298         if (client->transactionlogfd != -1) {
1299                 lock_logsem(client);
1300                 writeit(client->transactionlogfd, prply, sizeof(*prply));
1301                 unlock_logsem(client);
1302         }
1303 }
1304
1305 void send_structured_chunk(CLIENT *client, struct nbd_request *req, uint16_t flags, uint16_t type, uint32_t length, int bufcount, void *buf[], size_t buflen[]) {
1306         struct nbd_structured_reply rep;
1307         rep.magic = htonl(NBD_STRUCTURED_REPLY_MAGIC);
1308         rep.flags = htons(flags);
1309         rep.type = htons(type);
1310         rep.cookie = req->cookie;
1311         rep.paylen = htonl(length);
1312         pthread_mutex_lock(&(client->lock));
1313         socket_write(client, &rep, sizeof rep);
1314         for(int i=0; i<bufcount; i++) {
1315                 socket_write(client, buf[i], buflen[i]);
1316         }
1317         pthread_mutex_unlock(&(client->lock));
1318         log_structured_reply(client, &rep);
1319 }
1320
1321 void send_structured_chunk_v(CLIENT *client, struct nbd_request *req, uint16_t flags, uint16_t type, uint32_t length, int bufcount, ...) {
1322         struct nbd_structured_reply rep;
1323         va_list ap;
1324         rep.magic = htonl(NBD_STRUCTURED_REPLY_MAGIC);
1325         rep.flags = htons(flags);
1326         rep.type = htons(type);
1327         rep.cookie = req->cookie;
1328         rep.paylen = htonl(length);
1329         va_start(ap, bufcount);
1330         pthread_mutex_lock(&(client->lock));
1331         socket_write(client, &rep, sizeof rep);
1332         for(int i=0; i<bufcount; i++) {
1333                 void *buf = va_arg(ap, void*);
1334                 size_t size = va_arg(ap, size_t);
1335                 socket_write(client, buf, size);
1336         }
1337         pthread_mutex_unlock(&(client->lock));
1338         log_structured_reply(client, &rep);
1339         va_end(ap);
1340 }
1341
1342 /**
1343  * Find the location to write the data for the next chunk to.
1344  * Assumes checks on memory sizes etc have already been done.
1345  *
1346  * @param ctx the context we're working with
1347  * @param offset the offset into the request
1348  * @param len the length of this chunk.
1349  */
1350 char * find_read_buf(READ_CTX *ctx) {
1351         if(!(ctx->is_structured) || ctx->df) {
1352                 return ctx->buf + ctx->current_offset;
1353         }
1354         ctx->buf = malloc(ctx->current_len);
1355         if(!(ctx->buf)) {
1356                 err("Could not allocate memory for request");
1357         }
1358         return ctx->buf;
1359 }
1360
1361 void confirm_read(CLIENT *client, READ_CTX *ctx, size_t len_read) {
1362         if(ctx->is_structured && !(ctx->df)) {
1363                 uint64_t offset = htonll(ctx->req->from + (uint64_t)(ctx->current_offset));
1364                 send_structured_chunk_v(client, ctx->req, 0, NBD_REPLY_TYPE_OFFSET_DATA, len_read + 8, 2, &offset, sizeof offset, ctx->buf, (size_t)len_read);
1365                 free(ctx->buf);
1366         }
1367 }
1368
1369 void complete_read(CLIENT *client, READ_CTX *ctx, uint32_t error, char *errmsg, uint16_t msglen, bool with_offset, uint64_t err_offset) {
1370         uint16_t type;
1371         uint64_t offset = 0;
1372         if(ctx->is_structured) {
1373                 if(ctx->df) {
1374                         uint32_t len = ctx->req->len;
1375                         if(error != 0 && with_offset) {
1376                                 len = err_offset;
1377                         }
1378                         if(error == 0 || with_offset) {
1379                                 offset = htonll(ctx->req->from);
1380                                 send_structured_chunk_v(client, ctx->req, 0, NBD_REPLY_TYPE_OFFSET_DATA, len + 8, 2, &offset, sizeof offset, ctx->buf, err_offset);
1381                         }
1382                         free(ctx->buf);
1383                 }
1384                 if(error != 0) {
1385                         struct nbd_structured_error_payload pl;
1386                         void *buf[3];
1387                         size_t bufsize[3];
1388                         int payloads = 1;
1389                         size_t total_size;
1390                         pl.error = error;
1391                         pl.msglen = msglen;
1392                         if(with_offset) {
1393                                 offset += err_offset;
1394                                 type = NBD_REPLY_TYPE_ERROR_OFFSET;
1395                         } else {
1396                                 type = NBD_REPLY_TYPE_ERROR;
1397                         }
1398                         buf[0] = &pl;
1399                         bufsize[0] = sizeof pl;
1400                         total_size = bufsize[0];
1401                         if(msglen > 0) {
1402                                 buf[payloads] = errmsg;
1403                                 bufsize[payloads++] = msglen;
1404                                 total_size += msglen;
1405                         }
1406                         if(with_offset) {
1407                                 buf[payloads] = &offset;
1408                                 bufsize[payloads++] = sizeof offset;
1409                                 total_size += sizeof offset;
1410                         }
1411                         send_structured_chunk(client, ctx->req, NBD_REPLY_FLAG_DONE, type, total_size, payloads, buf, bufsize);
1412                         return;
1413                 }
1414                 send_structured_chunk_v(client, ctx->req, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, 0, 0);
1415         } else {
1416                 struct nbd_reply rep;
1417                 setup_reply(&rep, ctx->req);
1418                 if(error) {
1419                         rep.error = error;
1420                 }
1421                 log_reply(client, &rep);
1422                 pthread_mutex_lock(&(client->lock));
1423                 socket_write(client, &rep, sizeof rep);
1424                 if(!error) {
1425                         socket_write(client, ctx->buf, ctx->buflen);
1426                 }
1427                 pthread_mutex_unlock(&(client->lock));
1428                 free(ctx->buf);
1429         }
1430 }
1431
1432 /**
1433  * Read an amount of bytes at a given offset from the right file. This
1434  * abstracts the read-side of the multiple files option.
1435  *
1436  * @param a The offset where the read should start
1437  * @param buf A buffer to read into
1438  * @param len The size of buf
1439  * @param client The client we're serving for
1440  * @return The number of bytes actually read, or -1 in case of an
1441  * error.
1442  **/
1443 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1444         int fhandle;
1445         off_t foffset;
1446         size_t maxbytes;
1447         ssize_t retval;
1448
1449         if(get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1450                 return -1;
1451         if(maxbytes && len > maxbytes)
1452                 len = maxbytes;
1453
1454         DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1455
1456         retval = pread(fhandle, buf, len, foffset);
1457         if (client->server->flags & F_TREEFILES) {
1458                 close(fhandle);
1459         }
1460         return retval;
1461 }
1462
1463 /**
1464  * Call rawexpread repeatedly until all data has been read.
1465  * @return 0 on success, nonzero on failure
1466  **/
1467 int rawexpread_fully(READ_CTX *ctx, CLIENT *client) {
1468         ssize_t ret=0;
1469
1470         char *buf;
1471
1472         while(ctx->current_len > 0) {
1473                 buf = find_read_buf(ctx);
1474                 if((ret = rawexpread((off_t)ctx->req->from + (off_t)ctx->current_offset, buf, ctx->current_len, client)) <= 0) {
1475                         break;
1476                 }
1477                 confirm_read(client, ctx, ret);
1478                 ctx->current_offset += ret;
1479                 ctx->current_len -= ret;
1480         }
1481         return (ret < 0 || ctx->current_len != 0);
1482 }
1483
1484 #ifdef HAVE_SPLICE
1485 int rawexpsplice(int pipe, off_t a, size_t len, CLIENT *client, int dir,
1486                  int fua)
1487 {
1488         int fhandle;
1489         off_t foffset;
1490         size_t maxbytes;
1491         ssize_t retval;
1492
1493         if (get_filepos(client, a, &fhandle, &foffset, &maxbytes))
1494                 return -1;
1495         if (maxbytes && len > maxbytes)
1496                 len = maxbytes;
1497
1498         DEBUG("(SPLICE %s fd %d offset %llu len %u), ",
1499               (dir == SPLICE_IN) ? "from" : "to", fhandle,
1500               (unsigned long long)a, (unsigned)len);
1501
1502         /*
1503          * SPLICE_F_MOVE doesn't actually work at the moment, but in the future
1504          * it might, so go ahead and use it.
1505          */
1506         if (dir == SPLICE_IN) {
1507                 retval = splice(fhandle, &foffset, pipe, NULL, len,
1508                                 SPLICE_F_MOVE);
1509         } else {
1510                 retval = splice(pipe, NULL, fhandle, &foffset, len,
1511                                 SPLICE_F_MOVE);
1512                 if (client->server->flags & F_SYNC)
1513                         fsync(fhandle);
1514                 else if (fua)
1515                         fdatasync(fhandle);
1516         }
1517         if (client->server->flags & F_TREEFILES)
1518                 close(fhandle);
1519         return retval;
1520 }
1521
1522 /**
1523  * Splice an amount of bytes from the given offset from/into the right file
1524  * from/into the given pipe.
1525  * @param pipe The pipe we are using for this splice.
1526  * @param a The offset of the file we are operating on.
1527  * @param len The length of the splice.
1528  * @param client The client we're splicing for.
1529  * @param dir The direction we are doing the splice in.
1530  * @param fua Set if this is a write and we need to fua.
1531  * @return 0 on success, nonzero on failure.
1532  */
1533 int expsplice(int pipe, off_t a, size_t len, CLIENT *client, int dir, int fua)
1534 {
1535         ssize_t ret = 0;
1536
1537         while (len > 0 &&
1538                (ret = rawexpsplice(pipe, a, len, client, dir, fua)) > 0) {
1539                 a += ret;
1540                 len -= ret;
1541         }
1542         return (ret < 0 || len != 0);
1543 }
1544 #endif /* HAVE_SPLICE */
1545
1546 /**
1547  * Read an amount of bytes at a given offset from the right file. This
1548  * abstracts the read-side of the copyonwrite stuff, and calls
1549  * rawexpread() with the right parameters to do the actual work.
1550  * @param a The offset where the read should start
1551  * @param buf A buffer to read into
1552  * @param len The size of buf
1553  * @param client The client we're going to read for
1554  * @return 0 on success, nonzero on failure
1555  **/
1556 int expread(READ_CTX *ctx, CLIENT *client) {
1557         off_t rdlen, offset;
1558         off_t mapcnt, mapl, maph, pagestart;
1559         off_t a = (off_t)ctx->current_offset + (off_t)ctx->req->from;
1560         size_t len = (size_t) ctx->req->len;
1561         int rv = 0;
1562
1563         DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1564
1565         if (!(client->server->flags & F_COPYONWRITE) && !((client->server->flags & F_WAIT) && (client->export == NULL)))
1566                 return(rawexpread_fully(ctx, client));
1567
1568         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1569
1570         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1571                 pagestart=mapcnt*DIFFPAGESIZE;
1572                 offset=a-pagestart;
1573                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1574                         len : (size_t)DIFFPAGESIZE-offset;
1575                 if (!(client->server->flags & F_COPYONWRITE))
1576                         pthread_rwlock_rdlock(&client->export_lock);
1577                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1578                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1579                                (unsigned long)(client->difmap[mapcnt]));
1580                         char *buf = find_read_buf(ctx);
1581                         if (pread(client->difffile, buf, rdlen, client->difmap[mapcnt]*DIFFPAGESIZE+offset) != rdlen) {
1582                                 goto fail;
1583                         }
1584                         confirm_read(client, ctx, rdlen);
1585                 } else { /* the block is not there */
1586                         if ((client->server->flags & F_WAIT) && (client->export == NULL)){
1587                                 DEBUG("Page %llu is not here, and waiting for file\n",
1588                                        (unsigned long long)mapcnt);
1589                                 goto fail;
1590                         } else {
1591                                 DEBUG("Page %llu is not here, we read the original one\n",
1592                                        (unsigned long long)mapcnt);
1593                                 ctx->current_len = rdlen;
1594                                 if(rawexpread_fully(ctx, client)) goto fail;
1595                         }
1596                 }
1597                 if (!(client->server->flags & F_COPYONWRITE))
1598                         pthread_rwlock_unlock(&client->export_lock);
1599                 len-=rdlen; a+=rdlen;
1600         }
1601         rv = 0;
1602         goto end;
1603 fail:
1604         if (!(client->server->flags & F_COPYONWRITE))
1605                 pthread_rwlock_unlock(&client->export_lock);
1606         rv = -1;
1607 end:
1608         return rv;
1609 }
1610
1611 /**
1612  * Write an amount of bytes at a given offset to the right file. This
1613  * abstracts the write-side of the copyonwrite option, and calls
1614  * rawexpwrite() with the right parameters to do the actual work.
1615  *
1616  * @param a The offset where the write should start
1617  * @param buf The buffer to write from
1618  * @param len The length of buf
1619  * @param client The client we're going to write for.
1620  * @param fua Flag to indicate 'Force Unit Access'
1621  * @return 0 on success, nonzero on failure
1622  **/
1623 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1624         char pagebuf[DIFFPAGESIZE];
1625         off_t mapcnt,mapl,maph;
1626         off_t wrlen,rdlen;
1627         off_t pagestart;
1628         off_t offset;
1629
1630         DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1631
1632
1633         if (!(client->server->flags & F_COPYONWRITE) && !((client->server->flags & F_WAIT) && (client->export == NULL)))
1634                 return(rawexpwrite_fully(a, buf, len, client, fua));
1635
1636         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1637
1638         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1639                 pagestart=mapcnt*DIFFPAGESIZE ;
1640                 offset=a-pagestart ;
1641                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1642                         len : (size_t)DIFFPAGESIZE-offset;
1643
1644                 if (!(client->server->flags & F_COPYONWRITE))
1645                         pthread_rwlock_rdlock(&client->export_lock);
1646                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1647                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1648                                (unsigned long)(client->difmap[mapcnt])) ;
1649                         if (pwrite(client->difffile, buf, wrlen, client->difmap[mapcnt]*DIFFPAGESIZE+offset) != wrlen) goto fail;
1650                 } else { /* the block is not there */
1651                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1652                         DEBUG("Page %llu is not here, we put it at %lu\n",
1653                                (unsigned long long)mapcnt,
1654                                (unsigned long)(client->difmap[mapcnt]));
1655                         if ((offset != 0) || (wrlen != DIFFPAGESIZE)){
1656                                 if ((client->server->flags & F_WAIT) && (client->export == NULL)){
1657                                         DEBUG("error: we can write only whole page while waiting for file\n");
1658                                         goto fail;
1659                                 }
1660                                 rdlen=DIFFPAGESIZE;
1661                                 int ret;
1662                                 char *ptr = pagebuf;
1663                                 while(rdlen > 0 && (ret = rawexpread(pagestart, ptr, rdlen, client)) > 0) {
1664                                         pagestart += ret;
1665                                         ptr += ret;
1666                                         rdlen -= ret;
1667                                 }
1668                                 if(ret < 0 ) goto fail;
1669                         }
1670                         memcpy(pagebuf+offset,buf,wrlen) ;
1671                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) != DIFFPAGESIZE)
1672                                 goto fail;
1673                 }
1674                 if (!(client->server->flags & F_COPYONWRITE))
1675                         pthread_rwlock_unlock(&client->export_lock);
1676                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1677         }
1678         if (client->server->flags & F_SYNC) {
1679                 fsync(client->difffile);
1680         } else if (fua) {
1681                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1682                    as we iterate through the above?
1683                  */
1684                 fdatasync(client->difffile);
1685         }
1686         return 0;
1687 fail:
1688         if (!(client->server->flags & F_COPYONWRITE))
1689                 pthread_rwlock_unlock(&client->export_lock);
1690         return -1;
1691 }
1692
1693
1694 /**
1695  * Write an amount of zeroes at a given offset to the right file.
1696  * This routine could be optimised by not calling expwrite. However,
1697  * this is by far the simplest way to do it.
1698  *
1699  * @param req the request
1700  * @param client The client we're going to write for.
1701  * @return 0 on success, nonzero on failure
1702  **/
1703 int expwrite_zeroes(struct nbd_request* req, CLIENT* client, int fua) {
1704         off_t a = req->from;
1705         size_t len = req->len;
1706         size_t maxsize = 64LL*1024LL*1024LL;
1707         /* use calloc() as sadly MAP_ANON is apparently not POSIX standard */
1708         char *buf = calloc (1, maxsize);
1709         int ret;
1710         while (len > 0) {
1711                 size_t l = len;
1712                 if (l > maxsize)
1713                         l = maxsize;
1714                 ret = expwrite(a, buf, l, client, fua);
1715                 if (ret) {
1716                         free(buf);
1717                         return ret;
1718                 }
1719                 len -= l;
1720         }
1721         free(buf);
1722         return 0;
1723 }
1724
1725 /**
1726  * Flush data to a client
1727  *
1728  * @param client The client we're going to write for.
1729  * @return 0 on success, nonzero on failure
1730  **/
1731 int expflush(CLIENT *client) {
1732         gint i;
1733
1734         if (client->server->flags & F_COPYONWRITE) {
1735                 return fsync(client->difffile);
1736         }
1737
1738         if (client->server->flags & F_WAIT) {
1739                 return fsync(client->difffile);
1740         }
1741
1742         if (client->server->flags & F_TREEFILES ) {
1743                 // all we can do is force sync the entire filesystem containing the tree
1744                 if (client->server->flags & F_READONLY)
1745                         return 0;
1746                 sync();
1747                 return 0;
1748         }
1749
1750         for (i = 0; i < client->export->len; i++) {
1751                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1752                 if (fsync(fi.fhandle) < 0)
1753                         return -1;
1754         }
1755
1756         return 0;
1757 }
1758
/**
 * Try to deallocate a range of a file or device. Whichever mechanisms were
 * detected at build time are tried in turn; failure is only reported
 * through DEBUG output.
 *
 * @param fd the file descriptor to operate on
 * @param off start of the range
 * @param len length of the range
 **/
void punch_hole(int fd, off_t off, off_t len) {
	DEBUG("Request to punch a hole in fd=%d, starting from %llu, length %llu\n", fd, (unsigned long long)off, (unsigned long long)len);
	errno = 0;
// fallocate -- files, Linux
#if HAVE_FALLOC_PH
	for(;;) {
		if(fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len) == 0) {
			return;
		}
		/* only retry when the call was interrupted */
		if(errno != EINTR) {
			break;
		}
	}
#endif
// ioctl(BLKDISCARD) -- block devices, Linux
#if HAVE_BLKDISCARD
	uint64_t range[2] = {off, len};
	for(;;) {
		if(ioctl(fd, BLKDISCARD, range) == 0) {
			return;
		}
		if(errno != EINTR) {
			break;
		}
	}
#endif
// Windows
#if HAVE_FSCTL_SET_ZERO_DATA
	FILE_ZERO_DATA_INFORMATION zerodata;
	zerodata.FileOffset.QuadPart = off;
	zerodata.BeyondFinalZero.QuadPart = off + len;
	HANDLE w32handle = (HANDLE)_get_osfhandle(fd);
	DWORD bytesret;
	DeviceIoControl(w32handle, FSCTL_SET_ZERO_DATA, &zerodata, sizeof(zerodata), NULL, 0, &bytesret, NULL);
	return;
#endif
	/* errno is still 0 if none of the mechanisms above was compiled in */
	if(!errno) {
		DEBUG("punching holes not supported on this platform\n");
	} else {
		DEBUG("punching holes failed: %s", strerror(errno));
	}
}
1793
1794 static void send_reply(CLIENT* client, uint32_t opt, uint32_t reply_type, ssize_t datasize, void* data) {
1795         struct {
1796                 uint64_t magic;
1797                 uint32_t opt;
1798                 uint32_t reply_type;
1799                 uint32_t datasize;
1800         } __attribute__ ((packed)) header = {
1801                 htonll(0x3e889045565a9LL),
1802                 htonl(opt),
1803                 htonl(reply_type),
1804                 htonl(datasize),
1805         };
1806         if(datasize < 0) {
1807                 datasize = strlen((char*)data);
1808                 header.datasize = htonl(datasize);
1809         }
1810         socket_write(client, &header, sizeof(header));
1811         if(data != NULL) {
1812                 socket_write(client, data, datasize);
1813         }
1814 }
1815
1816 /**
1817  * Find the name of the file we have to serve. This will use g_strdup_printf
1818  * to put the IP address of the client inside a filename containing
1819  * "%s" (in the form as specified by the "virtstyle" option). That name
1820  * is then written to client->exportname.
1821  *
1822  * @param net A socket connected to an nbd client
1823  * @param client information about the client. The IP address in human-readable
1824  * format will be written to a new char* buffer, the address of which will be
1825  * stored in client->clientname.
1826  * @return: 0 - OK, -1 - failed.
1827  **/
1828 int set_peername(int net, CLIENT *client) {
1829         struct sockaddr_storage netaddr;
1830         struct sockaddr* addr = (struct sockaddr*)&netaddr;
1831         socklen_t addrinlen = sizeof( struct sockaddr_storage );
1832         struct addrinfo hints;
1833         struct addrinfo *ai = NULL;
1834         char peername[NI_MAXHOST];
1835         char netname[NI_MAXHOST];
1836         char *tmp = NULL;
1837         int i;
1838         int e;
1839
1840         if (getsockname(net, addr, &addrinlen) < 0) {
1841                 msg(LOG_INFO, "getsockname failed: %m");
1842                 return -1;
1843         }
1844
1845         if(netaddr.ss_family == AF_UNIX) {
1846                 client->clientaddr.ss_family = AF_UNIX;
1847                 strcpy(peername, "unix");
1848         } else {
1849                 if (getpeername(net, (struct sockaddr *) &(client->clientaddr), &addrinlen) < 0) {
1850                         msg(LOG_INFO, "getpeername failed: %m");
1851                         return -1;
1852                 }
1853                 if((e = getnameinfo((struct sockaddr *)&(client->clientaddr), addrinlen,
1854                                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST))) {
1855                         msg(LOG_INFO, "getnameinfo failed: %s", gai_strerror(e));
1856                         return -1;
1857                 }
1858
1859                 memset(&hints, '\0', sizeof (hints));
1860                 hints.ai_flags = AI_ADDRCONFIG;
1861                 e = getaddrinfo(peername, NULL, &hints, &ai);
1862
1863                 if(e != 0) {
1864                         msg(LOG_INFO, "getaddrinfo failed: %s", gai_strerror(e));
1865                         freeaddrinfo(ai);
1866                         return -1;
1867                 }
1868         }
1869
1870         if(strncmp(peername, "::ffff:", 7) == 0) {
1871                 memmove(peername, peername+7, strlen(peername));
1872         }
1873
1874         switch(client->server->virtstyle) {
1875                 case VIRT_NONE:
1876                         msg(LOG_DEBUG, "virtualization is off");
1877                         client->exportname=g_strdup(client->server->exportname);
1878                         break;
1879                 case VIRT_IPHASH:
1880                         msg(LOG_DEBUG, "virtstyle iphash");
1881                         for(i=0;i<strlen(peername);i++) {
1882                                 if(peername[i]=='.') {
1883                                         peername[i]='/';
1884                                 }
1885                         }
1886                         break;
1887                 case VIRT_IPLIT:
1888                         msg(LOG_DEBUG, "virtstyle ipliteral");
1889                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1890                         break;
1891                 case VIRT_CIDR:
1892                         msg(LOG_DEBUG, "virtstyle cidr %d", client->server->cidrlen);
1893                         memcpy(&netaddr, &(client->clientaddr), addrinlen);
1894                         int addrbits;
1895                         if(client->clientaddr.ss_family == AF_UNIX) {
1896                                 tmp = g_strdup(peername);
1897                         } else {
1898                                 assert((ai->ai_family == AF_INET) || (ai->ai_family == AF_INET6));
1899                                 if(ai->ai_family == AF_INET) {
1900                                         addrbits = 32;
1901                                 } else if(ai->ai_family == AF_INET6) {
1902                                         addrbits = 128;
1903                                 } else {
1904                                         g_assert_not_reached();
1905                                 }
1906                                 uint8_t* addrptr = (uint8_t*)(((struct sockaddr*)&netaddr)->sa_data);
1907                                 for(int i = 0; i < addrbits; i+=8) {
1908                                         int masklen = client->server->cidrlen - i;
1909                                         masklen = masklen > 0 ? masklen : 0;
1910                                         uint8_t mask = getmaskbyte(masklen);
1911                                         *addrptr &= mask;
1912                                         addrptr++;
1913                                 }
1914                                 getnameinfo((struct sockaddr *) &netaddr, addrinlen,
1915                                                                 netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1916                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1917                         }
1918
1919                         if(tmp != NULL) {
1920                                 client->exportname=g_strdup_printf(client->server->exportname, tmp);
1921                                 g_free(tmp);
1922                         }
1923
1924                         break;
1925         }
1926
1927         if(ai) {
1928                 freeaddrinfo(ai);
1929         }
1930         msg(LOG_INFO, "connect from %s, assigned file is %s",
1931             peername, client->exportname);
1932         client->clientname=g_strdup(peername);
1933         return 0;
1934 }
1935
1936 int commit_diff(CLIENT* client, bool lock, int fhandle){
1937         int dirtycount = 0;
1938         int pagecount = client->exportsize/DIFFPAGESIZE;
1939         off_t offset;
1940         char* buf = malloc(sizeof(char)*DIFFPAGESIZE);
1941
1942         for (int i=0; i<pagecount; i++){
1943                 offset = DIFFPAGESIZE*i;
1944                 if (lock)
1945                         pthread_rwlock_wrlock(&client->export_lock);
1946                 if (client->difmap[i] != (u32)-1){
1947                         dirtycount += 1;
1948                         DEBUG("flushing dirty page %d, offset %ld\n", i, offset);
1949                         if (pread(client->difffile, buf, DIFFPAGESIZE, client->difmap[i]*DIFFPAGESIZE) != DIFFPAGESIZE) {
1950                                 msg(LOG_WARNING, "could not read while committing diff: %m");
1951                                 if(lock) {
1952                                         pthread_rwlock_unlock(&client->export_lock);
1953                                 }
1954                                 break;
1955                         }
1956                         if (pwrite(fhandle, buf, DIFFPAGESIZE, offset) != DIFFPAGESIZE) {
1957                                 msg(LOG_WARNING, "could not write while committing diff: %m");
1958                                 if (lock) {
1959                                         pthread_rwlock_unlock(&client->export_lock);
1960                                 }
1961                                 break;
1962                         }
1963                         client->difmap[i] = (u32)-1;
1964                 }
1965                 if (lock)
1966                         pthread_rwlock_unlock(&client->export_lock);
1967         }
1968
1969         free(buf);
1970         return dirtycount;
1971 }
1972
1973 void* wait_file(void *void_ptr) {
1974         CLIENT* client = (CLIENT *)void_ptr;
1975         FILE_INFO fi;
1976         GArray* export;
1977         mode_t mode = O_RDWR;
1978         int dirtycount;
1979
1980         fi.fhandle = -1;
1981         fi.startoff = 0;
1982
1983         while (fi.fhandle < 1){
1984                 sem_wait(&file_wait_sem);
1985                 msg(LOG_INFO, "checking for file %s", client->server->exportname);
1986                 fi.fhandle = open(client->server->exportname, mode);
1987         }
1988
1989         msg(LOG_INFO, "File %s appeared, fd %d", client->server->exportname, fi.fhandle);
1990
1991         // first time there may be lot of data so we lock only per page
1992         do {
1993                 dirtycount = commit_diff(client, true, fi.fhandle);
1994         } while (dirtycount > 0);
1995
1996         //last time we lock export for the whole time until we switch write destination
1997         pthread_rwlock_wrlock(&client->export_lock);
1998         do {
1999                 dirtycount = commit_diff(client, false, fi.fhandle);
2000         } while (dirtycount > 0);
2001
2002         export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
2003         g_array_append_val(export, fi);
2004
2005         client->export = export;
2006         pthread_rwlock_unlock(&client->export_lock);
2007         msg(LOG_INFO, "Waiting for file ended, switching to exported file %s", client->server->exportname);
2008
2009         return NULL;
2010 }
2011
2012 /**
2013  * Set up client export array, which is an array of FILE_INFO.
2014  * Also, split a single exportfile into multiple ones, if that was asked.
2015  * @param client information on the client which we want to setup export for
2016  **/
2017 bool setupexport(CLIENT* client) {
2018         int i = 0;
2019         off_t laststartoff = 0, lastsize = 0;
2020         int multifile = (client->server->flags & F_MULTIFILE);
2021         int treefile = (client->server->flags & F_TREEFILES);
2022         int temporary = (client->server->flags & F_TEMPORARY) && !multifile;
2023         int cancreate = (client->server->expected_size) && !multifile;
2024
2025         if (treefile || (client->server->flags & F_WAIT)) {
2026                 client->export = NULL; // this could be thousands of files so we open handles on demand although its slower
2027                 client->exportsize = client->server->expected_size; // available space is not checked, as it could change during runtime anyway
2028
2029                 if(client->server->flags & F_WAIT){
2030                         pthread_t wait_file_thread;
2031                         if (pthread_create(&wait_file_thread, NULL, wait_file, client)){
2032                                 DEBUG("failed to create wait_file thread");
2033                                 return false;
2034                         }
2035                 }
2036
2037         } else {
2038                 client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
2039
2040                 /* If multi-file, open as many files as we can.
2041                  * If not, open exactly one file.
2042                  * Calculate file sizes as we go to get total size. */
2043                 for(i=0; ; i++) {
2044                         FILE_INFO fi;
2045                         _cleanup_g_free_ gchar *tmpname = NULL;
2046                         _cleanup_g_free_ gchar* error_string = NULL;
2047
2048                         if (i)
2049                                 cancreate = 0;
2050                         /* if expected_size is specified, and this is the first file, we can create the file */
2051                         mode_t mode = (client->server->flags & F_READONLY) ?
2052                           O_RDONLY : (O_RDWR | (cancreate?O_CREAT:0));
2053
2054                         if (temporary) {
2055                                 tmpname=g_strdup_printf("%s.%d-XXXXXX", client->exportname, i);
2056                                 DEBUG( "Opening %s\n", tmpname );
2057                                 fi.fhandle = mkstemp(tmpname);
2058                         } else {
2059                                 if(multifile) {
2060                                         tmpname=g_strdup_printf("%s.%d", client->exportname, i);
2061                                 } else {
2062                                         tmpname=g_strdup(client->exportname);
2063                                 }
2064                                 DEBUG( "Opening %s\n", tmpname );
2065                                 fi.fhandle = open(tmpname, mode, 0600);
2066                                 if(fi.fhandle == -1 && mode == O_RDWR) {
2067                                         /* Try again because maybe media was read-only */
2068                                         fi.fhandle = open(tmpname, O_RDONLY);
2069                                         if(fi.fhandle != -1) {
2070                                                 /* Opening the base file in copyonwrite mode is
2071                                                  * okay */
2072                                                 if(!(client->server->flags & F_COPYONWRITE)) {
2073                                                         client->server->flags |= F_AUTOREADONLY;
2074                                                         client->server->flags |= F_READONLY;
2075                                                 }
2076                                         }
2077                                 }
2078                         }
2079                         if(fi.fhandle == -1) {
2080                                 if(multifile && i>0)
2081                                         break;
2082                                 error_string=g_strdup_printf(
2083                                         "Could not open exported file %s: %%m",
2084                                         tmpname);
2085                                 err_nonfatal(error_string);
2086                                 return false;
2087                         }
2088
2089                         if (temporary) {
2090                                 unlink(tmpname); /* File will stick around whilst FD open */
2091                         }
2092
2093                         fi.startoff = laststartoff + lastsize;
2094                         g_array_append_val(client->export, fi);
2095
2096                         /* Starting offset and size of this file will be used to
2097                          * calculate starting offset of next file */
2098                         laststartoff = fi.startoff;
2099                         lastsize = size_autodetect(fi.fhandle);
2100
2101                         /* If we created the file, it will be length zero */
2102                         if (!lastsize && cancreate) {
2103                                 assert(!multifile);
2104                                 if(ftruncate (fi.fhandle, client->server->expected_size)<0) {
2105                                         err_nonfatal("Could not expand file: %m");
2106                                         return false;
2107                                 }
2108                                 lastsize = client->server->expected_size;
2109                                 break; /* don't look for any more files */
2110                         }
2111
2112                         if(!multifile || temporary)
2113                                 break;
2114                 }
2115
2116                 /* Set export size to total calculated size */
2117                 client->exportsize = laststartoff + lastsize;
2118
2119                 /* Export size may be overridden */
2120                 if(client->server->expected_size) {
2121                         /* desired size must be <= total calculated size */
2122                         if(client->server->expected_size > client->exportsize) {
2123                                 err_nonfatal("Size of exported file is too big\n");
2124                                 return false;
2125                         }
2126
2127                         client->exportsize = client->server->expected_size;
2128                 }
2129         }
2130
2131         msg(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
2132         if(multifile) {
2133                 msg(LOG_INFO, "Total number of files: %d", i);
2134         }
2135         if(treefile) {
2136                 msg(LOG_INFO, "Total number of (potential) files: %" PRId64, (client->exportsize+TREEPAGESIZE-1)/TREEPAGESIZE);
2137         }
2138         return true;
2139 }
2140
2141 bool copyonwrite_prepare(CLIENT* client) {
2142         off_t i;
2143         _cleanup_g_free_ gchar* dir = NULL;
2144         _cleanup_g_free_ gchar* export_base = NULL;
2145         if (client->server->cowdir != NULL) {
2146                 dir = g_strdup(client->server->cowdir);
2147         } else {
2148                 dir = g_strdup(dirname(client->exportname));
2149         }
2150         export_base = g_strdup(basename(client->exportname));
2151         client->difffilename = g_strdup_printf("%s/%s-%s-%d.diff",dir,export_base,client->clientname,
2152                 (int)getpid());
2153         msg(LOG_INFO, "About to create map and diff file %s", client->difffilename) ;
2154         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
2155         if (client->difffile<0) {
2156                 err("Could not create diff file (%m)");
2157                 return false;
2158         }
2159         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL) {
2160                 err("Could not allocate memory");
2161                 return false;
2162         }
2163         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1;
2164
2165         return true;
2166 }
2167
2168 void send_export_info(CLIENT* client, SERVER* server, bool maybe_zeroes) {
2169         uint64_t size_host = htonll((u64)(client->exportsize));
2170         uint16_t flags = NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_WRITE_ZEROES;
2171
2172         socket_write(client, &size_host, 8);
2173         if (server->flags & F_READONLY)
2174                 flags |= NBD_FLAG_READ_ONLY;
2175         if (server->flags & F_FLUSH)
2176                 flags |= NBD_FLAG_SEND_FLUSH;
2177         if (server->flags & F_FUA)
2178                 flags |= NBD_FLAG_SEND_FUA;
2179         if (server->flags & F_ROTATIONAL)
2180                 flags |= NBD_FLAG_ROTATIONAL;
2181         if (server->flags & F_TRIM)
2182                 flags |= NBD_FLAG_SEND_TRIM;
2183         if (!(server->flags & F_COPYONWRITE))
2184                 flags |= NBD_FLAG_CAN_MULTI_CONN;
2185         if (client->clientflags & F_STRUCTURED)
2186                 flags |= NBD_FLAG_SEND_DF;
2187         flags = htons(flags);
2188         socket_write(client, &flags, sizeof(flags));
2189         if (!(glob_flags & F_NO_ZEROES) && maybe_zeroes) {
2190                 char zeros[128];
2191                 memset(zeros, '\0', sizeof(zeros));
2192                 socket_write(client, zeros, 124);
2193         }
2194 }
2195
2196 /**
2197   * Setup the transaction log
2198   *
2199   * The function does all things required for the transaction log:
2200   * - Create a new log file.
2201   * - allocate the posix semaphore for synchronization.
2202   * - Report if a log file already exists.
2203   * - If needed add a header to the log.
2204   *
2205   * If something goes wrong, logging is disabled.
2206   *
2207   * @param client the CLIENT structure with .server and .net members set
2208   * up correctly
2209   */
2210 static void setup_transactionlog(CLIENT *client) {
2211         struct stat fdinfo;
2212         int ret;
2213
2214         /* 1) create the file */
2215         if((client->transactionlogfd =
2216                                 open(client->server->transactionlog,
2217                                         O_WRONLY | O_CREAT,
2218                                         S_IRUSR | S_IWUSR)) ==
2219                         -1) {
2220                 msg(LOG_INFO, "Could not open transactionlog %s, moving on without it",
2221                                 client->server->transactionlog);
2222                 return;
2223         }
2224
2225         /* 2) If needed, write flags */
2226         if (client->server->flags & F_DATALOG) {
2227                 struct nbd_request req;
2228                 int ret;
2229
2230                 req.magic = htonl(NBD_TRACELOG_MAGIC);
2231                 req.type = htonl(NBD_TRACELOG_SET_DATALOG);
2232                 req.cookie = 0;
2233                 req.from = htonll(NBD_TRACELOG_FROM_MAGIC);
2234                 req.len = htonl(TRUE);
2235
2236                 ret = writeit(client->transactionlogfd, &req, sizeof(struct nbd_request));
2237                 if (ret < 0) {
2238                         msg(LOG_INFO, "Could not write to transactionlog %s, moving on without it",
2239                                 client->server->transactionlog);
2240                         close(client->transactionlogfd);
2241                         client->transactionlogfd = -1;
2242                         return;
2243                 }
2244         }
2245
2246         /* 3) Allocate the semaphore used for locking */
2247         ret = fstat(client->transactionlogfd, &fdinfo);
2248         if (ret == -1) {
2249                 msg(LOG_INFO, "Could not stat transactionlog %s, moving on without it",
2250                         client->server->transactionlog);
2251                 close(client->transactionlogfd);
2252                 client->transactionlogfd = -1;
2253                 return;
2254         }
2255         snprintf(client->semname, sizeof(client->semname), "/nbd-server-%llx-%llx",
2256                                 (unsigned long long)fdinfo.st_dev,
2257                                 (unsigned long long)fdinfo.st_ino);
2258         client->logsem = sem_open(client->semname, O_CREAT, 0600, 1);
2259         if (client->logsem == SEM_FAILED) {
2260                 msg(LOG_INFO, "Could not allocate semaphore for transactionlog %s, moving on without it",
2261                         client->server->transactionlog);
2262                 close(client->transactionlogfd);
2263                 client->transactionlogfd = -1;
2264         }
2265 }
2266
2267 /**
2268   * Commit to exporting the chosen export
2269   *
2270   * When a client sends NBD_OPT_EXPORT_NAME or NBD_OPT_GO, we need to do
2271   * a number of things (verify whether the client is allowed access, try
2272   * to open files, etc etc) before we're ready to actually serve the
2273   * export.
2274   *
2275   * This function does all those things.
2276   *
2277   * @param client the CLIENT structure with .server and .net members set
2278   * up correctly
2279   * @return true if the client is allowed access to the export, false
2280   * otherwise
2281   */
2282 static bool commit_client(CLIENT* client, SERVER* server) {
2283         char acl;
2284         uint32_t len;
2285
2286         client->server = serve_inc_ref(server);
2287         client->exportsize = OFFT_MAX;
2288         client->transactionlogfd = -1;
2289         if(pthread_mutex_init(&(client->lock), NULL)) {
2290                 msg(LOG_ERR, "Unable to initialize mutex");
2291                 return false;
2292         }
2293         if (pthread_rwlock_init(&client->export_lock, NULL)){
2294                 msg(LOG_ERR, "Unable to initialize write lock");
2295                 return false;
2296         }
2297         /* Check whether we exceeded the maximum number of allowed
2298          * clients already */
2299         if(dontfork) {
2300                 acl = 'Y';
2301         } else {
2302                 len = strlen(client->server->servename);
2303                 writeit(commsocket, &len, sizeof len);
2304                 writeit(commsocket, client->server->servename, len);
2305                 readit(commsocket, &acl, 1);
2306                 close(commsocket);
2307         }
2308         switch(acl) {
2309                 case 'N':
2310                         msg(LOG_ERR, "Connection not allowed (too many clients)");
2311                         return false;
2312                 case 'X':
2313                         msg(LOG_ERR, "Connection not allowed (unknown by parent?!?)");
2314                         return false;
2315         }
2316
2317         /* Check whether the client is listed in the authfile */
2318         if (set_peername(client->net, client)) {
2319                 msg(LOG_ERR, "Failed to set peername");
2320                 return false;
2321         }
2322
2323         if (!authorized_client(client)) {
2324                 msg(LOG_INFO, "Client '%s' is not authorized to access",
2325                     client->clientname);
2326                 return false;
2327         }
2328
2329         /* Set up the transactionlog, if we need one */
2330         if (client->server->transactionlog && (client->transactionlogfd == -1))
2331                 setup_transactionlog(client);
2332
2333         /* Run any pre scripts that we may need */
2334         if (do_run(client->server->prerun, client->exportname)) {
2335                 msg(LOG_INFO, "Client '%s' not allowed access by prerun script",
2336                                 client->clientname);
2337                 return false;
2338         }
2339         client->socket_closed = socket_closed_transmission;
2340         if(!setupexport(client)) {
2341                 return false;
2342         }
2343
2344         if (client->server->flags & F_COPYONWRITE) {
2345                 if(!copyonwrite_prepare(client)) {
2346                         return false;
2347                 }
2348         }
2349
2350         if (client->server->flags & F_WAIT) {
2351                 if(!copyonwrite_prepare(client)) {
2352                         return false;
2353                 }
2354         }
2355
2356         setmysockopt(client->net);
2357
2358         return true;
2359 }
2360
2361 static CLIENT* handle_export_name(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2362         uint32_t namelen;
2363         char* name;
2364         int i;
2365
2366         socket_read(client, &namelen, sizeof(namelen));
2367         namelen = ntohl(namelen);
2368         if(namelen > 4096) {
2369                 return NULL;
2370         }
2371         if(namelen > 0) {
2372                 name = malloc(namelen+1);
2373                 name[namelen]=0;
2374                 socket_read(client, name, namelen);
2375         } else {
2376                 name = strdup("");
2377         }
2378         for(i=0; i<servers->len; i++) {
2379                 SERVER* serve = (g_array_index(servers, SERVER*, i));
2380                 // hide exports that are TLS-only if we haven't negotiated TLS
2381                 // yet
2382                 if ((serve->flags & F_FORCEDTLS) && !client->tls_session) {
2383                         continue;
2384                 }
2385                 if(!strcmp(serve->servename, name)) {
2386                         client->clientfeats = cflags;
2387                         free(name);
2388                         if(!commit_client(client, serve)) {
2389                                 return NULL;
2390                         }
2391                         send_export_info(client, serve, true);
2392                         return client;
2393                 }
2394         }
2395         free(name);
2396         err("Negotiation failed/8a: Requested export not found, or is TLS-only and client did not negotiate TLS");
2397 }
2398
2399 static void handle_list(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2400         uint32_t len;
2401         int i;
2402         char buf[1024];
2403         char *ptr = buf + sizeof(len);
2404
2405         socket_read(client, &len, sizeof(len));
2406         len = ntohl(len);
2407         if(len) {
2408                 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_LIST with nonzero data length is not a valid request");
2409         }
2410         if(!(glob_flags & F_LIST)) {
2411                 send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "Listing of exports denied by server configuration");
2412                 err_nonfatal("Client tried disallowed list option");
2413                 return;
2414         }
2415         for(i=0; i<servers->len; i++) {
2416                 SERVER* serve = (g_array_index(servers, SERVER*, i));
2417                 // Hide TLS-only exports if we haven't negotiated TLS yet
2418                 if(!client->tls_session && (serve->flags & F_FORCEDTLS)) {
2419                         continue;
2420                 }
2421                 len = htonl(strlen(serve->servename));
2422                 memcpy(buf, &len, sizeof(len));
2423                 strncpy(ptr, serve->servename, sizeof(buf) - sizeof(len));
2424                 send_reply(client, opt, NBD_REP_SERVER, strlen(serve->servename)+sizeof(len), buf);
2425         }
2426         send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2427 }
2428
2429 #if HAVE_GNUTLS
2430 static int verify_cert(gnutls_session_t session) {
2431         int ret;
2432         unsigned int status, cert_list_size;
2433         const gnutls_datum_t *cert_list;
2434         gnutls_x509_crt_t cert;
2435         time_t now = time(NULL);
2436
2437         ret = gnutls_certificate_verify_peers2(session, &status);
2438         if(ret < 0 || status != 0 || gnutls_certificate_type_get(session) !=
2439                         GNUTLS_CRT_X509) {
2440                 goto err;
2441         }
2442
2443         if(gnutls_x509_crt_init(&cert) < 0) {
2444                 goto err;
2445         }
2446
2447         cert_list = gnutls_certificate_get_peers(session, &cert_list_size);
2448         if(cert_list == NULL) {
2449                 goto err;
2450         }
2451         if(gnutls_x509_crt_import(cert, &cert_list[0], GNUTLS_X509_FMT_DER) < 0) {
2452                 goto err;
2453         }
2454         if(gnutls_x509_crt_get_activation_time(cert) > now) {
2455                 goto err;
2456         }
2457         if(gnutls_x509_crt_get_expiration_time(cert) < now) {
2458                 goto err;
2459         }
2460         // TODO: check CRLs and/or OCSP etc. Patches welcome.
2461         msg(LOG_INFO, "client certificate verification successful");
2462         return 0;
2463 err:
2464         msg(LOG_ERR, "E: client certificate verification failed");
2465         return GNUTLS_E_CERTIFICATE_ERROR;
2466 }
2467
2468 CLIENT* handle_starttls(CLIENT* client, int opt, GArray* servers, uint32_t cflags, struct generic_conf *genconf) {
2469 #define check_rv(c) if((c)<0) { retval = NULL; goto exit; }
2470         gnutls_certificate_credentials_t x509_cred;
2471         CLIENT* retval = client;
2472         gnutls_priority_t priority_cache;
2473         gnutls_session_t *session = g_new0(gnutls_session_t, 1);
2474         int ret;
2475         int len;
2476
2477         socket_read(client, &len, sizeof(len));
2478         if(G_UNLIKELY(len != 0)) {
2479                 char buf[1024*1024];
2480                 consume(client, len, buf, sizeof(buf));
2481                 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "Sending a STARTTLS command with data is invalid");
2482                 return NULL;
2483         }
2484
2485         send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2486
2487         check_rv(gnutls_certificate_allocate_credentials(&x509_cred));
2488         gnutls_certificate_set_verify_function(x509_cred, verify_cert);
2489         check_rv(gnutls_certificate_set_x509_trust_file(x509_cred, genconf->cacertfile, GNUTLS_X509_FMT_PEM));
2490         check_rv(gnutls_certificate_set_x509_key_file(x509_cred, genconf->certfile, genconf->keyfile, GNUTLS_X509_FMT_PEM));
2491         check_rv(gnutls_priority_init(&priority_cache, genconf->tlsprio, NULL));
2492         check_rv(gnutls_init(session, GNUTLS_SERVER));
2493         check_rv(gnutls_priority_set(*session, priority_cache));
2494         check_rv(gnutls_credentials_set(*session, GNUTLS_CRD_CERTIFICATE, x509_cred));
2495
2496         gnutls_certificate_server_set_request(*session, GNUTLS_CERT_REQUEST);
2497 #if GNUTLS_VERSION_NUMBER >= 0x030109
2498         gnutls_transport_set_int(*session, client->net);
2499 #else
2500         gnutls_transport_set_ptr(*session, (gnutls_transport_ptr_t) (intptr_t) client->net);
2501 #endif
2502         do {
2503                 ret = gnutls_handshake(*session);
2504         } while(ret < 0 && gnutls_error_is_fatal(ret) == 0);
2505
2506         if (ret < 0) {
2507                 err_nonfatal(gnutls_strerror(ret));
2508                 gnutls_bye(*session, GNUTLS_SHUT_RDWR);
2509                 gnutls_deinit(*session);
2510                 g_free(session);
2511                 return NULL;
2512         }
2513         client->tls_session = session;
2514         client->socket_read = socket_read_tls;
2515         client->socket_write = socket_write_tls;
2516 #undef check_rv
2517 exit:
2518         if(retval == NULL && session != NULL) {
2519                 g_free(session);
2520         }
2521         /* export names cannot be chosen before NBD_OPT_STARTTLS and be retained */
2522         if(retval != NULL && retval->server != NULL) {
2523                 retval->server = NULL;
2524         }
2525         return retval;
2526 }
2527 #endif
2528
2529 /**
2530   * Handle an NBD_OPT_STRUCTURED_REPLY message
2531   */
2532 static void handle_structured_reply(CLIENT *client, uint32_t opt, GArray *servers, uint32_t cflags) {
2533         uint32_t len;
2534         int i;
2535
2536         socket_read(client, &len, sizeof(len));
2537         len = ntohl(len);
2538         if(len) {
2539                 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_STRUCTURED_REPLY with nonzero data length is not a valid request");
2540                 char buf[1024];
2541                 consume(client, len, buf, sizeof buf);
2542                 return;
2543         }
2544         if(client->clientflags & F_STRUCTURED) {
2545                 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "NBD_OPT_STRUCTURED_REPLY has already been called");
2546                 return;
2547         }
2548         client->clientflags |= F_STRUCTURED;
2549         send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2550 }
2551
2552 /**
2553   * Handle an NBD_OPT_INFO or NBD_OPT_GO request.
2554   */
2555 static bool handle_info(CLIENT* client, uint32_t opt, GArray* servers, uint32_t cflags) {
2556         uint32_t namelen, len;
2557         char *name;
2558         int i;
2559         SERVER *server = NULL;
2560         uint16_t n_requests;
2561         uint16_t request;
2562         char buf[1024];
2563         bool sent_export = false;
2564         uint32_t reptype = NBD_REP_ERR_UNKNOWN;
2565         char *msg = "Export unknown";
2566
2567         socket_read(client, &len, sizeof(len));
2568         len = htonl(len);
2569         socket_read(client, &namelen, sizeof(namelen));
2570         namelen = htonl(namelen);
2571         if(namelen > (len - 6)) {
2572                 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "An OPT_INFO request cannot be smaller than the length of the name + 6");
2573                 consume(client, len - sizeof(namelen), buf, sizeof(buf));
2574         }
2575         if(namelen > 4096) {
2576                 send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "The name for this OPT_INFO request is too long");
2577                 consume(client, namelen, buf, sizeof(buf));
2578         }
2579         if(namelen > 0) {
2580                 name = malloc(namelen + 1);
2581                 if (!name) {
2582                         send_reply(client, opt, reptype, -1, "nbd server out of memory");
2583                         return false;
2584                 }
2585                 name[namelen] = 0;
2586                 socket_read(client, name, namelen);
2587         } else {
2588                 name = strdup("");
2589         }
2590         for(i=0; i<servers->len; i++) {
2591                 SERVER *serve = (g_array_index(servers, SERVER*, i));
2592                 if (!strcmp(serve->servename, name)) {
2593                         if ((serve->flags & F_FORCEDTLS) && !client->tls_session) {
2594                                 reptype = NBD_REP_ERR_TLS_REQD;
2595                                 msg = "TLS is required for that export";
2596                                 continue;
2597                         }
2598                         server = serve;
2599                 }
2600         }
2601         free(name);
2602         socket_read(client, &n_requests, sizeof(n_requests));
2603         n_requests = ntohs(n_requests);
2604         if(!server) {
2605                 consume(client, n_requests * sizeof(request), buf,
2606                                 sizeof(buf));
2607                 send_reply(client, opt, reptype, -1, msg);
2608                 return false;
2609         }
2610         if (opt == NBD_OPT_GO) {
2611                 client->clientfeats = cflags;
2612                 if(!commit_client(client, server)) {
2613                         send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "Access denied by server configuration");
2614                         return false;
2615                 }
2616         }
2617         for(i=0; i<n_requests; i++) {
2618                 socket_read(client, &request, sizeof(request));
2619                 switch(ntohs(request)) {
2620                         case NBD_INFO_EXPORT:
2621                                 send_reply(client, opt, NBD_REP_INFO, 12, NULL);
2622                                 socket_write(client, &request, 2);
2623                                 send_export_info(client, server, false);
2624                                 sent_export = true;
2625                                 break;
2626                         default:
2627                                 // ignore all other options for now.
2628                                 break;
2629                 }
2630         }
2631         if(!sent_export) {
2632                 request = htons(NBD_INFO_EXPORT);
2633                 send_reply(client, opt, NBD_REP_INFO, 12, NULL);
2634                 socket_write(client, &request, 2);
2635                 send_export_info(client, server, false);
2636         }
2637         send_reply(client, opt, NBD_REP_ACK, 0, NULL);
2638
2639         return true;
2640 }
2641
/**
 * Do the initial negotiation.
 *
 * Sends the server greeting (INIT_PASSWD, the options magic and the
 * handshake flags), reads the client's flags, and then processes options
 * one at a time until an export has been selected (NBD_OPT_EXPORT_NAME or
 * NBD_OPT_GO), the client aborts, or an error occurs.
 *
 * @param net The socket we're doing the negotiation over.
 * @param servers The array of known servers.
 * @param genconf the global options (needed for accessing TLS config data)
 * @return a newly allocated CLIENT committed to an export on success, or
 * NULL when negotiation fails or the client aborts (the CLIENT allocated
 * here is freed in that case)
 **/
CLIENT* negotiate(int net, GArray* servers, struct generic_conf *genconf) {
        uint16_t smallflags = NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES;
        uint64_t magic;
        uint32_t cflags = 0;
        uint32_t opt;
        /* Start without TLS: plain socket callbacks, no transaction log and
         * no semaphore yet. */
        CLIENT* client = g_new0(CLIENT, 1);
        client->net = net;
        client->socket_read = socket_read_notls;
        client->socket_write = socket_write_notls;
        client->socket_closed = socket_closed_negotiate;
        client->transactionlogfd = -1;
        client->logsem = SEM_FAILED;

        assert(servers != NULL);
        /* Server greeting: password, options magic, handshake flags */
        socket_write(client, INIT_PASSWD, 8);
        magic = htonll(opts_magic);
        socket_write(client, &magic, sizeof(magic));

        smallflags = htons(smallflags);
        socket_write(client, &smallflags, sizeof(uint16_t));
        /* Client flags; htonl() is used here to convert from network order */
        socket_read(client, &cflags, sizeof(cflags));
        cflags = htonl(cflags);
        if (cflags & NBD_FLAG_C_NO_ZEROES) {
                glob_flags |= F_NO_ZEROES;
        }
        /* Option loop: each iteration reads one option header (magic and
         * option code); the handlers read the option's length and payload. */
        do {
                socket_read(client, &magic, sizeof(magic));
                magic = ntohll(magic);
                if(magic != opts_magic) {
                        err_nonfatal("Negotiation failed/5a: magic mismatch");
                        goto handler_err;
                }
                socket_read(client, &opt, sizeof(opt));
                opt = ntohl(opt);
                /* When TLS is mandatory server-wide, nothing but STARTTLS
                 * (and ABORT) is accepted until a TLS session exists. */
                if(client->tls_session == NULL
                                && glob_flags & F_FORCEDTLS
                                && opt != NBD_OPT_STARTTLS) {
                        if(opt == NBD_OPT_EXPORT_NAME) {
                                // can't send an error message for EXPORT_NAME,
                                // so must do hard close
                                goto handler_err;
                        }
                        if(opt == NBD_OPT_ABORT) {
                                // handled below
                                break;
                        }
                        consume_len(client);
                        send_reply(client, opt, NBD_REP_ERR_TLS_REQD, -1, "TLS is required on this server");
                        continue;
                }
                switch(opt) {
                case NBD_OPT_EXPORT_NAME:
                        // NBD_OPT_EXPORT_NAME must be the last
                        // selected option, so return from here
                        // if that is chosen.
                        if(handle_export_name(client, opt, servers, cflags) != NULL) {
                                return client;
                        } else {
                                goto handler_err;
                        }
                        break;
                case NBD_OPT_LIST:
                        handle_list(client, opt, servers, cflags);
                        break;
                case NBD_OPT_ABORT:
                        // handled below
                        break;
                case NBD_OPT_STARTTLS:
#if !HAVE_GNUTLS
                        consume_len(client);
                        send_reply(client, opt, NBD_REP_ERR_PLATFORM, -1, "This nbd-server was compiled without TLS support");
#else
                        if(client->tls_session != NULL) {
                                consume_len(client);
                                send_reply(client, opt, NBD_REP_ERR_INVALID, -1, "Invalid STARTTLS request: TLS has already been negotiated!");
                                continue;
                        }
                        /* No key file configured means TLS is not offered */
                        if(genconf->keyfile == NULL) {
                                consume_len(client);
                                send_reply(client, opt, NBD_REP_ERR_POLICY, -1, "TLS not allowed on this server");
                                continue;
                        }
                        if(handle_starttls(client, opt, servers, cflags, genconf) == NULL) {
                                // can't recover from failed TLS negotiation.
                                goto handler_err;
                        }
                        // once TLS has been negotiated, any state must be cleared
                        client->clientflags = 0;
#endif
                        break;
                case NBD_OPT_GO:
                case NBD_OPT_INFO:
                        /* Only a successful GO ends negotiation; INFO, or a
                         * failed GO, returns to the option loop. */
                        if(handle_info(client, opt, servers, cflags) && opt == NBD_OPT_GO) {
                                return client;
                        }
                        break;
                case NBD_OPT_STRUCTURED_REPLY:
                        handle_structured_reply(client, opt, servers, cflags);
                        break;
                default:
                        consume_len(client);
                        send_reply(client, opt, NBD_REP_ERR_UNSUP, -1, "The given option is unknown to this server implementation");
                        break;
                }
        } while((opt != NBD_OPT_EXPORT_NAME) && (opt != NBD_OPT_ABORT));
        if(opt == NBD_OPT_ABORT) {
                err_nonfatal("Session terminated by client");
                goto handler_err;
        }
        err_nonfatal("Weird things happened: reached end of negotiation without success");
handler_err:
        /* Common failure exit: only the CLIENT structure itself is freed here */
        g_free(client);
        return NULL;
}
2763
/**
 * Map a local errno value to the error number sent in NBD replies.
 *
 * EPERM, EIO, ENOMEM and EINVAL map to 1, 5, 12 and 22; EFBIG, ENOSPC and
 * (where defined) EDQUOT all map to 28; anything else maps to 22.
 *
 * @param errcode an errno value
 * @return the mapped error number, converted with htonl()
 */
static int nbd_errno(int errcode) {
        uint32_t wire_code = 22; // EINVAL, also the fallback

        if (errcode == EPERM) {
                wire_code = 1;
        } else if (errcode == EIO) {
                wire_code = 5;
        } else if (errcode == ENOMEM) {
                wire_code = 12;
        } else if (errcode == EFBIG || errcode == ENOSPC) {
                wire_code = 28; // ENOSPC
        }
#ifdef EDQUOT
        else if (errcode == EDQUOT) {
                wire_code = 28; // ENOSPC
        }
#endif

        return htonl(wire_code);
}
2784
2785 static void package_dispose(struct work_package* package) {
2786         if (package->pipefd[0] > 0)
2787                 close(package->pipefd[0]);
2788         if (package->pipefd[1] > 0)
2789                 close(package->pipefd[1]);
2790         g_free(package->data);
2791         g_free(package->req);
2792         g_free(package);
2793 }
2794
2795 static int mkpipe(int pipefd[2], size_t len)
2796 {
2797         if (len > MAX_PIPE_SIZE)
2798                 return -1;
2799         if (pipe(pipefd))
2800                 return -1;
2801
2802 #ifdef HAVE_SPLICE
2803         if (fcntl(pipefd[1], F_SETPIPE_SZ, MAX_PIPE_SIZE) < MAX_PIPE_SIZE) {
2804                 close(pipefd[0]);
2805                 close(pipefd[1]);
2806                 pipefd[0] = -1;
2807                 pipefd[1] = -1;
2808                 return -1;
2809         }
2810 #endif
2811
2812         return 0;
2813 }
2814
2815 struct work_package* package_create(CLIENT* client, struct nbd_request* req) {
2816         struct work_package* rv = calloc(sizeof (struct work_package), 1);
2817
2818         rv->req = req;
2819         rv->client = client;
2820         rv->data = NULL;
2821         rv->pipefd[0] = -1;
2822         rv->pipefd[1] = -1;
2823
2824         if((req->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
2825                 if (client->server->flags & F_SPLICE) {
2826                         if (mkpipe(rv->pipefd, req->len))
2827                                 rv->data = malloc(req->len);
2828                 } else {
2829                         rv->data = malloc(req->len);
2830                 }
2831         }
2832
2833         return rv;
2834 }
2835
#ifdef HAVE_SPLICE
/**
 * Try to answer a read request by splicing the data through a pipe.
 *
 * Nothing is sent to the client unless the export data has been spliced
 * into the pipe successfully, so the caller can fall back to another read
 * path when this returns -1.
 *
 * @param client the client that sent the request
 * @param req the read request
 * @return 0 when the reply was sent, -1 when splice could not be used
 **/
static int handle_splice_read(CLIENT *client, struct nbd_request *req)
{
	struct nbd_reply reply;
	int fds[2];

	/* splice doesn't work with TLS, and it needs a pipe to go through */
	if (client->tls_session != NULL || mkpipe(fds, req->len) != 0) {
		return -1;
	}

	if (expsplice(fds[1], req->from, req->len, client, SPLICE_IN, 0) != 0) {
		close(fds[1]);
		close(fds[0]);
		return -1;
	}

	DEBUG("handling read request (splice)\n");
	setup_reply(&reply, req);
	log_reply(client, &reply);

	/* header and payload go out under the client lock */
	pthread_mutex_lock(&(client->lock));
	writeit(client->net, &reply, sizeof(reply));
	spliceit(fds[0], NULL, client->net, NULL, req->len);
	pthread_mutex_unlock(&(client->lock));

	close(fds[0]);
	close(fds[1]);
	return 0;
}
#endif
2867
2868 static void handle_normal_read(CLIENT *client, struct nbd_request *req)
2869 {
2870         DEBUG("handling read request\n");
2871         char read_failed[] = "Read failed";
2872         _cleanup_g_free_ READ_CTX *ctx = g_new0(READ_CTX, 1);
2873         ctx->req = req;
2874         ctx->current_len = req->len;
2875         uint32_t error = 0;
2876         char *errmsg = NULL;
2877         uint16_t msglen = 0;
2878         if(client->clientflags & F_STRUCTURED) {
2879                 ctx->is_structured = 1;
2880         } else {
2881                 ctx->is_structured = 0;
2882         }
2883         if(req->type & NBD_CMD_FLAG_DF != 0) {
2884                 ctx->df = 1;
2885         }
2886         if(ctx->is_structured && ctx->df && req->len > (1 << 20)) {
2887                 /* standard requires a minimum of 64KiB; we are more generous
2888                  * by allowing up to 1MiB as our largest unfragmented answer */
2889                 const char too_long[] = "Request too long for unfragmented reply";
2890                 struct nbd_structured_error_payload pl;
2891                 pl.error = NBD_EOVERFLOW;
2892                 pl.msglen = sizeof too_long;
2893                 send_structured_chunk_v(client, req, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, 6 + pl.msglen, 2, &pl, sizeof pl, too_long, sizeof too_long);
2894                 return;
2895         }
2896         if(ctx->df || !(ctx->is_structured)) {
2897                 ctx->buf = malloc(req->len);
2898                 if(!(ctx->buf)) {
2899                         err("Could not allocate memory for request");
2900                 }
2901                 ctx->buflen = req->len;
2902         }
2903         if(expread(ctx, client)) {
2904                 DEBUG("Read failed: %m");
2905                 error = nbd_errno(errno);
2906                 errmsg = read_failed;
2907                 msglen = sizeof read_failed;
2908         }
2909         complete_read(client, ctx, error, errmsg, msglen, false, 0);
2910 }
2911
2912 static void handle_read(CLIENT* client, struct nbd_request* req)
2913 {
2914 #ifdef HAVE_SPLICE
2915         /*
2916          * If we have splice set we want to try that first, and if that fails
2917          * for whatever reason we fall through to ye olde read.
2918          */
2919         if (client->server->flags & F_SPLICE)
2920                 if (!handle_splice_read(client, req))
2921                         return;
2922 #endif
2923         handle_normal_read(client, req);
2924 }
2925
2926 static void handle_write(struct work_package *pkg)
2927 {
2928         CLIENT *client = pkg->client;
2929         struct nbd_request *req = pkg->req;
2930         struct nbd_reply rep;
2931         int fua = !!(req->type & NBD_CMD_FLAG_FUA);
2932
2933         DEBUG("handling write request\n");
2934         setup_reply(&rep, req);
2935
2936 #ifdef HAVE_SPLICE
2937         if (!pkg->data) {
2938                 if (expsplice(pkg->pipefd[0], req->from, req->len, client,
2939                               SPLICE_OUT, fua)) {
2940                         DEBUG("Splice failed: %m");
2941                         rep.error = nbd_errno(errno);
2942                 }
2943         } else
2944 #endif
2945         {
2946                 if(expwrite(req->from, pkg->data, req->len, client, fua)) {
2947                         DEBUG("Write failed: %m");
2948                         rep.error = nbd_errno(errno);
2949                 }
2950         }
2951         log_reply(client, &rep);
2952         pthread_mutex_lock(&(client->lock));
2953         socket_write(client, &rep, sizeof rep);
2954         pthread_mutex_unlock(&(client->lock));
2955 }
2956
2957 static void handle_flush(CLIENT* client, struct nbd_request* req) {
2958         struct nbd_reply rep;
2959         DEBUG("handling flush request\n");
2960         setup_reply(&rep, req);
2961         if(expflush(client)) {
2962                 DEBUG("Flush failed: %m");
2963                 rep.error = nbd_errno(errno);
2964         }
2965         log_reply(client, &rep);
2966         pthread_mutex_lock(&(client->lock));
2967         socket_write(client, &rep, sizeof rep);
2968         pthread_mutex_unlock(&(client->lock));
2969 }
2970
2971 static void handle_trim(CLIENT* client, struct nbd_request* req) {
2972         struct nbd_reply rep;
2973         DEBUG("handling trim request\n");
2974         setup_reply(&rep, req);
2975         if(exptrim(req, client)) {
2976                 DEBUG("Trim failed: %m");
2977                 rep.error = nbd_errno(errno);
2978         }
2979         log_reply(client, &rep);
2980         pthread_mutex_lock(&(client->lock));
2981         socket_write(client, &rep, sizeof rep);
2982         pthread_mutex_unlock(&(client->lock));
2983 }
2984
2985 static void handle_write_zeroes(CLIENT* client, struct nbd_request* req) {
2986         struct nbd_reply rep;
2987         DEBUG("handling write_zeroes request\n");
2988         int fua = !!(req->type & NBD_CMD_FLAG_FUA);
2989         setup_reply(&rep, req);
2990         if(expwrite_zeroes(req, client, fua)) {
2991                 DEBUG("Write_zeroes failed: %m");
2992                 rep.error = nbd_errno(errno);
2993         }
2994         // For now, don't trim
2995         // TODO: handle this far more efficiently with reference to the
2996         // actual backing driver
2997         log_reply(client, &rep);
2998         pthread_mutex_lock(&(client->lock));
2999         socket_write(client, &rep, sizeof rep);
3000         pthread_mutex_unlock(&(client->lock));
3001 }
3002
3003
3004 static bool bad_write(CLIENT* client, struct nbd_request* req) {
3005         if ((client->server->flags & F_READONLY) ||
3006             (client->server->flags & F_AUTOREADONLY)) {
3007                 DEBUG("[WRITE to READONLY!]");
3008                 return true;
3009         }
3010         return false;
3011 }
3012
3013 static bool bad_range(CLIENT* client, struct nbd_request* req) {
3014         if(req->from > client->exportsize ||
3015            req->from + req->len > client->exportsize) {
3016                 DEBUG("[out of bounds!]");
3017                 return true;
3018         }
3019         return false;
3020 }
3021
3022 static void handle_request(gpointer data, gpointer user_data) {
3023         struct work_package* package = (struct work_package*) data;
3024         uint32_t type = package->req->type & NBD_CMD_MASK_COMMAND;
3025         uint32_t flags = package->req->type & ~NBD_CMD_MASK_COMMAND;
3026         struct nbd_reply rep;
3027         int err = EINVAL;
3028
3029         if(flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) {
3030                 msg(LOG_ERR, "E: received invalid flag %d on command %d, ignoring", flags, type);
3031                 goto error;
3032         }
3033
3034         switch(type) {
3035                 case NBD_CMD_READ:
3036                         if (bad_range(package->client, package->req)) {
3037                                 goto error;
3038                         }
3039                         handle_read(package->client, package->req);
3040                         break;
3041                 case NBD_CMD_WRITE:
3042                         if (bad_write(package->client, package->req)) {
3043                                 err = EPERM;
3044                                 goto error;
3045                         }
3046                         if (bad_range(package->client, package->req)) {
3047                                 err = ENOSPC;
3048                                 goto error;
3049                         }
3050                         handle_write(package);
3051                         break;
3052                 case NBD_CMD_FLUSH:
3053                         handle_flush(package->client, package->req);
3054                         break;
3055                 case NBD_CMD_TRIM:
3056                         if (bad_write(package->client, package->req)) {
3057                                 err = EPERM;
3058                                 goto error;
3059                         }
3060                         if (bad_range(package->client, package->req)) {
3061                                 goto error;
3062                         }
3063                         handle_trim(package->client, package->req);
3064                         break;
3065                 case NBD_CMD_WRITE_ZEROES:
3066                         if (bad_write(package->client, package->req)) {
3067                                 err = EPERM;
3068                                 goto error;
3069                         }
3070                         if (bad_range(package->client, package->req)) {
3071                                 err = ENOSPC;
3072                                 goto error;
3073                         }
3074                         handle_write_zeroes(package->client, package->req);
3075                         break;
3076                 default:
3077                         msg(LOG_ERR, "E: received unknown command %d of type, ignoring", package->req->type);
3078                         goto error;
3079         }
3080         goto end;
3081 error:
3082         setup_reply(&rep, package->req);
3083         rep.error = nbd_errno(err);
3084         log_reply(package->client, &rep);
3085         pthread_mutex_lock(&(package->client->lock));
3086         socket_write(package->client, &rep, sizeof rep);
3087         pthread_mutex_unlock(&(package->client->lock));
3088 end:
3089         package_dispose(package);
3090 }
3091
3092 static int mainloop_threaded(CLIENT* client) {
3093         struct nbd_request* req;
3094         struct work_package* pkg;
3095         int write_data = false;
3096
3097         DEBUG("Entering request loop\n");
3098         while(1) {
3099                 req = calloc(sizeof (struct nbd_request), 1);
3100
3101                 socket_read(client, req, sizeof(struct nbd_request));
3102
3103                 if(client->transactionlogfd != -1) {
3104                         lock_logsem(client);
3105                         writeit(client->transactionlogfd, req, sizeof(struct nbd_request));
3106                         if(((ntohl(req->type) & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) &&
3107                                         (client->server->flags & F_DATALOG) &&
3108                                         !(client->server->flags & F_SPLICE)) {
3109                                 write_data = true;
3110                         } else {
3111                                 write_data = false;
3112                                 unlock_logsem(client);
3113                         }
3114                 }
3115
3116                 req->from = ntohll(req->from);
3117                 req->type = ntohl(req->type);
3118                 req->len = ntohl(req->len);
3119
3120                 if(req->magic != htonl(NBD_REQUEST_MAGIC))
3121                         err("Protocol error: not enough magic.");
3122
3123                 pkg = package_create(client, req);
3124
3125                 if((req->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
3126 #ifdef HAVE_SPLICE
3127                         if ((client->server->flags & F_SPLICE) &&
3128                             (req->len <= MAX_PIPE_SIZE && pkg->pipefd[1] > 0) &&
3129                             (client->tls_session == NULL))
3130                                 spliceit(client->net, NULL, pkg->pipefd[1],
3131                                          NULL, req->len);
3132                         else
3133 #endif
3134                                 socket_read(client, pkg->data, req->len);
3135
3136                         if (write_data) {
3137                                 writeit(client->transactionlogfd, pkg->data, req->len);
3138                                 unlock_logsem(client);
3139                                 write_data = false;
3140                         }
3141                 }
3142                 if(req->type == NBD_CMD_DISC) {
3143                         finalize_client(client);
3144                         package_dispose(pkg);
3145                         return 0;
3146                 }
3147                 g_thread_pool_push(tpool, pkg, NULL);
3148         }
3149 }
3150
3151 /**
3152  * Destroy a pid_t*
3153  * @param data a pointer to pid_t which should be freed
3154  **/
3155 void destroy_pid_t(gpointer data) {
3156         g_free(data);
3157 }
3158
3159 static pid_t spawn_child(int* socket) {
3160         pid_t pid;
3161         sigset_t newset;
3162         sigset_t oldset;
3163         int sockets[2];
3164
3165         sigemptyset(&newset);
3166         sigaddset(&newset, SIGCHLD);
3167         sigaddset(&newset, SIGTERM);
3168         sigprocmask(SIG_BLOCK, &newset, &oldset);
3169         socketpair(AF_UNIX, SOCK_STREAM, 0, sockets);
3170         pid = fork();
3171         if (pid < 0) {
3172                 msg(LOG_ERR, "Could not fork (%s)", strerror(errno));
3173                 close(sockets[0]);
3174                 close(sockets[1]);
3175                 goto out;
3176         }
3177         if (pid > 0) { /* Parent */
3178                 pid_t *pidp;
3179
3180                 pidp = g_malloc(sizeof(pid_t));
3181                 *pidp = pid;
3182                 *socket = sockets[1];
3183                 close(sockets[0]);
3184                 g_hash_table_insert(children, pidp, pidp);
3185                 goto out;
3186         }
3187         /* Child */
3188         *socket = sockets[0];
3189         close(sockets[1]);
3190         /* Child's signal disposition is reset to default. */
3191         signal(SIGCHLD, SIG_DFL);
3192         signal(SIGTERM, SIG_DFL);
3193         signal(SIGHUP, SIG_DFL);
3194         sigemptyset(&oldset);
3195 out:
3196         sigprocmask(SIG_SETMASK, &oldset, NULL);
3197         return pid;
3198 }
3199
/**
 * Accept one pending connection on a listening socket.
 *
 * A failure is reported through err_nonfatal() and passed on to the
 * caller.
 *
 * @param sock the listening socket
 * @return the descriptor returned by accept(), negative on failure
 **/
static int
socket_accept(const int sock)
{
	struct sockaddr_storage peer;
	socklen_t peerlen = sizeof(peer);
	const int fd = accept(sock, (struct sockaddr *) &peer, &peerlen);

	if (fd < 0) {
		err_nonfatal("Failed to accept socket connection: %m");
	}

	return fd;
}
3214
3215 static void
3216 handle_modern_connection(GArray *const servers, const int sock, struct generic_conf *genconf)
3217 {
3218         int net;
3219         pid_t pid;
3220         CLIENT *client = NULL;
3221         int sock_flags_old;
3222         int sock_flags_new;
3223
3224         net = socket_accept(sock);
3225         if (net < 0)
3226                 return;
3227
3228         if (!dontfork) {
3229                 pid = spawn_child(&commsocket);
3230                 if (pid) {
3231                         if (pid > 0) {
3232                                 msg(LOG_INFO, "Spawned a child process");
3233                                 g_array_append_val(childsocks, commsocket);
3234                         }
3235                         if (pid < 0)
3236                                 msg(LOG_ERR, "Failed to spawn a child process");
3237                         close(net);
3238                         return;
3239                 }
3240                 /* Child just continues. */
3241         }
3242         tpool = g_thread_pool_new(handle_request, NULL, genconf->threads, FALSE, NULL);
3243
3244         sock_flags_old = fcntl(net, F_GETFL, 0);
3245         if (sock_flags_old == -1) {
3246                 msg(LOG_ERR, "Failed to get socket flags");
3247                 goto handler_err;
3248         }
3249
3250         sock_flags_new = sock_flags_old & ~O_NONBLOCK;
3251         if (sock_flags_new != sock_flags_old &&
3252             fcntl(net, F_SETFL, sock_flags_new) == -1) {
3253                 msg(LOG_ERR, "Failed to set socket to blocking mode");
3254                 goto handler_err;
3255         }
3256
3257         client = negotiate(net, servers, genconf);
3258         if (!client) {
3259                 msg(LOG_ERR, "Modern initial negotiation failed");
3260                 goto handler_err;
3261         }
3262
3263         if (!dontfork) {
3264                 int i;
3265
3266                 /* Free all root server resources here, because we are
3267                  * currently in the child process serving one specific
3268                  * connection. These are not simply needed anymore. */
3269                 g_hash_table_destroy(children);
3270                 children = NULL;
3271                 for (i = 0; i < modernsocks->len; i++) {
3272                         close(g_array_index(modernsocks, int, i));
3273                 }
3274                 g_array_free(modernsocks, TRUE);
3275
3276                 /* Now that we are in the child process after a
3277                  * succesful negotiation, we do not need the list of
3278                  * servers anymore, get rid of it.*/
3279                 g_array_free(servers, FALSE);
3280         }
3281
3282         msg(LOG_INFO, "Starting to serve");
3283         mainloop_threaded(client);
3284         exit(EXIT_SUCCESS);
3285
3286 handler_err:
3287         close(net);
3288         g_free(client);
3289
3290         if (!dontfork) {
3291                 exit(EXIT_FAILURE);
3292         }
3293 }
3294
3295 static int handle_childname(GArray* servers, int socket)
3296 {
3297         uint32_t len;
3298         _cleanup_g_free_ char *buf = NULL;
3299         int i, r, rt = 0;
3300
3301         while(rt < sizeof(len)) {
3302                 switch((r = read(socket, &len, sizeof len))) {
3303                         case 0:
3304                                 return -1;
3305                         case -1:
3306                                 err_nonfatal("Error reading from acl socket: %m");
3307                                 return -1;
3308                         default:
3309                                 rt += r;
3310                                 break;
3311                 }
3312         }
3313         if (len >= ULONG_MAX - 1) {
3314                 err_nonfatal("Value out of range");
3315                 return -1;
3316         }
3317         buf = g_malloc0(len + 1);
3318         readit(socket, buf, len);
3319         buf[len] = 0;
3320         for(i=0; i<servers->len; i++) {
3321                 SERVER* srv = g_array_index(servers, SERVER*, i);
3322                 if(strcmp(srv->servename, buf) == 0) {
3323                         if(srv->max_connections == 0 || srv->max_connections > srv->numclients) {
3324                                 writeit(socket, "Y", 1);
3325                                 srv->numclients++;
3326                         } else {
3327                                 writeit(socket, "N", 1);
3328                         }
3329                         goto exit;
3330                 }
3331         }
3332         writeit(socket, "X", 1);
3333 exit:
3334         return 0;
3335 }
3336
3337 /**
3338  * Return the index of the server whose servename matches the given
3339  * name.
3340  *
3341  * @param servename a string to match
3342  * @param servers an array of servers
3343  * @return the first index of the server whose servename matches the
3344  *         given name or -1 if one cannot be found
3345  **/
3346 static int get_index_by_servename(const gchar *const servename,
3347                                   const GArray *const servers) {
3348         int i;
3349
3350         for (i = 0; i < servers->len; ++i) {
3351                 const SERVER* server = g_array_index(servers, SERVER*, i);
3352
3353                 if (strcmp(servename, server->servename) == 0)
3354                         return i;
3355         }
3356
3357         return -1;
3358 }
3359
3360 /**
3361  * Parse configuration files and add servers to the array if they don't
3362  * already exist there. The existence is tested by comparing
3363  * servenames. A server is appended to the array only if its servename
3364  * is unique among all other servers.
3365  *
3366  * @param servers an array of servers
3367  * @param genconf a pointer to generic configuration
3368  * @return the number of new servers appended to the array, or -1 in
3369  *         case of an error
3370  **/
3371 static int append_new_servers(GArray *const servers, struct generic_conf *genconf, GError **const gerror) {
3372         int i;
3373         GArray *new_servers;
3374         const int old_len = servers->len;
3375         int retval = -1;
3376
3377         new_servers = parse_cfile(config_file_pos, genconf, true, gerror);
3378         if(tpool) g_thread_pool_set_max_threads(tpool, genconf->threads, NULL);
3379         if(!new_servers)
3380                 goto out;
3381
3382         for(i = 0; i < new_servers->len; ++i) {
3383                 SERVER *new_server = g_array_index(new_servers, SERVER*, i);
3384
3385                 if (new_server->servename
3386                     && -1 == get_index_by_servename(new_server->servename,
3387                                                     servers)) {
3388                         serve_inc_ref(new_server);
3389                         g_array_append_val(servers, new_server);
3390                 }
3391         }
3392
3393         retval = servers->len - old_len;
3394 out:
3395         g_array_free(new_servers, TRUE);
3396
3397         return retval;
3398 }
3399
3400 void serveloop(GArray* servers, struct generic_conf *genconf) G_GNUC_NORETURN;
3401 /**
3402  * Loop through the available servers, and serve them. Never returns.
3403  **/
3404 void serveloop(GArray* servers, struct generic_conf *genconf) {
3405         int i;
3406         int mmax, max;
3407         fd_set mset;
3408         fd_set rset;
3409         sigset_t blocking_mask;
3410         sigset_t original_mask;
3411
3412         /*
3413          * Set up the master fd_set. The set of descriptors we need
3414          * to select() for never changes anyway and it buys us a *lot*
3415          * of time to only build this once. However, if we ever choose
3416          * to not fork() for clients anymore, we may have to revisit
3417          * this.
3418          */
3419         mmax=0;
3420         FD_ZERO(&mset);
3421         for(i=0;i<modernsocks->len;i++) {
3422                 int sock = g_array_index(modernsocks, int, i);
3423                 FD_SET(sock, &mset);
3424                 mmax=sock>mmax?sock:mmax;
3425         }
3426
3427         /* Construct a signal mask which is used to make signal testing and
3428          * receiving an atomic operation to ensure no signal is received between
3429          * tests and blocking pselect(). */
3430         if (sigemptyset(&blocking_mask) == -1)
3431                 err("failed to initialize blocking_mask: %m");
3432
3433         if (sigaddset(&blocking_mask, SIGCHLD) == -1)
3434                 err("failed to add SIGCHLD to blocking_mask: %m");
3435
3436         if (sigaddset(&blocking_mask, SIGHUP) == -1)
3437                 err("failed to add SIGHUP to blocking_mask: %m");
3438
3439         if (sigaddset(&blocking_mask, SIGTERM) == -1)
3440                 err("failed to add SIGTERM to blocking_mask: %m");
3441
3442         if (sigprocmask(SIG_BLOCK, &blocking_mask, &original_mask) == -1)
3443             err("failed to block signals: %m");
3444
3445         for(;;) {
3446                 if (is_sigterm_caught) {
3447                         is_sigterm_caught = 0;
3448
3449                         g_hash_table_foreach(children, killchild, NULL);
3450                         unlink(pidfname);
3451
3452                         exit(EXIT_SUCCESS);
3453                 }
3454
3455                 if (is_sigchld_caught) {
3456                         int status;
3457                         int* i;
3458                         pid_t pid;
3459
3460                         is_sigchld_caught = 0;
3461
3462                         while ((pid=waitpid(-1, &status, WNOHANG)) > 0) {
3463                                 if (WIFEXITED(status)) {
3464                                         msg(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
3465                                 }
3466                                 i = g_hash_table_lookup(children, &pid);
3467                                 if (!i) {
3468                                         msg(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
3469                                 } else {
3470                                         DEBUG("Removing %d from the list of children", pid);
3471                                         g_hash_table_remove(children, &pid);
3472                                 }
3473                         }
3474                 }
3475
3476                 /* SIGHUP causes the root server process to reconfigure
3477                  * itself and add new export servers for each newly
3478                  * found export configuration group, i.e. spawn new
3479                  * server processes for each previously non-existent
3480                  * export. This does not alter old runtime configuration
3481                  * but just appends new exports. */
3482                 if (is_sighup_caught) {
3483                         int n;
3484                         GError *gerror = NULL;
3485
3486                         msg(LOG_INFO, "reconfiguration request received");
3487                         is_sighup_caught = 0; /* Reset to allow catching
3488                                                * it again. */
3489
3490                         n = append_new_servers(servers, genconf, &gerror);
3491                         if (n == -1)
3492                                 msg(LOG_ERR, "failed to append new servers: %s",
3493                                     gerror->message);
3494
3495                         for (i = servers->len - n; i < servers->len; ++i) {
3496                                 const SERVER *server = g_array_index(servers,
3497                                                                     SERVER*, i);
3498
3499                                 msg(LOG_INFO, "reconfigured new server: %s",
3500                                     server->servename);
3501                         }
3502                 }
3503
3504                 memcpy(&rset, &mset, sizeof(fd_set));
3505                 max=mmax;
3506                 for(i=0;i<childsocks->len;i++) {
3507                         int sock = g_array_index(childsocks, int, i);
3508                         FD_SET(sock, &rset);
3509                         max=sock>max?sock:max;
3510                 }
3511
3512                 if (pselect(max + 1, &rset, NULL, NULL, NULL, &original_mask) > 0) {
3513                         DEBUG("accept, ");
3514                         for(i=0; i < modernsocks->len; i++) {
3515                                 int sock = g_array_index(modernsocks, int, i);
3516                                 if(!FD_ISSET(sock, &rset)) {
3517                                         continue;
3518                                 }
3519
3520                                 handle_modern_connection(servers, sock, genconf);
3521                         }
3522                         for(i=0; i < childsocks->len; i++) {
3523                                 int sock = g_array_index(childsocks, int, i);
3524
3525                                 if(FD_ISSET(sock, &rset)) {
3526                                         if(handle_childname(servers, sock) < 0) {
3527                                                 close(sock);
3528                                                 g_array_remove_index(childsocks, i);
3529                                         }
3530                                 }
3531                         }
3532                 }
3533         }
3534 }
3535
3536 /**
3537  * Set server socket options.
3538  *
3539  * @param socket a socket descriptor of the server
3540  *
3541  * @param gerror a pointer to an error object pointer used for reporting
3542  *        errors. On error, if gerror is not NULL, *gerror is set and -1
3543  *        is returned.
3544  *
3545  * @return 0 on success, -1 on error
3546  **/
3547 int dosockopts(const int socket, GError **const gerror) {
3548 #ifndef sun
3549         int yes=1;
3550 #else
3551         char yes='1';
3552 #endif /* sun */
3553         struct linger l;
3554
3555         /* lose the pesky "Address already in use" error message */
3556         if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int)) == -1) {
3557                 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_REUSEADDR,
3558                             "failed to set socket option SO_REUSEADDR: %s",
3559                             strerror(errno));
3560                 return -1;
3561         }
3562         l.l_onoff = 1;
3563         l.l_linger = 10;
3564         if (setsockopt(socket,SOL_SOCKET,SO_LINGER,&l,sizeof(l)) == -1) {
3565                 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_LINGER,
3566                             "failed to set socket option SO_LINGER: %s",
3567                             strerror(errno));
3568                 return -1;
3569         }
3570         if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(int)) == -1) {
3571                 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SO_KEEPALIVE,
3572                             "failed to set socket option SO_KEEPALIVE: %s",
3573                             strerror(errno));
3574                 return -1;
3575         }
3576
3577         return 0;
3578 }
3579
3580 int open_unix(const gchar *const sockname, GError **const gerror) {
3581         struct sockaddr_un sa;
3582         int sock=-1;
3583         int retval=-1;
3584
3585         memset(&sa, 0, sizeof(struct sockaddr_un));
3586         sa.sun_family = AF_UNIX;
3587         strncpy(sa.sun_path, sockname, sizeof sa.sun_path);
3588         sa.sun_path[sizeof(sa.sun_path)-1] = '\0';
3589         sock = socket(AF_UNIX, SOCK_STREAM, 0);
3590         if(sock < 0) {
3591                 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SOCKET,
3592                                 "failed to open a unix socket: "
3593                                 "failed to create socket: %s",
3594                                 strerror(errno));
3595                 goto out;
3596         }
3597         if(bind(sock, (struct sockaddr*)&sa, sizeof(struct sockaddr_un))<0) {
3598                 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3599                                 "failed to open a unix socket: "
3600                                 "failed to bind to address %s: %s",
3601                                 sockname, strerror(errno));
3602                 goto out;
3603         }
3604         if(listen(sock, 10)<0) {
3605                 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3606                                 "failed to open a unix socket: "
3607                                 "failed to start listening: %s",
3608                                 strerror(errno));
3609                 goto out;
3610         }
3611         retval=0;
3612         g_array_append_val(modernsocks, sock);
3613 out:
3614         if(retval<0 && sock >= 0) {
3615                 close(sock);
3616         }
3617
3618         return retval;
3619 }
3620
3621 int open_modern(const gchar *const addr, const gchar *const port,
3622                 GError **const gerror) {
3623         struct addrinfo hints;
3624         struct addrinfo* ai = NULL;
3625         struct addrinfo* ai_bak = NULL;
3626         struct sock_flags;
3627         int e;
3628         int retval = -1;
3629         int sock = -1;
3630         _cleanup_(g_strfreevp) gchar** addrs;
3631         gchar const* l_addr = addr;
3632
3633         if(!addr || strlen(addr) == 0) {
3634                 l_addr = "::, 0.0.0.0";
3635         }
3636
3637         addrs = g_strsplit_set(l_addr, ", \t", -1);
3638
3639         for(int i=0; addrs[i]!=NULL; i++) {
3640                 if(addrs[i][0] == '\0') {
3641                         continue;
3642                 }
3643                 memset(&hints, '\0', sizeof(hints));
3644                 hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
3645                 hints.ai_socktype = SOCK_STREAM;
3646                 hints.ai_family = AF_UNSPEC;
3647                 hints.ai_protocol = IPPROTO_TCP;
3648                 e = getaddrinfo(addrs[i], port ? port : NBD_DEFAULT_PORT, &hints, &ai);
3649                 ai_bak = ai;
3650                 if(e != 0 && addrs[i+1] == NULL && modernsocks->len == 0) {
3651                         g_set_error(gerror, NBDS_ERR, NBDS_ERR_GAI,
3652                                     "failed to open a modern socket: "
3653                                     "failed to get address info: %s",
3654                                     gai_strerror(e));
3655                         goto out;
3656                 }
3657
3658                 while(ai != NULL) {
3659                         sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
3660                         if(sock<0) {
3661                                 g_set_error(gerror, NBDS_ERR, NBDS_ERR_SOCKET,
3662                                             "failed to open a modern socket: "
3663                                             "failed to create a socket: %s",
3664                                             strerror(errno));
3665                                 goto out;
3666                         }
3667
3668                         if (dosockopts(sock, gerror) == -1) {
3669                                 g_prefix_error(gerror, "failed to open a modern socket: ");
3670                                 goto out;
3671                         }
3672
3673                         if(bind(sock, ai->ai_addr, ai->ai_addrlen)) {
3674                                 /*
3675                                  * Some systems will return multiple entries for the
3676                                  * same address when we ask it for something
3677                                  * AF_UNSPEC, even though the first entry will
3678                                  * listen to both protocols. Other systems will
3679                                  * return multiple entries too, but we actually
3680                                  * do need to open both.
3681                                  *
3682                                  * Handle this by ignoring EADDRINUSE if we've
3683                                  * already got at least one socket open
3684                                  */
3685                                 if(errno == EADDRINUSE && modernsocks->len > 0) {
3686                                         goto next;
3687                                 }
3688                                 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3689                                             "failed to open a modern socket: "
3690                                             "failed to bind an address to a socket: %s",
3691                                             strerror(errno));
3692                                 goto out;
3693                         }
3694
3695                         if(listen(sock, 10) <0) {
3696                                 g_set_error(gerror, NBDS_ERR, NBDS_ERR_BIND,
3697                                             "failed to open a modern socket: "
3698                                             "failed to start listening on a socket: %s",
3699                                             strerror(errno));
3700                                 goto out;
3701                         }
3702                         g_array_append_val(modernsocks, sock);
3703                 next:
3704                         ai = ai->ai_next;
3705                 }
3706                 if(ai_bak) {
3707                         freeaddrinfo(ai_bak);
3708                         ai_bak=NULL;
3709                 }
3710         }
3711
3712         retval = 0;
3713 out:
3714
3715         if (retval == -1 && sock >= 0) {
3716                 close(sock);
3717         }
3718         if(ai_bak)
3719                 freeaddrinfo(ai_bak);
3720
3721         return retval;
3722 }
3723
3724 /**
3725  * Connect our servers.
3726  **/
3727 void setup_servers(GArray *const servers, const gchar *const modernaddr,
3728                    const gchar *const modernport, const gchar* unixsock,
3729                    const gint flags ) {
3730         struct sigaction sa;
3731
3732         if(unixsock != NULL) {
3733                 GError* gerror = NULL;
3734                 if(open_unix(unixsock, &gerror) == -1) {
3735                         msg(LOG_ERR, "failed to setup servers: %s",
3736                                         gerror->message);
3737                         g_clear_error(&gerror);
3738                         exit(EXIT_FAILURE);
3739                 }
3740         }
3741         if (((flags & F_DUAL_LISTEN) != 0) || (unixsock == NULL)) {
3742                 GError *gerror = NULL;
3743                 if (open_modern(modernaddr, modernport, &gerror) == -1) {
3744                         msg(LOG_ERR, "failed to setup servers: %s",
3745                                 gerror->message);
3746                         g_clear_error(&gerror);
3747                         exit(EXIT_FAILURE);
3748                 }
3749         }
3750         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
3751
3752         sa.sa_handler = sigchld_handler;
3753         sigemptyset(&sa.sa_mask);
3754         sigaddset(&sa.sa_mask, SIGTERM);
3755         sa.sa_flags = SA_RESTART;
3756         if(sigaction(SIGCHLD, &sa, NULL) == -1)
3757                 err("sigaction: %m");
3758
3759         sa.sa_handler = sigterm_handler;
3760         sigemptyset(&sa.sa_mask);
3761         sigaddset(&sa.sa_mask, SIGCHLD);
3762         sa.sa_flags = SA_RESTART;
3763         if(sigaction(SIGTERM, &sa, NULL) == -1)
3764                 err("sigaction: %m");
3765
3766         sa.sa_handler = sighup_handler;
3767         sigemptyset(&sa.sa_mask);
3768         sa.sa_flags = SA_RESTART;
3769         if(sigaction(SIGHUP, &sa, NULL) == -1)
3770                 err("sigaction: %m");
3771
3772         sa.sa_handler = sigusr1_handler;
3773         sigemptyset(&sa.sa_mask);
3774         sa.sa_flags = SA_RESTART;
3775         if(sigaction(SIGUSR1, &sa, NULL) == -1)
3776                 err("sigaction: %m");
3777 }
3778
3779 /**
3780  * Go daemon (unless we specified at compile time that we didn't want this)
3781  * @param serve the first server of our configuration. If its port is zero,
3782  *      then do not daemonize, because we're doing inetd then. This parameter
3783  *      is only used to create a PID file of the form
3784  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
3785  **/
3786 #if !defined(NODAEMON)
3787 void daemonize() {
3788         FILE*pidf;
3789
3790         if(daemon(0,0)<0) {
3791                 err("daemon");
3792         }
3793         if(!*pidfname) {
3794                 strncpy(pidfname, "/var/run/nbd-server.pid", 255);
3795         }
3796         pidf=fopen(pidfname, "w");
3797         if(pidf) {
3798                 fprintf(pidf,"%d\n", (int)getpid());
3799                 fclose(pidf);
3800         } else {
3801                 perror("fopen");
3802                 fprintf(stderr, "Not fatal; continuing");
3803         }
3804 }
3805 #else
3806 #define daemonize(serve)
3807 #endif /* !defined(NODAEMON) */
3808
3809 /*
3810  * Everything beyond this point (in the file) is run in non-daemon mode.
3811  * The stuff above daemonize() isn't.
3812  */
3813
3814 /**
3815  * Set up user-ID and/or group-ID
3816  **/
3817 void dousers(const gchar *const username, const gchar *const groupname) {
3818         struct passwd *pw;
3819         struct group *gr;
3820         gchar* str;
3821         if (groupname) {
3822                 gr = getgrnam(groupname);
3823                 if(!gr) {
3824                         str = g_strdup_printf("Invalid group name: %s", groupname);
3825                         err(str);
3826                 }
3827                 if(setgid(gr->gr_gid)<0) {
3828                         err("Could not set GID: %m");
3829                 }
3830         }
3831         if (username) {
3832                 pw = getpwnam(username);
3833                 if(!pw) {
3834                         str = g_strdup_printf("Invalid user name: %s", username);
3835                         err(str);
3836                 }
3837                 if (setgroups(0, NULL)<0) {
3838                         err("Could not set groups: %m");
3839                 }
3840                 if(setuid(pw->pw_uid)<0) {
3841                         err("Could not set UID: %m");
3842                 }
3843         }
3844 }
3845
3846 #ifndef ISSERVER
3847 void glib_message_syslog_redirect(const gchar *log_domain,
3848                                   GLogLevelFlags log_level,
3849                                   const gchar *message,
3850                                   gpointer user_data)
3851 {
3852     int level=LOG_DEBUG;
3853
3854     switch( log_level )
3855     {
3856       case G_LOG_FLAG_FATAL:
3857       case G_LOG_LEVEL_CRITICAL:
3858       case G_LOG_LEVEL_ERROR:
3859         level=LOG_ERR;
3860         break;
3861       case G_LOG_LEVEL_WARNING:
3862         level=LOG_WARNING;
3863         break;
3864       case G_LOG_LEVEL_MESSAGE:
3865       case G_LOG_LEVEL_INFO:
3866         level=LOG_INFO;
3867         break;
3868       case G_LOG_LEVEL_DEBUG:
3869         level=LOG_DEBUG;
3870         break;
3871       default:
3872         level=LOG_ERR;
3873     }
3874     syslog(level, "%s", message);
3875 }
3876 #endif
3877
3878 /**
3879  * Main entry point...
3880  **/
3881 int main(int argc, char *argv[]) {
3882         SERVER *serve;
3883         GArray *servers;
3884         GError *gerr=NULL;
3885         struct generic_conf genconf;
3886
3887         memset(&genconf, 0, sizeof(struct generic_conf));
3888
3889         if (sizeof( struct nbd_request )!=28) {
3890                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
3891                 exit(EXIT_FAILURE) ;
3892         }
3893
3894         modernsocks = g_array_new(FALSE, FALSE, sizeof(int));
3895         childsocks = g_array_new(FALSE, FALSE, sizeof(int));
3896
3897         logging(MY_NAME);
3898         config_file_pos = g_strdup(CFILE);
3899         serve=cmdline(argc, argv, &genconf);
3900
3901         genconf.threads = 4;
3902         servers = parse_cfile(config_file_pos, &genconf, true, &gerr);
3903
3904         /* Update global variables with parsed values. This will be
3905          * removed once we get rid of global configuration variables. */
3906         glob_flags   |= genconf.flags;
3907
3908         if(serve) {
3909                 g_array_append_val(servers, serve);
3910         }
3911
3912         if(!servers || !servers->len) {
3913                 if(gerr && !(gerr->domain == NBDS_ERR
3914                             && gerr->code == NBDS_ERR_CFILE_NOTFOUND)) {
3915                         g_warning("Could not parse config file: %s", gerr->message);
3916                 }
3917         }
3918         if(serve) {
3919                 g_warning("Specifying an export on the command line no longer uses the oldstyle protocol.");
3920         }
3921
3922         if((!serve) && (!servers||!servers->len)) {
3923                 if(gerr)
3924                         g_message("No configured exports; quitting.");
3925                 exit(EXIT_FAILURE);
3926         }
3927         if (!nodaemon)
3928                 daemonize();
3929
3930         setup_servers(servers, genconf.modernaddr, genconf.modernport,
3931                         genconf.unixsock, genconf.flags);
3932         dousers(genconf.user, genconf.group);
3933
3934 #if HAVE_GNUTLS
3935         gnutls_global_init();
3936         static gnutls_dh_params_t dh_params;
3937         gnutls_dh_params_init(&dh_params);
3938         gnutls_dh_params_generate2(dh_params,
3939                                 gnutls_sec_param_to_pk_bits(GNUTLS_PK_DH,
3940 // Renamed in GnuTLS 3.3
3941 #if GNUTLS_VERSION_NUMBER >= 0x030300
3942                                         GNUTLS_SEC_PARAM_MEDIUM
3943 #else
3944                                         GNUTLS_SEC_PARAM_NORMAL
3945 #endif
3946                                         ));
3947 #endif
3948
3949         if((genconf.modernport != NULL) && strcmp(genconf.modernport, "0")==0) {
3950 #ifndef ISSERVER
3951                 err("inetd mode requires syslog");
3952 #endif
3953                 CLIENT* client = negotiate(0, servers, &genconf);
3954                 if(!client) {
3955                         exit(EXIT_FAILURE);
3956                 }
3957                 tpool = g_thread_pool_new(handle_request, NULL, genconf.threads, FALSE, NULL);
3958                 mainloop_threaded(client);
3959                 return 0;
3960         }
3961
3962         serveloop(servers, &genconf);
3963 }