2 /* Copyright 1998 by the Massachusetts Institute of Technology.
3 * Copyright (C) 2004-2017 by Daniel Stenberg
5 * Permission to use, copy, modify, and distribute this
6 * software and its documentation for any purpose and without
7 * fee is hereby granted, provided that the above copyright
8 * notice appear in all copies and that both that copyright
9 * notice and this permission notice appear in supporting
10 * documentation, and that the name of M.I.T. not be used in
11 * advertising or publicity pertaining to distribution of the
12 * software without specific, written prior permission.
13 * M.I.T. makes no representations about the suitability of
14 * this software for any purpose. It is provided "as is"
15 * without express or implied warranty.
18 #include "ares_setup.h"
23 #ifdef HAVE_NETINET_IN_H
24 # include <netinet/in.h>
26 #ifdef HAVE_NETINET_TCP_H
27 # include <netinet/tcp.h>
32 #ifdef HAVE_ARPA_INET_H
33 # include <arpa/inet.h>
35 #ifdef HAVE_ARPA_NAMESER_H
36 # include <arpa/nameser.h>
38 #ifdef HAVE_ARPA_NAMESER_COMPAT_H
39 # include <arpa/nameser_compat.h>
47 #ifdef HAVE_SYS_IOCTL_H
48 # include <sys/ioctl.h>
51 # include <sys/filio.h>
60 #include "ares_nowarn.h"
61 #include "ares_private.h"
64 static int try_again(int errnum);
65 static void write_tcp_data(ares_channel channel, fd_set *write_fds,
66 ares_socket_t write_fd, struct timeval *now);
67 static void read_tcp_data(ares_channel channel, fd_set *read_fds,
68 ares_socket_t read_fd, struct timeval *now);
69 static void read_udp_packets(ares_channel channel, fd_set *read_fds,
70 ares_socket_t read_fd, struct timeval *now);
71 static void advance_tcp_send_queue(ares_channel channel, int whichserver,
72 ares_ssize_t num_bytes);
73 static void process_timeouts(ares_channel channel, struct timeval *now);
74 static void process_broken_connections(ares_channel channel,
76 static void process_answer(ares_channel channel, unsigned char *abuf,
77 int alen, int whichserver, int tcp,
79 static void handle_error(ares_channel channel, int whichserver,
81 static void skip_server(ares_channel channel, struct query *query,
83 static void next_server(ares_channel channel, struct query *query,
85 static int open_tcp_socket(ares_channel channel, struct server_state *server);
86 static int open_udp_socket(ares_channel channel, struct server_state *server);
87 static int same_questions(const unsigned char *qbuf, int qlen,
88 const unsigned char *abuf, int alen);
89 static int same_address(struct sockaddr *sa, struct ares_addr *aa);
90 static int has_opt_rr(const unsigned char *abuf, int alen);
91 static void end_query(ares_channel channel, struct query *query, int status,
92 unsigned char *abuf, int alen);
/* Return true (1) if 'now' is at or past 'check', else 0.
 * Whole seconds are compared first; only on a tie are the
 * sub-second parts consulted. */
int ares__timedout(struct timeval *now,
                   struct timeval *check)
{
  long secs = (long)(now->tv_sec - check->tv_sec);

  if (secs > 0)
    return 1; /* yes, timed out */
  if (secs < 0)
    return 0; /* nope, not timed out */

  /* if the full seconds were identical, check the sub second parts */
  return (now->tv_usec - check->tv_usec >= 0);
}
/* Add the specified number of milliseconds to the time in the first
 * argument, normalizing tv_usec into [0, 1000000). */
static void timeadd(struct timeval *now, int millisecs)
{
  now->tv_sec += millisecs/1000;
  now->tv_usec += (millisecs%1000)*1000;

  if (now->tv_usec >= 1000000) {
    now->tv_sec++;
    now->tv_usec -= 1000000;
  }
}
122 * generic process function
124 static void processfds(ares_channel channel,
125 fd_set *read_fds, ares_socket_t read_fd,
126 fd_set *write_fds, ares_socket_t write_fd)
128 struct timeval now = ares__tvnow();
130 write_tcp_data(channel, write_fds, write_fd, &now);
131 read_tcp_data(channel, read_fds, read_fd, &now);
132 read_udp_packets(channel, read_fds, read_fd, &now);
133 process_timeouts(channel, &now);
134 process_broken_connections(channel, &now);
137 /* Something interesting happened on the wire, or there was a timeout.
138 * See what's up and respond accordingly.
140 void ares_process(ares_channel channel, fd_set *read_fds, fd_set *write_fds)
142 processfds(channel, read_fds, ARES_SOCKET_BAD, write_fds, ARES_SOCKET_BAD);
145 /* Something interesting happened on the wire, or there was a timeout.
146 * See what's up and respond accordingly.
148 void ares_process_fd(ares_channel channel,
149 ares_socket_t read_fd, /* use ARES_SOCKET_BAD or valid
151 ares_socket_t write_fd)
153 processfds(channel, NULL, read_fd, NULL, write_fd);
/* Return 1 if the specified error number describes a readiness error, or 0
 * otherwise. This is mostly for HP-UX, which could return EAGAIN or
 * EWOULDBLOCK. See this man page
 *
 * http://devrsrc1.external.hp.com/STKS/cgi-bin/man2html?
 *     manpage=/usr/share/man/man2.Z/send.2
 */
static int try_again(int errnum)
{
#if !defined EWOULDBLOCK && !defined EAGAIN
#error "Neither EWOULDBLOCK nor EAGAIN defined"
#endif

#ifdef EWOULDBLOCK
  if (errnum == EWOULDBLOCK)
    return 1;
#endif

  /* Only test EAGAIN separately when it is a distinct value. */
#if defined EAGAIN && EAGAIN != EWOULDBLOCK
  if (errnum == EAGAIN)
    return 1;
#endif

  return 0;
}
183 static ares_ssize_t socket_writev(ares_channel channel, ares_socket_t s, const struct iovec * vec, int len)
185 if (channel->sock_funcs)
186 return channel->sock_funcs->asendv(s, vec, len, channel->sock_func_cb_data);
188 return writev(s, vec, len);
191 static ares_ssize_t socket_write(ares_channel channel, ares_socket_t s, const void * data, size_t len)
193 if (channel->sock_funcs)
196 vec.iov_base = (void*)data;
198 return channel->sock_funcs->asendv(s, &vec, 1, channel->sock_func_cb_data);
200 return swrite(s, data, len);
203 /* If any TCP sockets select true for writing, write out queued data
206 static void write_tcp_data(ares_channel channel,
208 ares_socket_t write_fd,
211 struct server_state *server;
212 struct send_request *sendreq;
219 if(!write_fds && (write_fd == ARES_SOCKET_BAD))
220 /* no possible action */
223 for (i = 0; i < channel->nservers; i++)
225 /* Make sure server has data to send and is selected in write_fds or
227 server = &channel->servers[i];
228 if (!server->qhead || server->tcp_socket == ARES_SOCKET_BAD ||
233 if(!FD_ISSET(server->tcp_socket, write_fds))
237 if(server->tcp_socket != write_fd)
242 /* If there's an error and we close this socket, then open
243 * another with the same fd to talk to another server, then we
244 * don't want to think that it was the new socket that was
245 * ready. This is not disastrous, but is likely to result in
246 * extra system calls and confusion. */
247 FD_CLR(server->tcp_socket, write_fds);
249 /* Count the number of send queue items. */
251 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
254 /* Allocate iovecs so we can send all our data at once. */
255 vec = ares_malloc(n * sizeof(struct iovec));
258 /* Fill in the iovecs and send. */
260 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
262 vec[n].iov_base = (char *) sendreq->data;
263 vec[n].iov_len = sendreq->len;
266 wcount = socket_writev(channel, server->tcp_socket, vec, (int)n);
270 if (!try_again(SOCKERRNO))
271 handle_error(channel, i, now);
275 /* Advance the send queue by as many bytes as we sent. */
276 advance_tcp_send_queue(channel, i, wcount);
280 /* Can't allocate iovecs; just send the first request. */
281 sendreq = server->qhead;
283 scount = socket_write(channel, server->tcp_socket, sendreq->data, sendreq->len);
286 if (!try_again(SOCKERRNO))
287 handle_error(channel, i, now);
291 /* Advance the send queue by as many bytes as we sent. */
292 advance_tcp_send_queue(channel, i, scount);
297 /* Consume the given number of bytes from the head of the TCP send queue. */
298 static void advance_tcp_send_queue(ares_channel channel, int whichserver,
299 ares_ssize_t num_bytes)
301 struct send_request *sendreq;
302 struct server_state *server = &channel->servers[whichserver];
303 while (num_bytes > 0) {
304 sendreq = server->qhead;
305 if ((size_t)num_bytes >= sendreq->len) {
306 num_bytes -= sendreq->len;
307 server->qhead = sendreq->next;
308 if (sendreq->data_storage)
309 ares_free(sendreq->data_storage);
311 if (server->qhead == NULL) {
312 SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 0);
313 server->qtail = NULL;
315 /* qhead is NULL so we cannot continue this loop */
320 sendreq->data += num_bytes;
321 sendreq->len -= num_bytes;
327 static ares_ssize_t socket_recvfrom(ares_channel channel,
332 struct sockaddr *from,
333 ares_socklen_t *from_len)
335 if (channel->sock_funcs)
336 return channel->sock_funcs->arecvfrom(s, data, data_len,
337 flags, from, from_len,
338 channel->sock_func_cb_data);
341 return recvfrom(s, data, data_len, flags, from, from_len);
343 return sread(s, data, data_len);
347 static ares_ssize_t socket_recv(ares_channel channel,
352 if (channel->sock_funcs)
353 return channel->sock_funcs->arecvfrom(s, data, data_len, 0, 0, 0,
354 channel->sock_func_cb_data);
356 return sread(s, data, data_len);
359 /* If any TCP socket selects true for reading, read some data,
360 * allocate a buffer if we finish reading the length word, and process
361 * a packet if we finish reading one.
363 static void read_tcp_data(ares_channel channel, fd_set *read_fds,
364 ares_socket_t read_fd, struct timeval *now)
366 struct server_state *server;
370 if(!read_fds && (read_fd == ARES_SOCKET_BAD))
371 /* no possible action */
374 for (i = 0; i < channel->nservers; i++)
376 /* Make sure the server has a socket and is selected in read_fds. */
377 server = &channel->servers[i];
378 if (server->tcp_socket == ARES_SOCKET_BAD || server->is_broken)
382 if(!FD_ISSET(server->tcp_socket, read_fds))
386 if(server->tcp_socket != read_fd)
391 /* If there's an error and we close this socket, then open another
392 * with the same fd to talk to another server, then we don't want to
393 * think that it was the new socket that was ready. This is not
394 * disastrous, but is likely to result in extra system calls and
396 FD_CLR(server->tcp_socket, read_fds);
398 if (server->tcp_lenbuf_pos != 2)
400 /* We haven't yet read a length word, so read that (or
401 * what's left to read of it).
403 count = socket_recv(channel, server->tcp_socket,
404 server->tcp_lenbuf + server->tcp_lenbuf_pos,
405 2 - server->tcp_lenbuf_pos);
408 if (!(count == -1 && try_again(SOCKERRNO)))
409 handle_error(channel, i, now);
413 server->tcp_lenbuf_pos += (int)count;
414 if (server->tcp_lenbuf_pos == 2)
416 /* We finished reading the length word. Decode the
417 * length and allocate a buffer for the data.
419 server->tcp_length = server->tcp_lenbuf[0] << 8
420 | server->tcp_lenbuf[1];
421 server->tcp_buffer = ares_malloc(server->tcp_length);
422 if (!server->tcp_buffer) {
423 handle_error(channel, i, now);
424 return; /* bail out on malloc failure. TODO: make this
425 function return error codes */
427 server->tcp_buffer_pos = 0;
432 /* Read data into the allocated buffer. */
433 count = socket_recv(channel, server->tcp_socket,
434 server->tcp_buffer + server->tcp_buffer_pos,
435 server->tcp_length - server->tcp_buffer_pos);
438 if (!(count == -1 && try_again(SOCKERRNO)))
439 handle_error(channel, i, now);
443 server->tcp_buffer_pos += (int)count;
444 if (server->tcp_buffer_pos == server->tcp_length)
446 /* We finished reading this answer; process it and
447 * prepare to read another length word.
449 process_answer(channel, server->tcp_buffer, server->tcp_length,
451 ares_free(server->tcp_buffer);
452 server->tcp_buffer = NULL;
453 server->tcp_lenbuf_pos = 0;
454 server->tcp_buffer_pos = 0;
460 /* If any UDP sockets select true for reading, process them. */
461 static void read_udp_packets(ares_channel channel, fd_set *read_fds,
462 ares_socket_t read_fd, struct timeval *now)
464 struct server_state *server;
467 unsigned char buf[MAXENDSSZ + 1];
469 ares_socklen_t fromlen;
472 struct sockaddr_in sa4;
473 struct sockaddr_in6 sa6;
477 if(!read_fds && (read_fd == ARES_SOCKET_BAD))
478 /* no possible action */
481 for (i = 0; i < channel->nservers; i++)
483 /* Make sure the server has a socket and is selected in read_fds. */
484 server = &channel->servers[i];
486 if (server->udp_socket == ARES_SOCKET_BAD || server->is_broken)
490 if(!FD_ISSET(server->udp_socket, read_fds))
494 if(server->udp_socket != read_fd)
499 /* If there's an error and we close this socket, then open
500 * another with the same fd to talk to another server, then we
501 * don't want to think that it was the new socket that was
502 * ready. This is not disastrous, but is likely to result in
503 * extra system calls and confusion. */
504 FD_CLR(server->udp_socket, read_fds);
506 /* To reduce event loop overhead, read and process as many
507 * packets as we can. */
509 if (server->udp_socket == ARES_SOCKET_BAD)
513 if (server->addr.family == AF_INET)
514 fromlen = sizeof(from.sa4);
516 fromlen = sizeof(from.sa6);
517 count = socket_recvfrom(channel, server->udp_socket, (void *)buf,
518 sizeof(buf), 0, &from.sa, &fromlen);
521 if (count == -1 && try_again(SOCKERRNO))
524 handle_error(channel, i, now);
526 else if (!same_address(&from.sa, &server->addr))
527 /* The address the response comes from does not match the address we
528 * sent the request to. Someone may be attempting to perform a cache
529 * poisoning attack. */
533 process_answer(channel, buf, (int)count, i, 0, now);
538 /* If any queries have timed out, note the timeout and move them on. */
539 static void process_timeouts(ares_channel channel, struct timeval *now)
541 time_t t; /* the time of the timeouts we're processing */
543 struct list_node* list_head;
544 struct list_node* list_node;
546 /* Process all the timeouts that have fired since the last time we processed
547 * timeouts. If things are going well, then we'll have hundreds/thousands of
548 * queries that fall into future buckets, and only a handful of requests
549 * that fall into the "now" bucket, so this should be quite quick.
551 for (t = channel->last_timeout_processed; t <= now->tv_sec; t++)
553 list_head = &(channel->queries_by_timeout[t % ARES_TIMEOUT_TABLE_SIZE]);
554 for (list_node = list_head->next; list_node != list_head; )
556 query = list_node->data;
557 list_node = list_node->next; /* in case the query gets deleted */
558 if (query->timeout.tv_sec && ares__timedout(now, &query->timeout))
560 query->error_status = ARES_ETIMEOUT;
562 next_server(channel, query, now);
566 channel->last_timeout_processed = now->tv_sec;
569 /* Handle an answer from a server. */
570 static void process_answer(ares_channel channel, unsigned char *abuf,
571 int alen, int whichserver, int tcp,
574 int tc, rcode, packetsz;
577 struct list_node* list_head;
578 struct list_node* list_node;
580 /* If there's no room in the answer for a header, we can't do much
585 /* Grab the query ID, truncate bit, and response code from the packet. */
586 id = DNS_HEADER_QID(abuf);
587 tc = DNS_HEADER_TC(abuf);
588 rcode = DNS_HEADER_RCODE(abuf);
590 /* Find the query corresponding to this packet. The queries are
591 * hashed/bucketed by query id, so this lookup should be quick. Note that
592 * both the query id and the questions must be the same; when the query id
593 * wraps around we can have multiple outstanding queries with the same query
594 * id, so we need to check both the id and question.
597 list_head = &(channel->queries_by_qid[id % ARES_QID_TABLE_SIZE]);
598 for (list_node = list_head->next; list_node != list_head;
599 list_node = list_node->next)
601 struct query *q = list_node->data;
602 if ((q->qid == id) && same_questions(q->qbuf, q->qlen, abuf, alen))
612 /* If we use EDNS and server answers with FORMERR without an OPT RR, the protocol
613 * extension is not understood by the responder. We must retry the query
614 * without EDNS enabled.
616 if (channel->flags & ARES_FLAG_EDNS)
618 packetsz = channel->ednspsz;
619 if (rcode == FORMERR && has_opt_rr(abuf, alen) != 1)
621 int qlen = (query->tcplen - 2) - EDNSFIXEDSZ;
622 channel->flags ^= ARES_FLAG_EDNS;
623 query->tcplen -= EDNSFIXEDSZ;
624 query->qlen -= EDNSFIXEDSZ;
625 query->tcpbuf[0] = (unsigned char)((qlen >> 8) & 0xff);
626 query->tcpbuf[1] = (unsigned char)(qlen & 0xff);
627 DNS_HEADER_SET_ARCOUNT(query->tcpbuf + 2, 0);
628 query->tcpbuf = ares_realloc(query->tcpbuf, query->tcplen);
629 query->qbuf = query->tcpbuf + 2;
630 ares__send_query(channel, query, now);
635 /* If we got a truncated UDP packet and are not ignoring truncation,
636 * don't accept the packet, and switch the query to TCP if we hadn't
639 if ((tc || alen > packetsz) && !tcp && !(channel->flags & ARES_FLAG_IGNTC))
641 if (!query->using_tcp)
643 query->using_tcp = 1;
644 ares__send_query(channel, query, now);
649 /* Limit alen to PACKETSZ if we aren't using TCP (only relevant if we
650 * are ignoring truncation.
652 if (alen > packetsz && !tcp)
655 /* If we aren't passing through all error packets, discard packets
656 * with SERVFAIL, NOTIMP, or REFUSED response codes.
658 if (!(channel->flags & ARES_FLAG_NOCHECKRESP))
660 if (rcode == SERVFAIL || rcode == NOTIMP || rcode == REFUSED)
662 skip_server(channel, query, whichserver);
663 if (query->server == whichserver)
664 next_server(channel, query, now);
669 end_query(channel, query, ARES_SUCCESS, abuf, alen);
672 /* Close all the connections that are no longer usable. */
673 static void process_broken_connections(ares_channel channel,
677 for (i = 0; i < channel->nservers; i++)
679 struct server_state *server = &channel->servers[i];
680 if (server->is_broken)
682 handle_error(channel, i, now);
687 /* Swap the contents of two lists */
688 static void swap_lists(struct list_node* head_a,
689 struct list_node* head_b)
691 int is_a_empty = ares__is_list_empty(head_a);
692 int is_b_empty = ares__is_list_empty(head_b);
693 struct list_node old_a = *head_a;
694 struct list_node old_b = *head_b;
697 ares__init_list_head(head_b);
700 old_a.next->prev = head_b;
701 old_a.prev->next = head_b;
704 ares__init_list_head(head_a);
707 old_b.next->prev = head_a;
708 old_b.prev->next = head_a;
712 static void handle_error(ares_channel channel, int whichserver,
715 struct server_state *server;
717 struct list_node list_head;
718 struct list_node* list_node;
720 server = &channel->servers[whichserver];
722 /* Reset communications with this server. */
723 ares__close_sockets(channel, server);
725 /* Tell all queries talking to this server to move on and not try this
726 * server again. We steal the current list of queries that were in-flight to
727 * this server, since when we call next_server this can cause the queries to
728 * be re-sent to this server, which will re-insert these queries in that
729 * same server->queries_to_server list.
731 ares__init_list_head(&list_head);
732 swap_lists(&list_head, &(server->queries_to_server));
733 for (list_node = list_head.next; list_node != &list_head; )
735 query = list_node->data;
736 list_node = list_node->next; /* in case the query gets deleted */
737 assert(query->server == whichserver);
738 skip_server(channel, query, whichserver);
739 next_server(channel, query, now);
741 /* Each query should have removed itself from our temporary list as
742 * it re-sent itself or finished up...
744 assert(ares__is_list_empty(&list_head));
747 static void skip_server(ares_channel channel, struct query *query,
750 /* The given server gave us problems with this query, so if we have the
751 * luxury of using other servers, then let's skip the potentially broken
752 * server and just use the others. If we only have one server and we need to
753 * retry then we should just go ahead and re-use that server, since it's our
754 * only hope; perhaps we just got unlucky, and retrying will work (eg, the
755 * server timed out our TCP connection just as we were sending another
758 if (channel->nservers > 1)
760 query->server_info[whichserver].skip_server = 1;
764 static void next_server(ares_channel channel, struct query *query,
767 /* We need to try each server channel->tries times. We have channel->nservers
768 * servers to try. In total, we need to do channel->nservers * channel->tries
769 * attempts. Use query->try to remember how many times we already attempted
770 * this query. Use modular arithmetic to find the next server to try. */
771 while (++(query->try_count) < (channel->nservers * channel->tries))
773 struct server_state *server;
775 /* Move on to the next server. */
776 query->server = (query->server + 1) % channel->nservers;
777 server = &channel->servers[query->server];
779 /* We don't want to use this server if (1) we decided this connection is
780 * broken, and thus about to be closed, (2) we've decided to skip this
781 * server because of earlier errors we encountered, or (3) we already
782 * sent this query over this exact connection.
784 if (!server->is_broken &&
785 !query->server_info[query->server].skip_server &&
786 !(query->using_tcp &&
787 (query->server_info[query->server].tcp_connection_generation ==
788 server->tcp_connection_generation)))
790 ares__send_query(channel, query, now);
794 /* You might think that with TCP we only need one try. However, even
795 * when using TCP, servers can time-out our connection just as we're
796 * sending a request, or close our connection because they die, or never
797 * send us a reply because they get wedged or tickle a bug that drops
802 /* If we are here, all attempts to perform query failed. */
803 end_query(channel, query, query->error_status, NULL, 0);
806 void ares__send_query(ares_channel channel, struct query *query,
809 struct send_request *sendreq;
810 struct server_state *server;
813 server = &channel->servers[query->server];
814 if (query->using_tcp)
816 /* Make sure the TCP socket for this server is set up and queue
819 if (server->tcp_socket == ARES_SOCKET_BAD)
821 if (open_tcp_socket(channel, server) == -1)
823 skip_server(channel, query, query->server);
824 next_server(channel, query, now);
828 sendreq = ares_malloc(sizeof(struct send_request));
831 end_query(channel, query, ARES_ENOMEM, NULL, 0);
834 memset(sendreq, 0, sizeof(struct send_request));
835 /* To make the common case fast, we avoid copies by using the query's
836 * tcpbuf for as long as the query is alive. In the rare case where the
837 * query ends while it's queued for transmission, then we give the
838 * sendreq its own copy of the request packet and put it in
839 * sendreq->data_storage.
841 sendreq->data_storage = NULL;
842 sendreq->data = query->tcpbuf;
843 sendreq->len = query->tcplen;
844 sendreq->owner_query = query;
845 sendreq->next = NULL;
847 server->qtail->next = sendreq;
850 SOCK_STATE_CALLBACK(channel, server->tcp_socket, 1, 1);
851 server->qhead = sendreq;
853 server->qtail = sendreq;
854 query->server_info[query->server].tcp_connection_generation =
855 server->tcp_connection_generation;
859 if (server->udp_socket == ARES_SOCKET_BAD)
861 if (open_udp_socket(channel, server) == -1)
863 skip_server(channel, query, query->server);
864 next_server(channel, query, now);
868 if (socket_write(channel, server->udp_socket, query->qbuf, query->qlen) == -1)
870 /* FIXME: Handle EAGAIN here since it likely can happen. */
871 skip_server(channel, query, query->server);
872 next_server(channel, query, now);
877 /* For each trip through the entire server list, double the channel's
878 * assigned timeout, avoiding overflow. If channel->timeout is negative,
879 * leave it as-is, even though that should be impossible here.
881 timeplus = channel->timeout;
883 /* How many times do we want to double it? Presume sane values here. */
884 const int shift = query->try_count / channel->nservers;
886 /* Is there enough room to shift timeplus left that many times?
888 * To find out, confirm that all of the bits we'll shift away are zero.
889 * Stop considering a shift if we get to the point where we could shift
890 * a 1 into the sign bit (i.e. when shift is within two of the bit
893 * This has the side benefit of leaving negative numbers unchanged.
895 if(shift <= (int)(sizeof(int) * CHAR_BIT - 1)
896 && (timeplus >> (sizeof(int) * CHAR_BIT - 1 - shift)) == 0)
902 query->timeout = *now;
903 timeadd(&query->timeout, timeplus);
904 /* Keep track of queries bucketed by timeout, so we can process
905 * timeout events quickly.
907 ares__remove_from_list(&(query->queries_by_timeout));
908 ares__insert_in_list(
909 &(query->queries_by_timeout),
910 &(channel->queries_by_timeout[query->timeout.tv_sec %
911 ARES_TIMEOUT_TABLE_SIZE]));
913 /* Keep track of queries bucketed by server, so we can process server
916 ares__remove_from_list(&(query->queries_to_server));
917 ares__insert_in_list(&(query->queries_to_server),
918 &(server->queries_to_server));
922 * setsocknonblock sets the given socket to either blocking or non-blocking
923 * mode based on the 'nonblock' boolean argument. This function is highly
926 static int setsocknonblock(ares_socket_t sockfd, /* operate on this */
927 int nonblock /* TRUE or FALSE */)
929 #if defined(USE_BLOCKING_SOCKETS)
931 return 0; /* returns success */
933 #elif defined(HAVE_FCNTL_O_NONBLOCK)
935 /* most recent unix versions */
937 flags = fcntl(sockfd, F_GETFL, 0);
938 if (FALSE != nonblock)
939 return fcntl(sockfd, F_SETFL, flags | O_NONBLOCK);
941 return fcntl(sockfd, F_SETFL, flags & (~O_NONBLOCK)); /* LCOV_EXCL_LINE */
943 #elif defined(HAVE_IOCTL_FIONBIO)
945 /* older unix versions */
946 int flags = nonblock ? 1 : 0;
947 return ioctl(sockfd, FIONBIO, &flags);
949 #elif defined(HAVE_IOCTLSOCKET_FIONBIO)
952 char flags = nonblock ? 1 : 0;
955 unsigned long flags = nonblock ? 1UL : 0UL;
957 return ioctlsocket(sockfd, FIONBIO, &flags);
959 #elif defined(HAVE_IOCTLSOCKET_CAMEL_FIONBIO)
962 long flags = nonblock ? 1L : 0L;
963 return IoctlSocket(sockfd, FIONBIO, flags);
965 #elif defined(HAVE_SETSOCKOPT_SO_NONBLOCK)
968 long b = nonblock ? 1L : 0L;
969 return setsockopt(sockfd, SOL_SOCKET, SO_NONBLOCK, &b, sizeof(b));
972 # error "no non-blocking method was found/used/set"
976 static int configure_socket(ares_socket_t s, int family, ares_channel channel)
980 struct sockaddr_in sa4;
981 struct sockaddr_in6 sa6;
984 /* do not set options for user-managed sockets */
985 if (channel->sock_funcs)
988 (void)setsocknonblock(s, TRUE);
990 #if defined(FD_CLOEXEC) && !defined(MSDOS)
991 /* Configure the socket fd as close-on-exec. */
992 if (fcntl(s, F_SETFD, FD_CLOEXEC) == -1)
993 return -1; /* LCOV_EXCL_LINE */
996 /* Set the socket's send and receive buffer sizes. */
997 if ((channel->socket_send_buffer_size > 0) &&
998 setsockopt(s, SOL_SOCKET, SO_SNDBUF,
999 (void *)&channel->socket_send_buffer_size,
1000 sizeof(channel->socket_send_buffer_size)) == -1)
1003 if ((channel->socket_receive_buffer_size > 0) &&
1004 setsockopt(s, SOL_SOCKET, SO_RCVBUF,
1005 (void *)&channel->socket_receive_buffer_size,
1006 sizeof(channel->socket_receive_buffer_size)) == -1)
1009 #ifdef SO_BINDTODEVICE
1010 if (channel->local_dev_name[0]) {
1011 if (setsockopt(s, SOL_SOCKET, SO_BINDTODEVICE,
1012 channel->local_dev_name, sizeof(channel->local_dev_name))) {
1013 /* Only root can do this, and usually not fatal if it doesn't work, so */
1014 /* just continue on. */
1019 if (family == AF_INET) {
1020 if (channel->local_ip4) {
1021 memset(&local.sa4, 0, sizeof(local.sa4));
1022 local.sa4.sin_family = AF_INET;
1023 local.sa4.sin_addr.s_addr = htonl(channel->local_ip4);
1024 if (bind(s, &local.sa, sizeof(local.sa4)) < 0)
1028 else if (family == AF_INET6) {
1029 if (memcmp(channel->local_ip6, &ares_in6addr_any,
1030 sizeof(channel->local_ip6)) != 0) {
1031 memset(&local.sa6, 0, sizeof(local.sa6));
1032 local.sa6.sin6_family = AF_INET6;
1033 memcpy(&local.sa6.sin6_addr, channel->local_ip6,
1034 sizeof(channel->local_ip6));
1035 if (bind(s, &local.sa, sizeof(local.sa6)) < 0)
1043 static int open_tcp_socket(ares_channel channel, struct server_state *server)
1047 ares_socklen_t salen;
1049 struct sockaddr_in sa4;
1050 struct sockaddr_in6 sa6;
1052 struct sockaddr *sa;
1054 switch (server->addr.family)
1057 sa = (void *)&saddr.sa4;
1058 salen = sizeof(saddr.sa4);
1059 memset(sa, 0, salen);
1060 saddr.sa4.sin_family = AF_INET;
1061 if (server->addr.tcp_port) {
1062 saddr.sa4.sin_port = aresx_sitous(server->addr.tcp_port);
1064 saddr.sa4.sin_port = aresx_sitous(channel->tcp_port);
1066 memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4,
1067 sizeof(server->addr.addrV4));
1070 sa = (void *)&saddr.sa6;
1071 salen = sizeof(saddr.sa6);
1072 memset(sa, 0, salen);
1073 saddr.sa6.sin6_family = AF_INET6;
1074 if (server->addr.tcp_port) {
1075 saddr.sa6.sin6_port = aresx_sitous(server->addr.tcp_port);
1077 saddr.sa6.sin6_port = aresx_sitous(channel->tcp_port);
1079 memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6,
1080 sizeof(server->addr.addrV6));
1083 return -1; /* LCOV_EXCL_LINE */
1086 /* Acquire a socket. */
1087 s = ares__open_socket(channel, server->addr.family, SOCK_STREAM, 0);
1088 if (s == ARES_SOCKET_BAD)
1092 if (configure_socket(s, server->addr.family, channel) < 0)
1094 ares__close_socket(channel, s);
1100 * Disable the Nagle algorithm (only relevant for TCP sockets, and thus not
1101 * in configure_socket). In general, in DNS lookups we're pretty much
1102 * interested in firing off a single request and then waiting for a reply,
1103 * so batching isn't very interesting.
1106 if (channel->sock_funcs == 0
1108 setsockopt(s, IPPROTO_TCP, TCP_NODELAY,
1109 (void *)&opt, sizeof(opt)) == -1)
1111 ares__close_socket(channel, s);
1116 if (channel->sock_config_cb)
1118 int err = channel->sock_config_cb(s, SOCK_STREAM,
1119 channel->sock_config_cb_data);
1122 ares__close_socket(channel, s);
1127 /* Connect to the server. */
1128 if (ares__connect_socket(channel, s, sa, salen) == -1)
1130 int err = SOCKERRNO;
1132 if (err != EINPROGRESS && err != EWOULDBLOCK)
1134 ares__close_socket(channel, s);
1139 if (channel->sock_create_cb)
1141 int err = channel->sock_create_cb(s, SOCK_STREAM,
1142 channel->sock_create_cb_data);
1145 ares__close_socket(channel, s);
1150 SOCK_STATE_CALLBACK(channel, s, 1, 0);
1151 server->tcp_buffer_pos = 0;
1152 server->tcp_socket = s;
1153 server->tcp_connection_generation = ++channel->tcp_connection_generation;
1157 static int open_udp_socket(ares_channel channel, struct server_state *server)
1160 ares_socklen_t salen;
1162 struct sockaddr_in sa4;
1163 struct sockaddr_in6 sa6;
1165 struct sockaddr *sa;
1167 switch (server->addr.family)
1170 sa = (void *)&saddr.sa4;
1171 salen = sizeof(saddr.sa4);
1172 memset(sa, 0, salen);
1173 saddr.sa4.sin_family = AF_INET;
1174 if (server->addr.udp_port) {
1175 saddr.sa4.sin_port = aresx_sitous(server->addr.udp_port);
1177 saddr.sa4.sin_port = aresx_sitous(channel->udp_port);
1179 memcpy(&saddr.sa4.sin_addr, &server->addr.addrV4,
1180 sizeof(server->addr.addrV4));
1183 sa = (void *)&saddr.sa6;
1184 salen = sizeof(saddr.sa6);
1185 memset(sa, 0, salen);
1186 saddr.sa6.sin6_family = AF_INET6;
1187 if (server->addr.udp_port) {
1188 saddr.sa6.sin6_port = aresx_sitous(server->addr.udp_port);
1190 saddr.sa6.sin6_port = aresx_sitous(channel->udp_port);
1192 memcpy(&saddr.sa6.sin6_addr, &server->addr.addrV6,
1193 sizeof(server->addr.addrV6));
1196 return -1; /* LCOV_EXCL_LINE */
1199 /* Acquire a socket. */
1200 s = ares__open_socket(channel, server->addr.family, SOCK_DGRAM, 0);
1201 if (s == ARES_SOCKET_BAD)
1204 /* Set the socket non-blocking. */
1205 if (configure_socket(s, server->addr.family, channel) < 0)
1207 ares__close_socket(channel, s);
1211 if (channel->sock_config_cb)
1213 int err = channel->sock_config_cb(s, SOCK_DGRAM,
1214 channel->sock_config_cb_data);
1217 ares__close_socket(channel, s);
1222 /* Connect to the server. */
1223 if (ares__connect_socket(channel, s, sa, salen) == -1)
1225 int err = SOCKERRNO;
1227 if (err != EINPROGRESS && err != EWOULDBLOCK)
1229 ares__close_socket(channel, s);
1234 if (channel->sock_create_cb)
1236 int err = channel->sock_create_cb(s, SOCK_DGRAM,
1237 channel->sock_create_cb_data);
1240 ares__close_socket(channel, s);
1245 SOCK_STATE_CALLBACK(channel, s, 1, 0);
1247 server->udp_socket = s;
/* Verify that the question section of a DNS reply (abuf/alen) matches the
 * question section of the query we sent (qbuf/qlen): equal qdcount and, for
 * every question in the query, a question in the reply with the same name
 * (compared case-insensitively), type and class.  This pairs replies with
 * outstanding queries and rejects responses whose questions were altered.
 * NOTE(review): some lines of this function are elided in this listing;
 * comments below describe only the visible code. */
1251 static int same_questions(const unsigned char *qbuf, int qlen,
1252 const unsigned char *abuf, int alen)
1255 const unsigned char *p;
/* Both buffers must contain at least a full fixed-size DNS header. */
1264 if (qlen < HFIXEDSZ || alen < HFIXEDSZ)
1267 /* Extract qdcount from the request and reply buffers and compare them. */
1268 q.qdcount = DNS_HEADER_QDCOUNT(qbuf);
1269 a.qdcount = DNS_HEADER_QDCOUNT(abuf);
1270 if (q.qdcount != a.qdcount)
1273 /* For each question in qbuf, find it in abuf. */
1274 q.p = qbuf + HFIXEDSZ;
1275 for (i = 0; i < q.qdcount; i++)
1277 /* Decode the question in the query. */
1278 if (ares_expand_name(q.p, qbuf, qlen, &q.name, &q.namelen)
/* Bounds check: the fixed type/class fields must fit inside qbuf. */
1282 if (q.p + QFIXEDSZ > qbuf + qlen)
1287 q.type = DNS_QUESTION_TYPE(q.p);
1288 q.dnsclass = DNS_QUESTION_CLASS(q.p);
1291 /* Search for this question in the answer. */
1292 a.p = abuf + HFIXEDSZ;
1293 for (j = 0; j < a.qdcount; j++)
1295 /* Decode the question in the answer. */
1296 if (ares_expand_name(a.p, abuf, alen, &a.name, &a.namelen)
/* Bounds check: the fixed type/class fields must fit inside abuf. */
1303 if (a.p + QFIXEDSZ > abuf + alen)
1309 a.type = DNS_QUESTION_TYPE(a.p);
1310 a.dnsclass = DNS_QUESTION_CLASS(a.p);
1313 /* Compare the decoded questions.  strcasecmp because DNS names are
 * case-insensitive. */
1314 if (strcasecmp(q.name, a.name) == 0 && q.type == a.type
1315 && q.dnsclass == a.dnsclass)
/* Compare a generic socket address (sa) against a c-ares address (aa).
 * Returns 1 when the address families match and the raw IPv4 or IPv6
 * address bytes are identical, 0 otherwise.  Used to confirm a UDP reply
 * really came from the server we queried.
 * NOTE(review): the switch scaffolding around the two cases is elided in
 * this listing. */
1330 static int same_address(struct sockaddr *sa, struct ares_addr *aa)
/* Only compare further when both addresses are in the same family. */
1335 if (sa->sa_family == aa->family)
/* IPv4 case: compare the 4-byte in_addr values. */
1340 addr1 = &aa->addrV4;
1341 addr2 = &(CARES_INADDR_CAST(struct sockaddr_in *, sa))->sin_addr;
1342 if (memcmp(addr1, addr2, sizeof(aa->addrV4)) == 0)
1343 return 1; /* match */
/* IPv6 case: compare the 16-byte in6_addr values. */
1346 addr1 = &aa->addrV6;
1347 addr2 = &(CARES_INADDR_CAST(struct sockaddr_in6 *, sa))->sin6_addr;
1348 if (memcmp(addr1, addr2, sizeof(aa->addrV6)) == 0)
1349 return 1; /* match */
1352 break; /* LCOV_EXCL_LINE */
1355 return 0; /* different */
1358 /* search for an OPT RR in the response */
/* Walk a raw DNS response (abuf/alen) and report whether its additional
 * section contains an EDNS OPT pseudo-RR (type 41).  The question, answer
 * and authority sections are skipped first; every name expansion and
 * pointer advance is bounds-checked against abuf+alen.
 * NOTE(review): several return statements and pointer advances are elided
 * in this listing; comments describe only the visible code. */
1359 static int has_opt_rr(const unsigned char *abuf, int alen)
1361 unsigned int qdcount, ancount, nscount, arcount, i;
1362 const unsigned char *aptr;
/* A response shorter than the fixed header cannot be parsed. */
1365 if (alen < HFIXEDSZ)
1368 /* Parse the answer header. */
1369 qdcount = DNS_HEADER_QDCOUNT(abuf);
1370 ancount = DNS_HEADER_ANCOUNT(abuf);
1371 nscount = DNS_HEADER_NSCOUNT(abuf);
1372 arcount = DNS_HEADER_ARCOUNT(abuf);
1374 aptr = abuf + HFIXEDSZ;
1376 /* skip the questions */
1377 for (i = 0; i < qdcount; i++)
/* Expand the name only to learn its encoded length; the string itself
 * is not needed, so it is freed immediately. */
1381 status = ares_expand_name(aptr, abuf, alen, &name, &len);
1382 if (status != ARES_SUCCESS)
1384 ares_free_string(name);
1385 if (aptr + len + QFIXEDSZ > abuf + alen)
1387 aptr += len + QFIXEDSZ;
1390 /* skip the ancount and nscount */
1391 for (i = 0; i < ancount + nscount; i++)
1396 status = ares_expand_name(aptr, abuf, alen, &name, &len);
1397 if (status != ARES_SUCCESS)
1399 ares_free_string(name);
1400 if (aptr + len + RRFIXEDSZ > abuf + alen)
/* Skip over the RR's RDATA as well, using its declared length. */
1403 dlen = DNS_RR_LEN(aptr);
1405 if (aptr + dlen > abuf + alen)
1410 /* search for rr type (41) - opt */
1411 for (i = 0; i < arcount; i++)
1416 status = ares_expand_name(aptr, abuf, alen, &name, &len);
1417 if (status != ARES_SUCCESS)
1419 ares_free_string(name);
1420 if (aptr + len + RRFIXEDSZ > abuf + alen)
/* T_OPT (type 41) marks an EDNS-capable response. */
1424 if (DNS_RR_TYPE(aptr) == T_OPT)
1427 dlen = DNS_RR_LEN(aptr);
1429 if (aptr + dlen > abuf + alen)
/* Finish a query: detach it from any TCP send queues that still reference
 * its buffer, invoke the user callback with the final status and answer,
 * free the query, and (unless ARES_FLAG_STAYOPEN is set) close all server
 * sockets once no queries remain.
 * NOTE(review): some closing braces and intermediate lines are elided in
 * this listing; comments describe only the visible code. */
1437 static void end_query (ares_channel channel, struct query *query, int status,
1438 unsigned char *abuf, int alen)
1442 /* First we check to see if this query ended while one of our send
1443 * queues still has pointers to it.
1445 for (i = 0; i < channel->nservers; i++)
1447 struct server_state *server = &channel->servers[i];
1448 struct send_request *sendreq;
1449 for (sendreq = server->qhead; sendreq; sendreq = sendreq->next)
1450 if (sendreq->owner_query == query)
1452 sendreq->owner_query = NULL;
/* A sendreq owned by a query must not already carry its own copy. */
1453 assert(sendreq->data_storage == NULL);
1454 if (status == ARES_SUCCESS)
1456 /* We got a reply for this query, but this queued sendreq
1457 * points into this soon-to-be-gone query's tcpbuf. Probably
1458 * this means we timed out and queued the query for
1459 * retransmission, then received a response before actually
1460 * retransmitting. This is perfectly fine, so we want to keep
1461 * the connection running smoothly if we can. But in the worst
1462 * case we may have sent only some prefix of the query, with
1463 * some suffix of the query left to send. Also, the buffer may
1464 * be queued on multiple queues. To prevent dangling pointers
1465 * to the query's tcpbuf and handle these cases, we just give
1466 * such sendreqs their own copy of the query packet.
1468 sendreq->data_storage = ares_malloc(sendreq->len);
1469 if (sendreq->data_storage != NULL)
1471 memcpy(sendreq->data_storage, sendreq->data, sendreq->len);
1472 sendreq->data = sendreq->data_storage;
/* Either the query failed, or the defensive copy above could not be
 * allocated: the queued data would dangle, so break the connection. */
1475 if ((status != ARES_SUCCESS) || (sendreq->data_storage == NULL))
1477 /* We encountered an error (probably a timeout, suggesting the
1478 * DNS server we're talking to is probably unreachable,
1479 * wedged, or severely overloaded) or we couldn't copy the
1480 * request, so mark the connection as broken. When we get to
1481 * process_broken_connections() we'll close the connection and
1482 * try to re-send requests to another server.
1484 server->is_broken = 1;
1485 /* Just to be paranoid, zero out this sendreq... */
1486 sendreq->data = NULL;
1492 /* Invoke the callback */
1493 query->callback(query->arg, status, query->timeouts, abuf, alen);
1494 ares__free_query(query);
1496 /* Simple cleanup policy: if no queries are remaining, close all network
1497 * sockets unless STAYOPEN is set.
1499 if (!(channel->flags & ARES_FLAG_STAYOPEN) &&
1500 ares__is_list_empty(&(channel->all_queries)))
1502 for (i = 0; i < channel->nservers; i++)
1503 ares__close_sockets(channel, &channel->servers[i]);
/* Tear down a query object: unlink it from every bookkeeping list it sits
 * on, clear the callback pointer to help catch use-after-free bugs, and
 * release the buffers it owns.
 * NOTE(review): the trailing lines of this function (presumably freeing
 * the query struct itself) are elided in this listing. */
1507 void ares__free_query(struct query *query)
1509 /* Remove the query from all the lists in which it is linked */
1510 ares__remove_from_list(&(query->queries_by_qid));
1511 ares__remove_from_list(&(query->queries_by_timeout));
1512 ares__remove_from_list(&(query->queries_to_server));
1513 ares__remove_from_list(&(query->all_queries));
1514 /* Zero out some important stuff, to help catch bugs */
1515 query->callback = NULL;
1517 /* Deallocate the memory associated with the query */
1518 ares_free(query->tcpbuf);
1519 ares_free(query->server_info);
/* Create a socket of the given family/type/protocol.  When the application
 * installed custom socket functions on the channel, its asocket callback is
 * used (with the user's callback data); otherwise fall back to socket(2).
 * NOTE(review): the middle argument lines of the callback invocation are
 * elided in this listing. */
1523 ares_socket_t ares__open_socket(ares_channel channel,
1524 int af, int type, int protocol)
1526 if (channel->sock_funcs)
1527 return channel->sock_funcs->asocket(af,
1530 channel->sock_func_cb_data);
1532 return socket(af, type, protocol);
/* Connect sockfd to addr/addrlen.  Routes through the application's
 * aconnect callback when custom socket functions are installed on the
 * channel; otherwise calls connect(2) directly.
 * NOTE(review): the middle argument lines of the callback invocation are
 * elided in this listing. */
1535 int ares__connect_socket(ares_channel channel,
1536 ares_socket_t sockfd,
1537 const struct sockaddr *addr,
1538 ares_socklen_t addrlen)
1540 if (channel->sock_funcs)
1541 return channel->sock_funcs->aconnect(sockfd,
1544 channel->sock_func_cb_data);
1546 return connect(sockfd, addr, addrlen);
1549 void ares__close_socket(ares_channel channel, ares_socket_t s)
1551 if (channel->sock_funcs)
1552 channel->sock_funcs->aclose(s, channel->sock_func_cb_data);