1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /* camel-mbox-parser.c : mbox folder parser */
6 * Copyright (C) 1999 Bertrand Guiheneuf <bertrand@helixcode.com> .
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
26 #include "camel-mbox-parser.h"
27 #include "camel-log.h"
28 #include "camel-exception.h"
29 #include <sys/types.h>
/* size, in bytes, of the parser's temporary read buffer */
38 #define MBOX_PARSER_BUF_SIZE 10000
/* header keywords and their lengths; they are matched against the
   buffer with g_strncasecmp, so letter case does not matter */
40 #define MBOX_PARSER_FROM_KW "from:"
41 #define MBOX_PARSER_FROM_KW_SZ 5
43 #define MBOX_PARSER_DATE_KW "date:"
44 #define MBOX_PARSER_DATE_KW_SZ 5
46 #define MBOX_PARSER_SUBJECT_KW "subject:"
47 #define MBOX_PARSER_SUBJECT_KW_SZ 8
49 #define MBOX_PARSER_X_EVOLUTION_KW "x-evolution:"
50 #define MBOX_PARSER_X_EVOLUTION_KW_SZ 12
52 /* the maximum length of all the previous keywords */
53 #define MBOX_PARSER_MAX_KW_SIZE 12
/* default value of message_summary_size: the upper bound on the number
   of body characters collected for a message summary */
56 #define MBOX_PARSER_SUMMARY_SIZE 150
/* Parser state: the fields of CamelMboxPreParser. */
65 int fd; /* file descriptor of the mbox file */
66 glong real_position; /* real position in the file */
69 gchar *message_delimiter; /* message delimiter string */
/* strlen of message_delimiter, computed once in new_parser () */
70 guint message_delimiter_length;
72 guint message_summary_size; /* how many characters from the beginning of the
73 mail to put into the message summary */
75 GArray *preparsed_messages; /* array of CamelMboxParserMessageInfo */
76 CamelMboxParserMessageInfo current_message_info; /* used to store current info */
77 gboolean is_pending_message; /* is there some message information pending ? */
80 gchar *buffer; /* temporary buffer */
81 guint left_chunk_size; /* size of the left chunk in the temp buffer */
82 guint last_position; /* last position that can be compared to a keyword */
83 guint current_position; /* current position in the temp buffer */
84 gboolean eof; /* did we read the entire file */
87 GString *tmp_string; /* temporary string to fill the headers in */
94 /* clear a preparsing info structure */
/* Sets the position, size and offset fields to 0 and the string
   pointers to NULL. Nothing is freed here: the previous pointer
   values are simply overwritten. */
96 clear_message_info (CamelMboxParserMessageInfo *preparsing_info)
98 preparsing_info->message_position = 0;
99 preparsing_info->size = 0;
100 preparsing_info->from = NULL;
101 preparsing_info->date = NULL;
102 preparsing_info->subject = NULL;
103 preparsing_info->status = NULL;
104 preparsing_info->priority = NULL;
105 preparsing_info->references = NULL;
106 preparsing_info->body_summary = NULL;
107 preparsing_info->end_of_headers_offset = 0;
109 preparsing_info->x_evolution = NULL;
110 preparsing_info->x_evolution_offset = 0;
/* the reset of x_evolution_length below is disabled */
111 /* preparsing_info->x_evolution_length = 0; */
118 * new_parser: create a new parser object
119 * @fd: file descriptor opened on the mbox file
120 * @message_delimiter: the string that announces the start of a new message.
122 * Create a new parser object. This object holds all the
123 * information concerning the parsing process.
125 * Return value: The newly created parser object.
/* Allocates a zero-filled parser together with its read buffer, a
   private copy of the delimiter, the (initially empty) message array
   and the scratch string. @message_delimiter is passed to strlen ()
   and must therefore not be NULL. */
127 static CamelMboxPreParser *
129 const gchar *message_delimiter)
132 CamelMboxPreParser *parser;
134 parser = g_new0 (CamelMboxPreParser, 1);
137 parser->buffer = g_new (gchar, MBOX_PARSER_BUF_SIZE);
138 parser->current_position = 0;
139 parser->message_delimiter = g_strdup (message_delimiter);
140 parser->message_delimiter_length = strlen (message_delimiter);
141 parser->real_position = 0;
142 parser->preparsed_messages = g_array_new (FALSE, FALSE, sizeof (CamelMboxParserMessageInfo));
143 parser->message_summary_size = MBOX_PARSER_SUMMARY_SIZE;
/* the left chunk is sized to the longest string the parser compares
   against: the delimiter or the longest header keyword */
145 parser->left_chunk_size = MAX (parser->message_delimiter_length, MBOX_PARSER_MAX_KW_SIZE);
/* scratch string, pre-sized for 1000 characters */
148 parser->tmp_string = g_string_sized_new (1000);
156 * parser_free: free the parser object
157 * @parser: the parser object to free.
159 * It is important to note that all structures allocated
160 * in new_parser () are freed ** EXCEPT ** the message
161 * information array, i.e. the preparsed_messages
/* Releases the read buffer, the delimiter copy and the scratch string
   (including its character data, hence TRUE). preparsed_messages is
   not released: camel_mbox_parse_file () returns that array to its
   caller.
   NOTE(review): no g_free (parser) is visible in this listing --
   confirm that the parser structure itself is released. */
165 parser_free (CamelMboxPreParser *parser)
167 g_free (parser->buffer);
168 g_free (parser->message_delimiter);
169 g_string_free (parser->tmp_string, TRUE);
177 /* ** handle exceptions here */
179 * initialize_buffer: read the first chunk of data in the buffer
180 * @parser: parser object to fill
181 * @first_position: position to start the read at
183 * read the first chunk of data from the mbox file.
/* Seeks parser->fd to @first_position, pads the first left_chunk_size
   bytes of the buffer with '\n' and reads the first chunk right after
   that padding. The read is retried while it fails with EINTR.
   NOTE(review): the lseek () result check is commented out, a read ()
   failure other than EINTR is not handled, and buf_nb_read is stored
   unchecked into the unsigned last_position. */
187 initialize_buffer (CamelMboxPreParser *parser,
188 glong first_position)
195 /* set the search start position */
196 seek_res = lseek (parser->fd, first_position, SEEK_SET);
197 //if (seek_res == (off_t)-1) goto io_error;
200 /* the first part of the buffer is filled with newlines,
201 but the next time a chunk of buffer is read, it will
202 be filled with the last bytes of the previous chunk.
203 This allows simple g_strcasecmp to test for the presence of
205 memset (parser->buffer, '\n', parser->left_chunk_size);
207 buf_nb_read = read (parser->fd, parser->buffer + parser->left_chunk_size,
208 MBOX_PARSER_BUF_SIZE - parser->left_chunk_size);
209 } while ((buf_nb_read == -1) && (errno == EINTR));
210 /* ** check for an error here */
212 parser->last_position = buf_nb_read;
214 if (buf_nb_read == 0)
/* scanning starts just after the newline padding */
217 parser->current_position = parser->left_chunk_size;
224 * read_next_buffer_chunk: read the next chunk of data in the mbox file
225 * @parser: parser object
227 * read the next chunk of data in the mbox file.
228 * The routine copies the last part of the buffer to
229 * the beginning and appends the newly read data to
230 * it. This allows strcmp of keywords in the buffer,
231 * until the last position. That means you can
232 * do a strcmp (buffer, keyword) for any of the
233 * keywords defined at the beginning of this file.
/* Moves the last left_chunk_size bytes of the buffer to its start,
   reads new data from parser->fd right after them (retrying on EINTR),
   records the read count in last_position and resets current_position
   to 0.
   NOTE(review): the bytes carried over are always the last
   left_chunk_size bytes of the full-size buffer, whatever the previous
   read () returned -- confirm this is right after a short read. A
   read () failure other than EINTR is not handled. */
237 read_next_buffer_chunk (CamelMboxPreParser *parser)
243 /* read the next chunk of data in the folder file : */
244 /* - first, copy the last bytes from the previous
245 chunk at the beginning of the new one. */
246 memcpy (parser->buffer,
247 parser->buffer + MBOX_PARSER_BUF_SIZE - parser->left_chunk_size,
248 parser->left_chunk_size);
250 /* - then read the next chunk on disk */
252 buf_nb_read = read (parser->fd,
253 parser->buffer + parser->left_chunk_size,
254 MBOX_PARSER_BUF_SIZE - parser->left_chunk_size);
255 } while ((buf_nb_read == -1) && (errno == EINTR));
256 /* ** check for an error here */
258 parser->last_position = buf_nb_read;
260 if (buf_nb_read == 0)
263 parser->current_position = 0;
270 * goto_next_char: go one position forward in the buffer
271 * @parser: parser object
273 * Go one position forward in the buffer. If necessary,
274 * read the next chunk of data in the file, possibly
275 * raising the parser->eof flag.
/* Steps to the next character: current_position is advanced inside the
   current chunk, read_next_buffer_chunk () supplies the following
   chunk, and real_position is incremented.
   NOTE(review): last_position is a guint, so "last_position - 1" wraps
   to a very large value when last_position is 0 (for instance after a
   read () that returned 0); the test is then true for every
   current_position. */
279 goto_next_char (CamelMboxPreParser *parser)
281 if (parser->current_position < parser->last_position - 1)
282 parser->current_position++;
284 read_next_buffer_chunk (parser);
286 parser->real_position++;
296 * advance_n_chars: go n positions forward in the buffer.
297 * @parser: parser object
298 * @n: number of characters to advance.
/* Moves @n characters forward and adds @n to real_position. When fewer
   than @n characters remain before last_position, one new chunk is
   loaded and current_position is set to the part of @n that the old
   chunk did not cover.
   NOTE(review): the printf () calls look like debugging output left in
   the reload path, and they print guint values with %d. @n (guint) is
   compared with the gint position_to_the_end, so that comparison is
   carried out on unsigned values. */
302 advance_n_chars (CamelMboxPreParser *parser, guint n)
305 gint position_to_the_end;
307 position_to_the_end = parser->last_position - parser->current_position;
309 if (n < position_to_the_end)
310 parser->current_position += n;
312 printf ("Advance %d chars\n", n);
313 printf ("Last position = %d\n", parser->last_position);
314 printf ("Current position = %d\n", parser->current_position);
315 read_next_buffer_chunk (parser);
316 parser->current_position = n - position_to_the_end;
317 printf ("New position = %d\n", parser->current_position);
320 parser->real_position += n;
328 /* called when the parser has detected the beginning of
329 a new message. This routine is supposed to simply
330 store the previous message information and
331 clean the temporary structure used to store
336 * new_message_detected: routine to call when a new message has been detected
337 * @parser: parser object.
339 * This routine must be called when the keyword determining the
340 * beginning of a new message has been detected. It pushes the
341 * information fetched for the last message into the message information
342 * array. Also, it gets the parser to the end of the line.
/* Stores the record of the message that was being filled (if any),
   clears the scratch record, consumes characters of the delimiter
   line and starts a new pending record at the resulting file
   position. */
345 new_message_detected (CamelMboxPreParser *parser)
350 /* if we were filling a message information
351 save it in the message information array */
353 if (parser->is_pending_message) {
/* size = distance from the recorded start of the message to the
   current file position */
354 parser->current_message_info.size =
355 parser->real_position - parser->current_message_info.message_position;
/* (gchar *)parser + G_STRUCT_OFFSET (...) is the address of
   parser->current_message_info; one element is copied into the array */
356 g_array_append_vals (parser->preparsed_messages, (gchar *)parser +
357 G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info), 1);
/* the scratch record is reset without freeing anything; the copy
   appended above still holds the string pointers */
360 clear_message_info ( &(parser->current_message_info));
362 /* go to the end of the line */
365 c = parser->buffer[parser->current_position];
366 goto_next_char (parser);
370 /* save message position in the message information structure */
371 (parser->current_message_info).message_position = parser->real_position;
373 parser->is_pending_message = TRUE;
384 * read_header: read the header content contained after the current position.
385 * @parser: the parser object.
386 * @header_content: a pointer on a (char *) variable to feed with the obtained header string.
388 * This routine must be called when the parser has detected a header
389 * and it wants the header content to be stored. The parser's current position
390 * must be located EXACTLY at the beginning of the header content line.
391 * For example, if the file contains the line :
392 * from:Bertrand Guiheneuf <bertrand@helixcode.com>
394 * When this routine is called, the parser must be located
395 * on the "B" of "Bertrand".
397 * When this routine returns, the parser is located just
398 * after the "\n" at the end of the header content.
/* Collects the value of a header, starting at the parser's current
   position, into tmp_string and hands a newly allocated copy of it
   back through @header_content (made with g_strndup, so it is released
   with g_free). The loop stops at end of file or once header_end is
   set.
   NOTE(review): a value already stored in *header_content is
   overwritten without being freed -- confirm against the callers. */
402 read_header (CamelMboxPreParser *parser, gchar **header_content)
404 gboolean space = FALSE;
405 gboolean newline = FALSE;
406 gboolean header_end = FALSE;
413 /* reset the header buffer string */
414 parser->tmp_string = g_string_truncate (parser->tmp_string, 0);
416 buffer = parser->buffer;
418 while (! (parser->eof || header_end) ) {
420 /* read the current character */
421 c = buffer[parser->current_position];
/* a character can never be both ' ' and '\t': the whitespace test
   has to be an OR, otherwise this branch is never taken */
424 if (c == ' ' || c == '\t')
/* same whitespace test, again OR instead of AND */
431 if (c == ' ' || c == '\t') {
448 /* feed the header content */
449 parser->tmp_string = g_string_append_c (parser->tmp_string, c);
451 next_char: /* read next char in the buffer */
452 goto_next_char (parser);
456 /* copy the buffer in the preparsing information structure */
457 *header_content = g_strndup (parser->tmp_string->str, parser->tmp_string->len);
467 * read_message_begining: read the first characters of a message body
468 * @parser: parser object
469 * @message_summary: a pointer on a (gchar *) variable where the obtained string will be stored.
471 * Read the first lines of a message. When calling this routine, the
472 * parser must be located at the beginning of the message body.
474 * Return value: %TRUE if the last character read inside this routine was a newline, %FALSE otherwise.
/* Copies the start of the message body into tmp_string and hands a
   newly allocated copy (g_strndup) back through @message_summary.
   The loop ends at end of file, when nb_line reaches 2, when nb_read
   reaches message_summary_size, or when new_message is set; the
   visible test for the latter is a message delimiter (or end of file)
   found right after a newline. */
477 read_message_begining (CamelMboxPreParser *parser, gchar **message_summary)
481 gboolean new_message = FALSE;
485 /* reset the header buffer string */
486 parser->tmp_string = g_string_truncate (parser->tmp_string, 0);
488 buffer = parser->buffer;
489 /* the message should not be filled character by
490 character but there is no g_string_n_append
491 function, so for the moment, this is a lazy
493 while (! (parser->eof) && (nb_line <2) && (nb_read<parser->message_summary_size) && (!new_message)) {
496 /* test if we are not at the end of the message */
497 if (buffer[parser->current_position] == '\n') {
500 goto_next_char (parser);
501 if ((parser->eof) || (g_strncasecmp (parser->buffer + parser->current_position,
502 parser->message_delimiter,
503 parser->message_delimiter_length) == 0)) {
507 /* we're not at the end, so let's just add the cr to the summary */
508 parser->tmp_string = g_string_append_c (parser->tmp_string,
517 parser->tmp_string = g_string_append_c (parser->tmp_string,
518 buffer[parser->current_position]);
520 goto_next_char (parser);
523 *message_summary = g_strndup (parser->tmp_string->str, parser->tmp_string->len);
538 * camel_mbox_parse_file: read an mbox file and parse it.
539 * @fd: file descriptor opened on the mbox file.
540 * @message_delimiter: character string delimiting the beginning of a new message
541 * @start_position: position in the file where to start the parsing.
542 * @get_message_summary: should the parser retrieve the beginning of the messages
543 * @status_callback: function to call periodically to indicate the progress of the parser
544 * @status_interval: floating value between 0 and 1 indicating how often to call @status_callback.
545 * @user_data: user data that will be passed to the callback function
547 * This routine parses an mbox file and retrieves both the message starting positions and
548 * some of the information contained in the message. This information is mainly
549 * some RFC822 header values but also (optionally) the first characters of the mail
550 * body. The @get_message_summary parameter enables or disables this option.
553 * Return value: An array of CamelMboxParserMessageInfo containing the information on each message parsed in the file
/* Scans the mbox file open on @fd, starting at @start_position. Each
   time the message delimiter is matched, new_message_detected () is
   called; while the headers of a message are being parsed, the
   "from:", "date:", "subject:" and "x-evolution:" headers are stored
   in the current message record, and an empty line ends the headers
   (and, when @get_message_summary is set, triggers the reading of the
   body summary).
   @status_callback, when non-NULL, is called each time the file
   position has moved more than status_interval * file size past the
   last reported milestone.
   The returned array is parser->preparsed_messages; parser_free ()
   does not release it.
   NOTE(review): only a warning is visible on fstat () failure --
   confirm that the function does not go on to read stat_buf. */
556 camel_mbox_parse_file (int fd,
557 const gchar *message_delimiter,
558 glong start_position,
559 gboolean get_message_summary,
560 camel_mbox_preparser_status_callback *status_callback,
561 double status_interval,
564 CamelMboxPreParser *parser;
565 gboolean is_parsing_a_message = FALSE;
567 struct stat stat_buf;
569 glong total_file_size;
573 GArray *return_value;
576 fstat_result = fstat (fd, &stat_buf);
577 if (fstat_result == -1) {
578 g_warning ("Manage exception here \n");
581 total_file_size = stat_buf.st_size;
582 real_interval = status_interval * total_file_size;
585 /* create the parser */
586 parser = new_parser (fd, message_delimiter);
588 /* initialize the temporary char buffer */
589 initialize_buffer (parser, start_position);
591 /* the first line is indeed at the beginning of a new line ... */
594 while (!parser->eof) {
599 /* read the current character */
601 c = parser->buffer[parser->current_position];
602 newline = (c == '\n');
603 goto_next_char (parser);
608 /* check if we reached a status milestone */
609 if ( status_callback && ((parser->real_position - last_status) > real_interval)) {
610 last_status += real_interval;
611 status_callback ((double)last_status / (double)total_file_size,
615 /* is the next part a message delimiter ? */
616 if (g_strncasecmp (parser->buffer + parser->current_position,
617 parser->message_delimiter,
618 parser->message_delimiter_length) == 0) {
620 is_parsing_a_message = TRUE;
621 new_message_detected (parser);
627 if (is_parsing_a_message) {
628 /* we could find the headers in a clever way, like
629 storing them in a list of pair
630 [keyword, offset_in_CamelMboxParserMessageInfo]
631 I am too busy for now. Contribution welcome */
633 /* is the next part a "from" header ? */
634 if (g_strncasecmp (parser->buffer + parser->current_position,
636 MBOX_PARSER_FROM_KW_SZ) == 0) {
638 advance_n_chars (parser, MBOX_PARSER_FROM_KW_SZ);
639 read_header (parser, (gchar **) ((gchar *)parser +
640 G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
641 G_STRUCT_OFFSET (CamelMboxParserMessageInfo, from)));
647 /* is the next part a "Date" header ? */
648 if (g_strncasecmp (parser->buffer + parser->current_position,
650 MBOX_PARSER_DATE_KW_SZ) == 0) {
652 advance_n_chars (parser, MBOX_PARSER_DATE_KW_SZ);
653 read_header (parser, (gchar **) ((gchar *)parser +
654 G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
655 G_STRUCT_OFFSET (CamelMboxParserMessageInfo, date)));
662 /* is the next part a "Subject" header ? */
663 if (g_strncasecmp (parser->buffer + parser->current_position,
664 MBOX_PARSER_SUBJECT_KW,
665 MBOX_PARSER_SUBJECT_KW_SZ) == 0) {
667 advance_n_chars (parser, MBOX_PARSER_SUBJECT_KW_SZ);
668 read_header (parser, (gchar **) ((gchar *)parser +
669 G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
670 G_STRUCT_OFFSET (CamelMboxParserMessageInfo, subject)));
677 /* is the next part a "X-evolution" header ? */
678 if (g_strncasecmp (parser->buffer + parser->current_position,
679 MBOX_PARSER_X_EVOLUTION_KW,
680 MBOX_PARSER_X_EVOLUTION_KW_SZ) == 0) {
682 /* in the case of the evolution private field, we store
683 the field position as well as its length because
684 we will have to change them */
685 parser->current_message_info.x_evolution_offset = parser->real_position
686 - parser->current_message_info.message_position;
687 advance_n_chars (parser, MBOX_PARSER_X_EVOLUTION_KW_SZ);
688 read_header (parser, (gchar **) ((gchar *)parser +
689 G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
690 G_STRUCT_OFFSET (CamelMboxParserMessageInfo, x_evolution)));
/* NOTE(review): clear_message_info () resets x_evolution_offset, not
   x_evolution_position, and its x_evolution_length reset is commented
   out -- confirm whether the two lines below are live code */
693 parser->current_message_info.x_evolution_length =
694 parser->real_position - parser->current_message_info.x_evolution_position;
703 /* is it an empty line ? */
704 if (parser->buffer[parser->current_position] == '\n') {
706 parser->current_message_info.end_of_headers_offset =
707 parser->real_position - parser->current_message_info.message_position;
709 goto_next_char (parser);
710 if (get_message_summary)
711 newline = read_message_begining (parser, (gchar **) ((gchar *)parser +
712 G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) +
713 G_STRUCT_OFFSET (CamelMboxParserMessageInfo, body_summary)));
715 is_parsing_a_message = FALSE;
724 /* if there is a pending message information put it in the array */
725 if (parser->is_pending_message) {
/* the last message is not followed by a delimiter, so its size is
   computed here, the same way new_message_detected () computes it;
   without this the last record keeps the size 0 set by
   clear_message_info () */
parser->current_message_info.size =
parser->real_position - parser->current_message_info.message_position;
726 g_array_append_vals (parser->preparsed_messages, (gchar *)parser +
727 G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info), 1);
730 return_value = parser->preparsed_messages;
732 /* free the parser */
733 parser_free (parser);
747 #ifdef MBOX_PARSER_TEST
748 /* to build the test :
750 gcc -O3 -I/opt/gnome/lib/glib/include `glib-config --cflags` -o test_parser -DMBOX_PARSER_TEST -I ../.. -I ../../.. -I /usr/lib/glib/include camel-mbox-parser.c `glib-config --libs` -lm
/* Progress callback of the test program: prints the completed
   fraction @done as a whole percentage. @user_data is not used. */
759 status (double done, gpointer user_data)
761 printf ("%d %% done\n", (int)floor (done * 100));
/* Test program: parses the mbox file named by argv[1] and prints, for
   every message found, its size, the stored headers, the body summary
   and up to 49 bytes of the file read back at the recorded message
   position.
   NOTE(review): argv[1] is used without looking at argc, and the
   results of open (), lseek () and read () are not checked. The
   header strings are NULL when a header was not found
   (clear_message_info () sets them to NULL) and are passed to %s as
   is. Only tmp_buffer[49] is set to '\0', so after a read () of fewer
   than 49 bytes the rest of tmp_buffer is whatever it held before. */
764 main (int argc, char **argv)
768 GArray *message_positions;
769 CamelMboxParserMessageInfo *message_info;
770 gchar tmp_buffer[50];
772 tmp_buffer[49] = '\0';
774 test_file_fd = open (argv[1], O_RDONLY);
775 message_positions = camel_mbox_parse_file (test_file_fd,
784 printf ("Found %d messages \n", message_positions->len);
787 for (i=0; i<message_positions->len; i++) {
/* the array data is a contiguous run of CamelMboxParserMessageInfo */
789 message_info = ((CamelMboxParserMessageInfo *)(message_positions->data)) + i;
790 printf ("\n\n** Message %d : \n", i);
791 printf ("Size : %d\n", message_info->size);
792 printf ("From: %s\n", message_info->from);
793 printf ("Date: %s\n", message_info->date);
794 printf ("Subject: %s\n", message_info->subject);
795 printf ("Summary: %s\n", message_info->body_summary) ;
798 lseek (test_file_fd, message_info->message_position, SEEK_SET);
799 read (test_file_fd, tmp_buffer, 49);
800 printf ("File content at position %d : \n===\n%s\n===\n", message_info->message_position, tmp_buffer);
812 #endif /* MBOX_PARSER_TEST */