Backup of the first clean and working mbox file parser. It both finds the
author bertrand <bertrand@helixcode.com>
Wed, 12 Jan 2000 02:09:50 +0000 (02:09 +0000)
committer Bertrand Guiheneuf <bertrand@src.gnome.org>
Wed, 12 Jan 2000 02:09:50 +0000 (02:09 +0000)
2000-01-11  bertrand  <bertrand@helixcode.com>

Backup of the first clean and working mbox file
parser. It both finds the messages and pre-parses
them, that is, it retrieves some key headers
and the first lines of the body.

camel/providers/mbox/Makefile.am
camel/providers/mbox/camel-mbox-folder.c
camel/providers/mbox/camel-mbox-parser.c
camel/providers/mbox/camel-mbox-parser.h

index b614a63..3943cd7 100644 (file)
@@ -13,11 +13,13 @@ INCLUDES = -I.. -I$(srcdir)/.. -I$(includedir)      \
 
 libcamelmbox_la_SOURCES =                      \
        camel-mbox-folder.c                     \
-       camel-mbox-store.c                      
+       camel-mbox-store.c                      \
+       camel-mbox-parser.c
 
 libcamelmboxinclude_HEADERS =                  \
        camel-mbox-folder.h                     \
-       camel-mbox-store.h
+       camel-mbox-store.h                      \
+       camel-mbox-parser.h
 
 
 libcamelmbox_la_LDFLAGS = -version-info 0:0:0 -rpath $(libdir)
index 966218f..674f56c 100644 (file)
@@ -195,7 +195,6 @@ _open (CamelFolder *folder, CamelFolderOpenMode mode, CamelException *ex)
 {
        CamelMboxFolder *mbox_folder = CAMEL_MBOX_FOLDER (folder);
        struct dirent *dir_entry;
-       DIR *dir_handle;
        
        
        if (folder->open_state == FOLDER_OPEN) {
@@ -763,6 +762,7 @@ _list_subfolders (CamelFolder *folder, CamelException *ex)
 
 
 
+#if 0
 
 static CamelMimeMessage *
 _get_message_by_number (CamelFolder *folder, gint number, CamelException *ex)
@@ -803,9 +803,14 @@ _get_message_by_number (CamelFolder *folder, gint number, CamelException *ex)
 #warning Set flags and all this stuff here
                }
                g_free (message_file_name);
+
        } else 
                CAMEL_LOG_FULL_DEBUG  ("CanelMhFolder::get_message message number = %d, not found\n", number);
        
        
        return message;   
 }
+
+#endif 
+
+
index a13032a..e5c18e7 100644 (file)
 #include "camel-mbox-parser.h"
 #include "camel-log.h"
 #include "camel-exception.h"
-
 #include <sys/types.h>
 #include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+
+
+#define MBOX_PARSER_BUF_SIZE 1000                 /* size, in bytes, of the parser read buffer */
+#define MBOX_PARSER_FROM_KW "from:"               
+#define MBOX_PARSER_FROM_KW_SZ 5
+/* each *_KW_SZ constant is the length of the matching keyword, without the trailing NUL */
+#define MBOX_PARSER_DATE_KW "date:"
+#define MBOX_PARSER_DATE_KW_SZ 5
+
+#define MBOX_PARSER_X_EVOLUTION_KW "x-evolution:"
+#define MBOX_PARSER_X_EVOLUTION_KW_SZ 12
+
+/* the maximum length of all the previous keywords; new_parser () uses it
+   as a lower bound for the number of bytes carried over between two
+   buffer reads, so it has to be raised whenever a longer keyword is added */
+#define MBOX_PARSER_MAX_KW_SIZE 12
+
+/* default number of characters copied into a message summary */
+#define MBOX_PARSER_SUMMARY_SIZE 100
 
 
 
 
-GList * 
-camel_mbox_find_message_positions (int fd, gint first_position, CamelException *ex)
+
+
+typedef struct {
+       /* working state of one pre-parsing pass over an mbox file */
+       int fd;                          /* file descriptor of the mbox file */
+       guint real_position;             /* real position in the file; incremented by
+                                          goto_next_char () for every character consumed */
+
+       
+       gchar *message_delimiter;        /* message delimiter string */
+       guint message_delimiter_length;  /* strlen () of message_delimiter */
+
+       guint message_summary_size;      /* how many characters from the beginning of the 
+                                          mail to put into the message summary */
+       
+       GArray *preparsed_messages;      /* array of CamelMboxParserMessageInfo */
+       CamelMboxParserMessageInfo current_message_info;  /* used to store current info */
+       gboolean is_pending_message;     /* is there some message information pending ? */
+
+       /* buffer info */
+       gchar *buffer;                   /* temporary buffer of MBOX_PARSER_BUF_SIZE bytes */
+       guint left_chunk_size;           /* size of the left chunk in the temp buffer,
+                                          i.e. the bytes kept from the previous read */
+       guint last_position;             /* last position that can be compared to a keyword */
+       guint current_position;          /* current position in the temp buffer */
+       gboolean eof;                    /* did we read the entire file */
+
+       /* other */
+       GString *tmp_string;             /* temporary string to fill the headers in */
+
+       
+       
+} CamelMboxPreParser;
+
+
+/* clear a preparsing info structure: reset the message position and
+   every string field, body summary included, so that nothing from the
+   previously parsed message shows up in the next record.
+   The strings themselves are not freed here: the record that was
+   appended to the message array still points to them. */
+static void
+clear_message_info (CamelMboxParserMessageInfo *preparsing_info)
 {
-#define MBOX_PARSER_BUF_SIZE 1000
+       preparsing_info->message_position = 0;
+       preparsing_info->from = NULL;
+       preparsing_info->date = NULL;
+       preparsing_info->subject = NULL;
+       preparsing_info->status = NULL;
+       preparsing_info->priority = NULL;
+       preparsing_info->references = NULL;
+       preparsing_info->body_summary = NULL;
+}
 
-       off_t seek_res;
-       GList *message_positions = NULL;
-       char buffer[MBOX_PARSER_BUF_SIZE]; 
-       ssize_t buf_nb_read;
 
 
-       /* set the initial position */
-       seek_res = lseek (fd, first_position, SEEK_SET);
-       if (seek_res == (off_t)-1) goto io_error;
+static CamelMboxPreParser *
+new_parser (int fd,
+           const gchar *message_delimiter) 
+{
+       /* allocate a parser for the mbox file open on fd; a private copy of message_delimiter is kept */
+       CamelMboxPreParser *parser;
 
-       /* populate the buffer and initialize the search proc */
-       buf_nb_read = read (fd, buffer, MBOX_PARSER_BUF_SIZE);
+       parser = g_new0 (CamelMboxPreParser, 1);  /* zero-filled: is_pending_message starts FALSE */
        
-       while (buf_nb_read>0) {
-               current_pos = 0;
+       parser->fd = fd;
+       parser->buffer = g_new (gchar, MBOX_PARSER_BUF_SIZE);
+       parser->current_position = 0;
+       parser->message_delimiter = g_strdup (message_delimiter);
+       parser->message_delimiter_length = strlen (message_delimiter);
+       parser->real_position = 0;      
+       parser->preparsed_messages = g_array_new (FALSE, FALSE, sizeof (CamelMboxParserMessageInfo));
+       parser->message_summary_size = MBOX_PARSER_SUMMARY_SIZE;
+       /* the chunk carried over between two reads is as long as the longest string compared against the buffer */
+       parser->left_chunk_size = MAX (parser->message_delimiter_length, MBOX_PARSER_MAX_KW_SIZE);
+       parser->eof = FALSE;
+       
+       parser->tmp_string = g_string_sized_new (1000);
+
+       return parser;
+}
+
+
+
+/* Seek to first_position in the mbox file and read the first chunk of
+   data in the buffer.
+
+   There is no way to report an error to the caller yet: when the seek
+   or the read fails, the parser is simply flagged as finished (eof set,
+   last_position 0) so that nothing is read from an invalid buffer.
+
+   NOTE(review): eof is also set on a short read, which makes the
+   callers stop before scanning that last chunk -- to be confirmed and
+   handled where the parsing loops test eof. */
+static void 
+initialize_buffer (CamelMboxPreParser *parser,
+                  guint first_position)
+{
+       off_t seek_res;
+       ssize_t buf_nb_read;
+       guint chunk_capacity;
+
+       g_assert (parser);
+
+       parser->current_position = 0;
+       chunk_capacity = MBOX_PARSER_BUF_SIZE - parser->left_chunk_size;
+
+       /* the first part of the buffer is filled with newlines, 
+          but the next time a chunk of buffer is read, it will
+          be filled with the last bytes of the previous chunk. 
+          This allows simple g_strcasecmp to test for the presence of 
+          the keyword */
+       memset (parser->buffer, '\n', parser->left_chunk_size);
+
+       /* set the search start position */
+       seek_res = lseek (parser->fd, first_position, SEEK_SET);
+       if (seek_res == (off_t)-1) {
+               parser->last_position = 0;
+               parser->eof = TRUE;
+               return;
+       }
+
+       do {
+               buf_nb_read = read (parser->fd, parser->buffer + parser->left_chunk_size, 
+                                   chunk_capacity);
+       } while ((buf_nb_read == -1) && (errno == EINTR));
+
+       if (buf_nb_read < 0) {
+               /* read error: nothing usable in the buffer */
+               parser->last_position = 0;
+               parser->eof = TRUE;
+               return;
+       }
+
+       /* last_position is unsigned: never let it wrap around when fewer
+          than left_chunk_size bytes were read */
+       if ((guint) buf_nb_read > parser->left_chunk_size)
+               parser->last_position = (guint) buf_nb_read - parser->left_chunk_size;
+       else
+               parser->last_position = 0;
+
+       if ((guint) buf_nb_read < chunk_capacity)
+               parser->eof = TRUE;
+}
+
+
+
+
+/* read next data in the mbox file.
+
+   The last left_chunk_size bytes of the buffer are moved to its
+   beginning, the rest of the buffer is refilled from the file and the
+   current position is reset to 0.
+
+   On a read error the parser is flagged as finished (eof set,
+   last_position 0); there is no way to report the error to the caller
+   yet. */
+static void 
+read_next_buffer_chunk (CamelMboxPreParser *parser)
+{
+       ssize_t buf_nb_read;
+       guint chunk_capacity;
+
+
+       g_assert (parser);
+
+       chunk_capacity = MBOX_PARSER_BUF_SIZE - parser->left_chunk_size;
+       
+       /* read the next chunk of data in the folder file  : */
+       /*  -   first, copy the last bytes from the previous 
+           chunk at the beginning of the new one. */
+       memcpy (parser->buffer, 
+               parser->buffer + chunk_capacity, 
+               parser->left_chunk_size);
+
+       /*  -   then read the next chunk on disk */
+       do {
+               buf_nb_read = read (parser->fd, 
+                                   parser->buffer + parser->left_chunk_size, 
+                                   chunk_capacity);    
+       } while ((buf_nb_read == -1) && (errno == EINTR));
+
+       parser->current_position = 0;
+
+       if (buf_nb_read < 0) {
+               /* read error: nothing usable in the buffer */
+               parser->last_position = 0;
+               parser->eof = TRUE;
+               return;
+       }
+
+       /* last_position is unsigned: never let it wrap around when fewer
+          than left_chunk_size bytes were read */
+       if ((guint) buf_nb_read > parser->left_chunk_size)
+               parser->last_position = (guint) buf_nb_read - parser->left_chunk_size;
+       else
+               parser->last_position = 0;
+
+       if ((guint) buf_nb_read < chunk_capacity)
+               parser->eof = TRUE;
+       
+}
+
+
+
+/* move to the next character: step inside the buffer while the last
+   comparable position has not been reached, otherwise load the next
+   chunk. The file position counter is incremented in both cases. */
+static void 
+goto_next_char (CamelMboxPreParser *parser) 
+{      
+       if (parser->current_position >= parser->last_position)
+               read_next_buffer_chunk (parser);
+       else
+               parser->current_position++;
+
+       parser->real_position++;
+}
+
+
+
+
+/* called when a message delimiter has been found: store the record
+   being filled, if any, and start a new one at the current position */
+static void 
+new_message_detected (CamelMboxPreParser *parser)
+{
+       CamelMboxParserMessageInfo *info = &parser->current_message_info;
+
+       /* a record was being filled: append a copy of it to the
+          message information array */ 
+       if (parser->is_pending_message)
+               g_array_append_vals (parser->preparsed_messages, info, 1);
+
+       /* begin the record of the message that has just been detected */
+       clear_message_info (info);
+       info->message_position = parser->real_position;
+       parser->is_pending_message = TRUE;
                
+}
+
+
+
+
+/* read a header value and put it in the string pointed
+   to by header_content.
+
+   Reading starts at the current buffer position and stops at the
+   first line that does not begin with a space or a tab (or when the
+   parser reaches eof). A line beginning with a space or a tab is a
+   continuation of the header: the newline and the leading whitespace
+   of the continuation are dropped and its text is appended to the
+   value.
+
+   *header_content receives a newly allocated copy of the value; any
+   pointer it held before is overwritten, not freed. */
+static void 
+read_header (CamelMboxPreParser *parser, gchar **header_content)
+{
+       gboolean space = FALSE;       /* skipping the whitespace that opens a continuation line */
+       gboolean newline = FALSE;     /* the previous character was a newline */
+       gboolean header_end = FALSE;
+       gchar *buffer;
+       gchar c;
+       
+
+       g_assert (parser);
+
+       /* reset the header buffer string */
+       parser->tmp_string = g_string_truncate (parser->tmp_string, 0);
+
+       buffer = parser->buffer;
+
+       while (! (parser->eof || header_end) ) {
                
+               /* read the current character */
+               c = buffer[parser->current_position];
                
+               if (space) {
+                       /* a character cannot be both a space and a tab:
+                          the test has to be an "or" */
+                       if (c == ' ' || c == '\t')
+                               goto next_char;
+                       else
+                               space = FALSE;
+               }
+
+               if (newline) {
+                       if (c == ' ' || c == '\t') {
+
+                               /* continuation line */
+                               space = TRUE;
+                               newline = FALSE;
+                               goto next_char;
+                       } else {
+
+                               /* the current character belongs to the next
+                                  line: leave it unread */
+                               header_end = TRUE;
+                               continue;
+                       }
+               }
+
+               if (c == '\n') {
+                       newline = TRUE;
+                       goto next_char;
+               }
+
+               /* feed the header content */
+               parser->tmp_string = g_string_append_c (parser->tmp_string, c);
 
-               /* read the next chunk of data in the folder file */
-               buf_nb_read = read (fd, buffer, MBOX_PARSER_BUF_SIZE);  
+       next_char: /* read next char in the buffer */
+               goto_next_char (parser);
        }
+
        
+       /* copy the buffer in the preparsing information structure */
+       *header_content = g_strndup (parser->tmp_string->str, parser->tmp_string->len); 
+}
+
+
+/* read the beginning of the message and put it in the message
+   summary field 
+   At most parser->message_summary_size characters are read from the current position, fewer if eof is reached first.
+   *message_summary receives a newly allocated copy of these characters. */
+static void
+read_message_begining (CamelMboxPreParser *parser, gchar **message_summary)
+{
+       guint nb_read = 0;     /* number of characters copied so far */
+       gchar *buffer;
        
+       g_assert (parser);
        
-               
+       /* reset the header buffer string */
+       parser->tmp_string = g_string_truncate (parser->tmp_string, 0);
+       
+       buffer = parser->buffer;
+       /* the message should not be filled character by
+          character but there is no g_string_n_append 
+          function, so for the moment, this is a lazy 
+          implementation */
+       while (! (parser->eof) && nb_read<parser->message_summary_size) {
+
+               parser->tmp_string = g_string_append_c (parser->tmp_string, 
+                                                       buffer[parser->current_position]);
+               nb_read++;
+               goto_next_char (parser);
+       }
+
+       *message_summary = g_strndup (parser->tmp_string->str, parser->tmp_string->len);
+}
+
+
+
+
+
+
+
 
+/* Pre-parse the mbox file open on fd, starting at start_position.
+
+   Each time message_delimiter is found at the beginning of a line a
+   new message record is started. Inside a message the "from:" header
+   is read, and the characters following the first empty line are kept
+   as the body summary.
+
+   Returns a GArray of CamelMboxParserMessageInfo, one element per
+   message found. The array and the strings its elements point to are
+   not freed here; everything else allocated for the parsing is
+   released before returning. fd is left open. */
+GArray *
+camel_mbox_parse_file (int fd, guint start_position, const gchar *message_delimiter)
+{
+       CamelMboxPreParser *parser;
+       GArray *preparsed_messages;
+       gboolean is_parsing_a_message = FALSE;
+       gchar c;
        
-               /* io exception handling */
-       io_error : 
 
-               switch errno { 
-               case EACCES :
+
+       /* create the parser */
+       parser = new_parser (fd, message_delimiter);
+       
+       /* initialize the temporary char buffer */
+       initialize_buffer (parser, start_position);
+       
+       while (!parser->eof) {
+               
+               /* read the current character */
+               c = parser->buffer[parser->current_position];
+               goto_next_char (parser);
                        
-                       camel_exception_setv (ex, 
-                                             CAMEL_EXCEPTION_FOLDER_INSUFFICIENT_PERMISSION,
-                                             "Unable to list the directory. Full Error text is : %s ", 
-                                             strerror (errno));
-                       break;
+               if (c == '\n') {
                        
-               case ENOENT :
-               case ENOTDIR :
-                       camel_exception_setv (ex, 
-                                             CAMEL_EXCEPTION_FOLDER_INVALID_PATH,
-                                             "Invalid mbox folder path. Full Error text is : %s ", 
-                                             strerror (errno));
-                       break;
+                       /* is the next part a message delimiter ? */
+                       if (g_strncasecmp (parser->buffer + parser->current_position, 
+                                          parser->message_delimiter, 
+                                          parser->message_delimiter_length) == 0) {
+                               
+                               is_parsing_a_message = TRUE;
+                               new_message_detected (parser);
+                               goto_next_char (parser);
+                               continue;
+                       }
                        
-               default :
-                       camel_exception_set (ex, 
-                                            CAMEL_EXCEPTION_SYSTEM,
-                                            "Unable to delete the mbox folder.");
                        
+                       if (is_parsing_a_message) {
+                               
+                               /* is the next part a "from" header ? */
+                               if (g_strncasecmp (parser->buffer + parser->current_position, 
+                                                 MBOX_PARSER_FROM_KW, 
+                                                 MBOX_PARSER_FROM_KW_SZ) == 0) {
+
+                                       parser->current_position += MBOX_PARSER_FROM_KW_SZ;
+                                       read_header (parser, (gchar **) ((gchar *)parser +
+                                                    G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) + 
+                                                    G_STRUCT_OFFSET (CamelMboxParserMessageInfo, from)));
+                                       continue;
+                               }
+
+                               /* is it an empty line ? */
+                               if (parser->buffer[parser->current_position] == '\n') {
+                                       
+                                       goto_next_char (parser);
+                                       read_message_begining (parser,  (gchar **) ((gchar *)parser +
+                                                              G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info) + 
+                                                              G_STRUCT_OFFSET (CamelMboxParserMessageInfo, body_summary)));
+                                       is_parsing_a_message = FALSE;
+                               }
+                                       
+                       }
                }
+               
+       }
+       
+       /* if there is a pending message information put it in the array */
+       if (parser->is_pending_message) {
+               g_array_append_vals (parser->preparsed_messages, (gchar *)parser + 
+                                    G_STRUCT_OFFSET (CamelMboxPreParser, current_message_info), 1);    
+       }
+       
+       /* free the parser: the message array is handed over to the
+          caller, and the strings of current_message_info are shared
+          with the array elements, so neither is released here */
+       preparsed_messages = parser->preparsed_messages;
+       g_free (parser->buffer);
+       g_free (parser->message_delimiter);
+       g_string_free (parser->tmp_string, TRUE);
+       g_free (parser);
+
+       return preparsed_messages;
+       
+}
+
+
+
 
+
+
+
+
+
+
+#ifdef MBOX_PARSER_TEST
+/* to build the test : 
+   gcc -o test_parser -DMBOX_PARSER_TEST -I ../.. -I ../../.. \
+   -I /usr/lib/glib/include camel-mbox-parser.c \
+   -lglib ../../.libs/libcamel.a
+
+   
+ */
+   
+int 
+main (int argc, char **argv)
+{
+       int test_file_fd;
+       int i;
+       GArray *message_positions; 
+       CamelMboxParserMessageInfo *message_info;
+
+
+       test_file_fd = open (argv[1], O_RDONLY);
+       message_positions = camel_mbox_parse_file (test_file_fd, 
+                                                  0,
+                                                  "From ");
+
+       printf ("Found %d messages \n", message_positions->len);
+       
+#if 0
+       for (i=0; i<message_positions->len; i++) {
+               //message_info = g_array_index(message_positions, CamelMboxParserMessageInfo, i);
+               message_info = ((CamelMboxParserMessageInfo *)(message_positions->data)) + i;
+               printf ("\n\n** Message %d : \n", i);
+               printf ("\t From: %s\n", message_info->from) ;
+               printf ("\t Summary: %s\n", message_info->body_summary) ;
+       }
+#endif
 }
+
+
+
+
+#endif /* MBOX_PARSER_TEST */
index 19b7a42..994e5d8 100644 (file)
  * USA
  */
 
+#include <glib.h>
+#include "camel-log.h"
+#include "camel-exception.h"
 
 
+typedef struct {
+       /* information gathered about one message while pre-parsing an mbox file */
+       guint message_position;   /* parser position counter when the message delimiter was found */
+       gchar *from;              /* value of the "from:" header, NULL when not found */
+       gchar *date;              /* NOTE(review): date..references are only reset to NULL by the parser in this change */
+       gchar *subject;
+       gchar *status;
+       gchar *priority;
+       gchar *references;
+       gchar *body_summary;      /* first characters following the empty line that ends the headers */
+
+} CamelMboxParserMessageInfo;
+
+
+/* NOTE(review): camel-mbox-parser.c no longer defines this function in
+   this change; the declaration is kept only so that existing includes
+   keep compiling -- confirm that it can be removed. */
+GArray * camel_mbox_find_message_positions (int fd, 
+                                           const gchar *message_delimiter,
+                                           gint first_position, 
+                                           CamelException *ex);
+
+/* Pre-parse the mbox file open on fd, starting at start_position.
+   message_delimiter is the string that begins every message.
+   Returns an array of CamelMboxParserMessageInfo, one element per
+   message found. */
+GArray * camel_mbox_parse_file (int fd, 
+                               guint start_position, 
+                               const gchar *message_delimiter);