tools/license2rtf.js

   1
   2 var assert = require('assert'),
   3     Stream = require('stream'),
   4     inherits = require('util').inherits;
   5
   6
   /*
    * This filter consumes a stream of characters and emits one string per line.
    *
    * Recognised line terminators are "\r\n", "\n\r", "\n" and "\r". The input
    * may arrive in arbitrary chunks: a two-character terminator that is split
    * across two write() calls (e.g. "\r" at the end of one chunk and "\n" at the
    * start of the next) is treated as one terminator, so the emitted lines are
    * the same as if the whole input had been written at once.
    *
    * Events:
    *   'data' - one per line, the line text without its terminator.
    *   'end'  - once, after end() has flushed the remaining text.
    */
   function LineSplitter() {
     var self = this,
         buffer = '';

     Stream.call(this);
     this.writable = true;

     // Accepts a chunk (coerced to a string) and emits every line that is
     // known to be complete. Always returns true.
     this.write = function(data) {
       var text = buffer + data,
           terminator = /\r\n|\n\r|\n|\r/g,
           line_start = 0,
           match,
           match_end;

       while ((match = terminator.exec(text)) !== null) {
         match_end = match.index + match[0].length;
         // A one-character terminator at the very end of the text may be the
         // first half of a two-character terminator whose second half is in
         // the next chunk. Keep it (and the line before it) buffered.
         if (match_end === text.length && match[0].length === 1)
           break;
         self.emit('data', text.slice(line_start, match.index));
         line_start = match_end;
       }

       buffer = text.slice(line_start);
       return true;
     };

     // Flushes whatever is still buffered (plus the optional final chunk) and
     // then emits 'end'. A trailing non-terminated line is emitted only when it
     // is non-empty.
     this.end = function(data) {
       var lines = (buffer + (data || '')).split(/\r\n|\n\r|\n|\r/),
           last = lines.length - 1,
           i;

       buffer = '';
       for (i = 0; i < last; i++) {
         self.emit('data', lines[i]);
       }
       if (lines[last]) {
         self.emit('data', lines[last]);
       }
       self.emit('end');
     };
   }
   inherits(LineSplitter, Stream);
  35
  36
  /*
   * This filter consumes lines and emits paragraph objects.
   *
   * Each emitted paragraph ('data' event) is an object with these fields:
   *   li               - the list indicator ("1.", "*" or "-") that preceded the
   *                      paragraph, or '' when there is none.
   *   in_license_block - whether the paragraph was found inside a license
   *                      block. Blocks are delimited by lines made of three or
   *                      more '=' or '"' characters; every such line toggles the
   *                      flag, and the first block is not a license block.
   *   level            - half the number of leading whitespace characters of
   *                      the paragraph's first line, rounded down.
   *   lines            - the paragraph's lines with comment markers, leading
   *                      whitespace and the list indicator removed.
   *
   * A paragraph ends at a blank line, at a separator line, at a list indicator
   * or when the indentation changes. 'end' is emitted after end() has flushed
   * the last paragraph.
   */
  function ParagraphParser() {
    var self = this,
        block_is_license_block = false,
        block_has_c_style_comment,
        paragraph_line_indent,
        paragraph;

    Stream.call(this);
    this.writable = true;

    resetBlock(false);

    // Parses one line (coerced to a string). Always returns true.
    this.write = function(data) {
      parseLine(data + '');
      return true;
    };

    // Parses the optional final line, emits the pending paragraph and 'end'.
    this.end = function(data) {
      if (data) {
        parseLine(data + '');
      }
      flushParagraph();
      self.emit('end');
    };

    // Starts a fresh, empty paragraph in the current block.
    function resetParagraph() {
      // -1 means "no indentation recorded yet for this paragraph".
      paragraph_line_indent = -1;

      paragraph = {
        li: '',
        in_license_block: block_is_license_block,
        lines: []
      };
    }

    // Starts a new block and a fresh paragraph inside it.
    function resetBlock(is_license_block) {
      block_is_license_block = is_license_block;
      block_has_c_style_comment = false;
      resetParagraph();
    }

    // Emits the current paragraph unless it is completely empty, then starts a
    // new one.
    function flushParagraph() {
      if (paragraph.lines.length || paragraph.li) {
        self.emit('data', paragraph);
      }
      resetParagraph();
    }

    // Classifies a single input line and adds its content to the current
    // paragraph, flushing paragraphs where a boundary is detected.
    function parseLine(line) {
      // Strip trailing whitespace
      line = line.replace(/\s*$/, '');

      // Detect block separator
      if (/^\s*(=|"){3,}\s*$/.test(line)) {
        flushParagraph();
        resetBlock(!block_is_license_block);
        return;
      }

      // Strip comments around block
      if (block_is_license_block) {
        if (!block_has_c_style_comment)
          block_has_c_style_comment = /^\s*(\/\*)/.test(line);
        if (block_has_c_style_comment) {
          var prev = line;
          line = line.replace(/^(\s*?)(?:\s?\*\/|\/\*\s|\s\*\s?)/, '$1');
          // No comment marker was removed: drop two columns of indentation
          // instead.
          if (prev === line)
            line = line.replace(/^\s{2}/, '');
          // The comment closes on this line.
          if (/\*\//.test(prev))
            block_has_c_style_comment = false;
        } else {
          // Strip C++ and perl style comments.
          line = line.replace(/^(\s*)(?:\/\/\s?|#\s?)/, '$1');
        }
      }

      // Detect blank line (paragraph separator)
      if (!/\S/.test(line)) {
        flushParagraph();
        return;
      }

      // Detect separator "lines" within a block. These mark a paragraph break
      // and are stripped from the output.
      if (/^\s*[=*\-]{5,}\s*$/.test(line)) {
        flushParagraph();
        return;
      }

      // Find out indentation level and the start of a bulleted or numbered
      // list. Every part of this pattern is optional, so it always matches.
      var result = /^(\s*)(\d+\.|\*|-)?\s*/.exec(line);
      assert.ok(result);
      // The number of characters that will be stripped from the beginning of
      // the line.
      var line_strip_length = result[0].length;
      // The indentation size that will be used to detect indentation jumps.
      // Fudge by 1 space.
      var line_indent = Math.floor(result[0].length / 2) * 2;
      // The indentation level that will be exported
      var level = Math.floor(result[1].length / 2);
      // The list indicator that precedes the actual content, if any.
      var line_li = result[2];

      // Flush the paragraph when there is a li or an indentation jump
      if (line_li || (line_indent !== paragraph_line_indent &&
                      paragraph_line_indent !== -1)) {
        flushParagraph();
        // Keep `li` a string even when the flush was caused by indentation.
        paragraph.li = line_li || '';
      }

      // Set the paragraph indent that we use to detect indentation jumps. It is
      // recorded once per paragraph, while it is still unset. When we just
      // detected a list indicator, wait for the next line to arrive before
      // setting this.
      if (!line_li && paragraph_line_indent === -1) {
        paragraph_line_indent = line_indent;
      }

      // Set the output indent level if it has not been set yet.
      if (paragraph.level === undefined)
        paragraph.level = level;

      // Strip leading whitespace and li.
      line = line.slice(line_strip_length);

      if (line)
        paragraph.lines.push(line);
    }
  }
  inherits(ParagraphParser, Stream);
 173
 174
 /*
  * This filter consumes paragraph objects and emits modified paragraph objects.
  * The lines within the paragraph are unwrapped where appropriate. It also
  * replaces multiple consecutive whitespace characters by a single one.
  *
  * The paragraph's `lines` array is modified in place and the same paragraph
  * object is re-emitted as a 'data' event. 'end' is emitted from end().
  */
 function Unwrapper() {
   var self = this;

   Stream.call(this);
   this.writable = true;

   // Unwraps one paragraph and emits it. Always returns true.
   this.write = function(paragraph) {
     var lines = paragraph.lines,
         break_after = [],
         last = 0,
         i;

     // Decide, per original line index, whether the line break that follows
     // the line must be kept.
     for (i = 0; i < lines.length - 1; i++) {
       var line = lines[i];

       // When a line is really short, the line was probably kept separate for
       // a reason.
       if (line.length < 50) {
         // If the first word on the next line really didn't fit after the
         // line, it probably was just ordinary wrapping after all.
         var next_first_word_length = lines[i + 1].replace(/\s.*$/, '').length;
         if (line.length + next_first_word_length < 60) {
           break_after[i] = true;
         }
       }
     }

     // Join the lines in place. `i` walks the original indices (the ones
     // `break_after` refers to) while `last` is the index of the output line
     // being built; `last` never gets ahead of `i`, so no unread input line is
     // overwritten.
     for (i = 1; i < lines.length; i++) {
       if (break_after[i - 1]) {
         lines[++last] = lines[i];
       } else {
         lines[last] += ' ' + lines[i];
       }
     }
     if (lines.length > 0) {
       lines.length = last + 1;
     }

     for (i = 0; i < lines.length; i++) {
       // Replace multiple whitespace characters by a single one, and strip
       // trailing whitespace.
       lines[i] = lines[i].replace(/\s+/g, ' ').replace(/\s+$/, '');
     }

     self.emit('data', paragraph);
     return true;
   };

   // Processes the optional final paragraph and emits 'end'.
   this.end = function(data) {
     if (data)
       self.write(data);
     self.emit('end');
   };
 }
 inherits(Unwrapper, Stream);
 230
 231
 /*
  * This filter generates an rtf document from a stream of paragraph objects.
  *
  * Each paragraph written is expected to carry the fields `li`, `level`,
  * `in_license_block` and `lines`. The RTF text is emitted in pieces as 'data'
  * events: the document header before the first paragraph, one piece per
  * paragraph, and the closing brace from end(). When no paragraph was ever
  * written, neither header nor footer is emitted. 'end' is emitted last.
  */
 function RtfGenerator() {
   var self = this,
       did_write_anything = false;

   Stream.call(this);
   this.writable = true;

   // Emits the RTF for one paragraph. Always returns true.
   this.write = function(paragraph) {
     if (!did_write_anything) {
       emitHeader();
       did_write_anything = true;
     }

     // List items are indented one level deeper than their own level.
     var li = paragraph.li,
         level = paragraph.level + (li ? 1 : 0),
         lic = paragraph.in_license_block;

     var rtf = '\\pard';
     rtf += '\\sa150\\sl300\\slmult1';
     if (level > 0)
       rtf += '\\li' + (level * 240);
     if (li) {
       // Hanging indent: the list indicator sits one level to the left of the
       // text, which starts at the tab stop.
       rtf += '\\tx' + (level) * 240;
       rtf += '\\fi-240';
     }
     if (lic)
       rtf += '\\ri240';
     // Everything outside a license block is set in bold.
     if (!lic)
       rtf += '\\b';
     if (li)
       rtf += ' ' + li + '\\tab';
     rtf += ' ';
     rtf += paragraph.lines.map(rtfEscape).join('\\line ');
     if (!lic)
       rtf += '\\b0';
     rtf += '\\par\n';

     self.emit('data', rtf);
     return true;
   };

   // Writes the optional final paragraph, closes the document if anything was
   // written, and emits 'end'.
   this.end = function(data) {
     if (data)
       self.write(data);
     if (did_write_anything)
       emitFooter();
     self.emit('end');
   };

   // Returns `number` (truncated to an integer) as lower-case hexadecimal,
   // left-padded with zeros to at least `length` digits.
   function toHex(number, length) {
     var hex = (~~number).toString(16);
     while (hex.length < length)
       hex = '0' + hex;
     return hex;
   }

   // Escapes a line of text for inclusion in the RTF body.
   function rtfEscape(string) {
     return string
       // RTF syntax characters get a backslash in front of them.
       .replace(/[\\\{\}]/g, function(m) {
         return '\\' + m;
       })
       .replace(/\t/g, function() {
         return '\\tab ';
       })
       // Remaining control characters and 0x7f-0xff become \'hh escapes.
       .replace(/[\x00-\x1f\x7f-\xff]/g, function(m) {
         return '\\\'' + toHex(m.charCodeAt(0), 2);
       })
       // Drop byte order marks.
       .replace(/\ufeff/g, '')
       // \uN takes a signed 16-bit *decimal* number, so code units above
       // 32767 are written as negative values. The '?' is the single fallback
       // character announced by \uc1 in the header.
       .replace(/[\u0100-\uffff]/g, function(m) {
         var code = m.charCodeAt(0);
         return '\\u' + (code > 0x7fff ? code - 0x10000 : code) + '?';
       });
   }

   function emitHeader() {
     self.emit('data', '{\\rtf1\\ansi\\ansicpg1252\\uc1\\deff0\\deflang1033' +
                       '{\\fonttbl{\\f0\\fswiss\\fcharset0 Tahoma;}}\\fs20\n' +
                       '{\\*\\generator txt2rtf 0.0.1;}\n');
   }

   function emitFooter() {
     self.emit('data', '}');
   }
 }
 inherits(RtfGenerator, Stream);
 318
 319
 /*
  * Build the filter chain and connect it between stdin and stdout:
  * characters -> lines -> paragraphs -> unwrapped paragraphs -> rtf text.
  */
 var stdin = process.stdin;
 var stdout = process.stdout;
 var line_splitter = new LineSplitter();
 var paragraph_parser = new ParagraphParser();
 var unwrapper = new Unwrapper();
 var rtf_generator = new RtfGenerator();

 // Have stdin deliver strings and start it flowing.
 stdin.setEncoding('utf-8');
 stdin.resume();

 // pipe() returns its destination, so the stages can be chained.
 stdin
   .pipe(line_splitter)
   .pipe(paragraph_parser)
   .pipe(unwrapper)
   .pipe(rtf_generator)
   .pipe(stdout);