2 * Copyright (C) 2011 Google Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #if ENABLE(VIDEO_TRACK)
35 #include "WebVTTParser.h"
37 #include "HTMLElement.h"
38 #include "ProcessingInstruction.h"
39 #include "SegmentedString.h"
41 #include <wtf/text/WTFString.h>
// Conversion factors used when turning a parsed timestamp into seconds.
const int secondsPerMinute = 60;
const int secondsPerHour = 60 * secondsPerMinute;

// Sentinel returned by collectTimeStamp() when the text is not a valid timestamp.
const double malformedTime = -1;

// Number of bytes in the UTF-8 byte order mark that may precede the identifier.
const unsigned bomLength = 3;

// Number of characters in the "WEBVTT" file identifier.
const unsigned fileIdentiferLength = 6;
51 unsigned WebVTTParser::fileIdentifierMaximumLength()
53 return bomLength + fileIdentiferLength;
56 inline bool hasLongWebVTTIdentifier(String line)
58 // If line is more than six characters ...
59 if (line.length() < fileIdentiferLength)
62 // but the first six characters do not exactly equal "WEBVTT" ...
63 if (line.substring(0, fileIdentiferLength) != "WEBVTT")
66 // or the seventh character is neither a space nor a tab character, then abort.
67 if (line.length() > fileIdentiferLength && line[fileIdentiferLength] != ' ' && line[fileIdentiferLength] != '\t')
73 bool WebVTTParser::hasRequiredFileIdentifier(const char* data, unsigned length)
75 // A WebVTT file identifier consists of an optional BOM character,
76 // the string "WEBVTT" followed by an optional space or tab character,
77 // and any number of characters that are not line terminators ...
78 unsigned position = 0;
79 if (length >= bomLength && data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF')
80 position += bomLength;
81 String line = collectNextLine(data, length, &position);
83 if (line.length() < fileIdentiferLength)
85 if (line.length() == fileIdentiferLength && line != "WEBVTT")
87 if (!hasLongWebVTTIdentifier(line))
93 String WebVTTParser::collectDigits(const String& input, unsigned* position)
96 while (*position < input.length() && isASCIIDigit(input[*position]))
97 digits.append(input[(*position)++]);
98 return digits.toString();
101 String WebVTTParser::collectWord(const String& input, unsigned* position)
103 StringBuilder string;
104 while (*position < input.length() && !isASpace(input[*position]))
105 string.append(input[(*position)++]);
106 return string.toString();
109 WebVTTParser::WebVTTParser(WebVTTParserClient* client, ScriptExecutionContext* context)
110 : m_scriptExecutionContext(context)
112 , m_tokenizer(WebVTTTokenizer::create())
117 void WebVTTParser::getNewCues(Vector<RefPtr<TextTrackCue> >& outputCues)
119 outputCues = m_cuelist;
123 void WebVTTParser::parseBytes(const char* data, unsigned length)
125 // 4.8.10.13.3 WHATWG WebVTT Parser algorithm.
126 // 1-3 - Initial setup.
127 unsigned position = 0;
129 while (position < length) {
130 String line = collectNextLine(data, length, &position);
134 // 4-12 - Collect the first line and check for "WEBVTT".
135 if (!hasRequiredFileIdentifier(data, length))
141 // 13-18 - Allow a header (comment area) under the WEBVTT line.
147 // 19-29 - Allow any number of line terminators, then initialize new cue values.
152 // 30-39 - Check if this line contains an optional identifier or timing data.
153 m_state = collectCueId(line);
156 case TimingsAndSettings:
157 // 40 - Collect cue timings and settings.
158 m_state = collectTimingsAndSettings(line);
162 // 41-53 - Collect the cue text, create a cue, and add it to the output.
163 m_state = collectCueText(line, length, position);
167 // 54-62 - Collect and discard the remaining cue.
168 m_state = ignoreBadCue(line);
174 WebVTTParser::ParseState WebVTTParser::collectCueId(const String& line)
176 if (line.contains("-->"))
177 return collectTimingsAndSettings(line);
179 return TimingsAndSettings;
182 WebVTTParser::ParseState WebVTTParser::collectTimingsAndSettings(const String& line)
184 // 4.8.10.13.3 Collect WebVTT cue timings and settings.
185 // 1-3 - Let input be the string being parsed and position be a pointer into input
186 unsigned position = 0;
187 skipWhiteSpace(line, &position);
189 // 4-5 - Collect a WebVTT timestamp. If that fails, then abort and return failure. Otherwise, let cue's text track cue start time be the collected time.
190 m_currentStartTime = collectTimeStamp(line, &position);
191 if (m_currentStartTime == malformedTime)
193 if (position >= line.length())
195 char nextChar = line[position++];
196 if (nextChar != ' ' && nextChar != '\t')
198 skipWhiteSpace(line, &position);
200 // 6-9 - If the next three characters are not "-->", abort and return failure.
201 if (line.find("-->", position) == notFound)
204 if (position >= line.length())
206 nextChar = line[position++];
207 if (nextChar != ' ' && nextChar != '\t')
209 skipWhiteSpace(line, &position);
211 // 10-11 - Collect a WebVTT timestamp. If that fails, then abort and return failure. Otherwise, let cue's text track cue end time be the collected time.
212 m_currentEndTime = collectTimeStamp(line, &position);
213 if (m_currentEndTime == malformedTime)
215 skipWhiteSpace(line, &position);
217 // 12 - Parse the WebVTT settings for the cue (conducted in TextTrackCue).
218 m_currentSettings = line.substring(position, line.length()-1);
222 WebVTTParser::ParseState WebVTTParser::collectCueText(const String& line, unsigned length, unsigned position)
224 if (line.isEmpty()) {
228 if (!m_currentContent.isEmpty())
229 m_currentContent.append("\n");
230 m_currentContent.append(line);
232 if (position >= length)
238 WebVTTParser::ParseState WebVTTParser::ignoreBadCue(const String& line)
245 void WebVTTParser::processCueText()
247 // 51 - Cue text processing based on
248 // 4.8.10.13.4 WebVTT cue text parsing rules and
249 // 4.8.10.13.5 WebVTT cue text DOM construction rules.
250 if (m_currentContent.length() <= 0)
253 ASSERT(m_scriptExecutionContext->isDocument());
254 Document* document = static_cast<Document*>(m_scriptExecutionContext);
256 m_attachmentRoot = DocumentFragment::create(document);
257 m_currentNode = m_attachmentRoot;
258 m_tokenizer->reset();
261 SegmentedString content(m_currentContent.toString());
262 while (m_tokenizer->nextToken(content, m_token))
263 constructTreeFromToken(document);
265 RefPtr<TextTrackCue> cue = TextTrackCue::create(m_scriptExecutionContext, m_currentId, m_currentStartTime, m_currentEndTime, m_currentContent.toString(), m_currentSettings, false);
266 cue->setCueHTML(m_attachmentRoot);
267 m_cuelist.append(cue);
268 m_client->newCuesParsed();
271 void WebVTTParser::resetCueValues()
273 m_currentId = emptyString();
274 m_currentSettings = emptyString();
275 m_currentStartTime = 0;
276 m_currentEndTime = 0;
277 m_currentContent.clear();
280 double WebVTTParser::collectTimeStamp(const String& line, unsigned* position)
282 // 4.8.10.13.3 Collect a WebVTT timestamp.
283 // 1-4 - Initial checks, let most significant units be minutes.
284 enum Mode { minutes, hours };
286 if (*position >= line.length() || !isASCIIDigit(line[*position]))
287 return malformedTime;
289 // 5-6 - Collect a sequence of characters that are 0-9.
290 String digits1 = collectDigits(line, position);
291 int value1 = digits1.toInt();
293 // 7 - If not 2 characters or value is greater than 59, interpret as hours.
294 if (digits1.length() != 2 || value1 > 59)
297 // 8-12 - Collect the next sequence of 0-9 after ':' (must be 2 chars).
298 if (*position >= line.length() || line[(*position)++] != ':')
299 return malformedTime;
300 if (*position >= line.length() || !isASCIIDigit(line[(*position)]))
301 return malformedTime;
302 String digits2 = collectDigits(line, position);
303 int value2 = digits2.toInt();
304 if (digits2.length() != 2)
305 return malformedTime;
307 // 13 - Detect whether this timestamp includes hours.
309 if (mode == hours || (*position < line.length() && line[*position] == ':')) {
310 if (*position >= line.length() || line[(*position)++] != ':')
311 return malformedTime;
312 if (*position >= line.length() || !isASCIIDigit(line[*position]))
313 return malformedTime;
314 String digits3 = collectDigits(line, position);
315 if (digits3.length() != 2)
316 return malformedTime;
317 value3 = digits3.toInt();
324 // 14-19 - Collect next sequence of 0-9 after '.' (must be 3 chars).
325 if (*position >= line.length() || line[(*position)++] != '.')
326 return malformedTime;
327 if (*position >= line.length() || !isASCIIDigit(line[*position]))
328 return malformedTime;
329 String digits4 = collectDigits(line, position);
330 if (digits4.length() != 3)
331 return malformedTime;
332 int value4 = digits4.toInt();
333 if (value2 > 59 || value3 > 59)
334 return malformedTime;
336 // 20-21 - Calculate result.
337 return (value1 * secondsPerHour) + (value2 * secondsPerMinute) + value3 + ((double)value4 / 1000);
340 void WebVTTParser::constructTreeFromToken(Document* document)
342 AtomicString tokenTagName(m_token.name().data(), m_token.name().size());
343 QualifiedName tagName(nullAtom, tokenTagName, xhtmlNamespaceURI);
345 switch (m_token.type()) {
346 case WebVTTTokenTypes::Character: {
347 String content(m_token.characters().data(), m_token.characters().size());
348 RefPtr<Text> child = Text::create(document, content);
349 m_currentNode->parserAddChild(child);
352 case WebVTTTokenTypes::StartTag: {
353 RefPtr<HTMLElement> child;
354 if (isRecognizedTag(tokenTagName))
355 child = HTMLElement::create(tagName, document);
356 else if (m_token.name().size() == 1 && m_token.name()[0] == 'c')
357 child = HTMLElement::create(spanTag, document);
358 else if (m_token.name().size() == 1 && m_token.name()[0] == 'v')
359 child = HTMLElement::create(qTag, document);
362 if (m_token.classes().size() > 0) {
363 RefPtr<NamedNodeMap> attributeMap = NamedNodeMap::create();
364 attributeMap->addAttribute(Attribute::createMapped(classAttr, AtomicString(m_token.classes().data(), m_token.classes().size())));
365 child->setAttributeMap(attributeMap.release());
367 if (child->hasTagName(qTag))
368 child->setAttribute(titleAttr, String(m_token.annotation().data(), m_token.annotation().size()));
369 m_currentNode->parserAddChild(child);
370 m_currentNode = child;
374 case WebVTTTokenTypes::EndTag:
375 if (isRecognizedTag(tokenTagName)
376 || (m_token.name().size() == 1 && m_token.name()[0] == 'c')
377 || (m_token.name().size() == 1 && m_token.name()[0] == 'v')) {
378 if (m_currentNode->parentNode())
379 m_currentNode = m_currentNode->parentNode();
382 case WebVTTTokenTypes::TimestampTag: {
383 unsigned position = 0;
384 double time = collectTimeStamp(m_token.characters().data(), &position);
385 if (time != malformedTime)
386 m_currentNode->parserAddChild(ProcessingInstruction::create(document, "timestamp", String(m_token.characters().data(), m_token.characters().size())));
395 void WebVTTParser::skipWhiteSpace(const String& line, unsigned* position)
397 while (*position < line.length() && isASpace(line[*position]))
401 void WebVTTParser::skipLineTerminator(const char* data, unsigned length, unsigned* position)
403 if (*position >= length)
405 if (data[*position] == '\r')
407 if (*position >= length)
409 if (data[*position] == '\n')
413 String WebVTTParser::collectNextLine(const char* data, unsigned length, unsigned* position)
415 unsigned oldPosition = *position;
416 while (*position < length && data[*position] != '\r' && data[*position] != '\n')
418 String line = String::fromUTF8(data + oldPosition, *position - oldPosition);
419 skipLineTerminator(data, length, position);