gecko-dev/media/webvtt/parser.c
Caitlin Potter 1a3621792c Bug 833403 - Integrate libwebvtt into Mozilla. r=ted
- Updated for new build system (2/28/2013)
 - declaration-after-statement no longer breaking MSVC build
 - Source files and scripts now contain appropriate license info
 - media/webvtt/update.sh no longer hiding unexpected/significant
   errors.
2013-03-01 19:25:19 -05:00

1388 lines
38 KiB
C

/**
* Copyright (c) 2013 Mozilla Foundation and Contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "parser_internal.h"
#include "cuetext_internal.h"
#include "cue_internal.h"
#include <string.h>
/* Report error X through ERROR() unless the local flag `skip_error` of the
   enclosing function is non-zero. ERROR itself is defined outside this file. */
#define _ERROR(X) do { if( skip_error == 0 ) { ERROR(X); } } while(0)
/* The cue-timing separator "-->" as raw bytes; parse_webvtt() searches a line
   for it (via find_bytes) to tell a cue-times line from a cue identifier. */
static const webvtt_byte separator[] = {
UTF8_HYPHEN_MINUS, UTF8_HYPHEN_MINUS, UTF8_GREATER_THAN
};
/* Millisecond conversion factors used by parse_timestamp(). */
#define MSECS_PER_HOUR (3600000)
#define MSECS_PER_MINUTE (60000)
#define MSECS_PER_SECOND (1000)
/* Pointer to the current read position inside the parser's own buffer. */
#define BUFFER (self->buffer + self->position)
/* NOTE(review): names the type `webvtt_timestamp_t`, while the rest of this
   file uses `webvtt_timestamp`; the macro is not used in the visible code. */
#define MALFORMED_TIME ((webvtt_timestamp_t)-1.0)
/* Forward declarations for helpers defined further down in this file. */
static int find_bytes( const webvtt_byte *buffer, webvtt_uint len, const webvtt_byte *sbytes, webvtt_uint slen );
static webvtt_status webvtt_skipwhite( const webvtt_byte *buffer, webvtt_uint *pos, webvtt_uint len );
static webvtt_int64 parse_int( const webvtt_byte **pb, int *pdigits );
/**
 * Allocate and initialise a parser object.
 *
 * on_read   callback stored as the parser's `read` handler
 * on_error  callback stored as the parser's `error` handler
 * userdata  opaque pointer stored on the parser and handed to the callbacks
 * ppout     receives the new parser on success
 *
 * Returns WEBVTT_INVALID_PARAM when a callback or ppout is missing,
 * WEBVTT_OUT_OF_MEMORY when allocation fails, WEBVTT_SUCCESS otherwise.
 */
WEBVTT_EXPORT webvtt_status
webvtt_create_parser( webvtt_cue_fn on_read,
                      webvtt_error_fn on_error, void *
                      userdata,
                      webvtt_parser *ppout )
{
  webvtt_parser parser;

  if( !( on_read && on_error && ppout ) ) {
    return WEBVTT_INVALID_PARAM;
  }

  parser = ( webvtt_parser )webvtt_alloc0( sizeof( *parser ) );
  if( !parser ) {
    return WEBVTT_OUT_OF_MEMORY;
  }

  /* The state stack starts out on the array embedded in the parser. */
  memset( parser->astack, 0, sizeof( parser->astack ) );
  parser->stack = parser->astack;
  parser->top = parser->astack;
  parser->top->state = T_INITIAL;
  parser->stack_alloc = sizeof( parser->astack ) / sizeof( parser->astack[0] );

  /* Callbacks and the opaque pointer stored alongside them. */
  parser->read = on_read;
  parser->error = on_error;
  parser->userdata = userdata;

  /* Line and column numbering is 1-based. */
  parser->line = 1;
  parser->column = 1;
  parser->finished = 0;

  *ppout = parser;
  return WEBVTT_SUCCESS;
}
/**
 * Hand a completed cue to the application, or drop it.
 *
 * If *pcue passes webvtt_validate_cue() it is passed to the parser's read
 * callback; otherwise it is released. Either way *pcue is cleared afterwards.
 * Nothing happens when pcue or *pcue is NULL.
 *
 * ( webvtt_validate_cue's result is only used as a yes/no answer here; no
 * error is reported for a cue that fails validation. )
 */
static void
finish_cue( webvtt_parser self, webvtt_cue **pcue )
{
  webvtt_cue *pending;

  if( !pcue || !*pcue ) {
    return;
  }

  pending = *pcue;
  if( !webvtt_validate_cue( pending ) ) {
    webvtt_release_cue( &pending );
  } else {
    self->read( self->userdata, pending );
  }
  *pcue = 0;
}
/**
 * Release everything still held by the parser's state stack.
 *
 * Walks from the top frame down to the base frame, releasing any cue or
 * string a frame holds and resetting the frame. Afterwards `self->top`
 * points at the base frame. If the stack had grown onto the heap, the parser
 * is switched back to its embedded array and the heap block is freed.
 *
 * The function is safe to call repeatedly (webvtt_finish_parsing followed by
 * webvtt_delete_parser both call it).
 */
WEBVTT_INTERN void
cleanup_stack( webvtt_parser self )
{
  webvtt_state *st = self->top;
  while( st >= self->stack ) {
    switch( st->type ) {
      case V_CUE:
        webvtt_release_cue( &st->v.cue );
        break;
      case V_TEXT:
        webvtt_release_string( &st->v.text );
        break;
      /**
       * TODO: Clean up cuetext nodes as well.
       * Eventually the cuetext parser will probably be making use
       * of this stack, and will need to manage it well also.
       */
      default:
        /* Nothing owned by this frame. */
        break;
    }
    st->type = V_NONE;
    st->line = st->column = st->token = 0;
    st->v.cue = NULL;
    if( st == self->stack ) {
      /* Stop here instead of forming a pointer before the first frame. */
      break;
    }
    --st;
  }
  self->top = self->stack;
  if( self->stack != self->astack ) {
    /**
     * If the stack is dynamically allocated (probably not),
     * then point it to the statically allocated one (and zeromem it),
     * then finally delete the old dynamically allocated stack.
     *
     * `top` must be moved as well: leaving it pointing into the freed
     * block would make the next cleanup_stack() call walk freed memory.
     */
    webvtt_state *pst = self->stack;
    memset( self->astack, 0, sizeof( self->astack ) );
    self->stack = self->astack;
    self->top = self->astack;
    self->stack_alloc = sizeof( self->astack ) / sizeof( *( self->astack ) );
    webvtt_free( pst );
  }
}
/**
 * Tell the parser that no more input will arrive.
 *
 * The first call flushes whatever is pending for the current mode and then
 * cleans the state stack; later calls do nothing and return WEBVTT_SUCCESS.
 *
 * Returns WEBVTT_INVALID_PARAM when self is NULL. In M_CUETEXT mode the
 * result of webvtt_parse_cuetext() is returned; otherwise WEBVTT_SUCCESS.
 *
 * NOTE(review): ERROR is defined outside this file; if it returns from this
 * function, cleanup_stack() is skipped here and only happens later in
 * webvtt_delete_parser() -- confirm that is intended.
 */
WEBVTT_EXPORT webvtt_status
webvtt_finish_parsing( webvtt_parser self )
{
  webvtt_status status = WEBVTT_SUCCESS;
  if( !self ) {
    return WEBVTT_INVALID_PARAM;
  }
  if( !self->finished ) {
    self->finished = 1;
    switch( self->mode ) {
      /**
       * We've left off parsing cue settings and are not in the empty state,
       * return WEBVTT_CUE_INCOMPLETE.
       */
      case M_WEBVTT:
        if( self->top->type != V_NONE ) {
          ERROR( WEBVTT_CUE_INCOMPLETE );
        }
        break;
      /**
       * We've left off on trying to read in a cue text.
       * Parse the partial cue text read and pass the cue back to the
       * application if possible.
       */
      case M_CUETEXT:
        status = webvtt_parse_cuetext( self, self->top->v.cue,
                                       &self->line_buffer, self->finished );
        webvtt_release_string( &self->line_buffer );
        finish_cue( self, &self->top->v.cue );
        /* The cue has been handed off or released; mark the frame empty,
           as webvtt_parse_chunk() does after its own finish_cue() call. */
        self->top->type = V_NONE;
        break;
      case M_SKIP_CUE:
        /* Nothing to do here. */
        break;
      case M_READ_LINE:
        /* Nothing to do here. */
        break;
    }
    cleanup_stack( self );
  }
  return status;
}
/**
 * Destroy a parser: clean its state stack, release its line buffer and free
 * the parser object itself. A NULL parser is ignored.
 */
WEBVTT_EXPORT void
webvtt_delete_parser( webvtt_parser self )
{
  if( !self ) {
    return;
  }

  cleanup_stack( self );
  webvtt_release_string( &self->line_buffer );
  webvtt_free( self );
}
/**
 * Shorthand for writing a switch-based state machine. None of these macros is
 * used in the visible part of this file.
 *
 * BEGIN_DFA / END_DFA       open and close a switch over `top->state`
 * BEGIN_STATE / END_STATE   open and close one `case State: { ... } break;`
 * BEGIN_TOKEN / END_TOKEN   open and close a switch over `token`
 * IF_TOKEN                  one `case Token: { Actions } break;`
 * IF_TRANSITION / ELIF_TRANSITION / ELSE / ENDIF
 *                           an if/else chain on `token` that assigns
 *                           `self->state`; the braces opened by IF_TRANSITION
 *                           are closed by ELIF_TRANSITION, ELSE or ENDIF
 */
#define BEGIN_STATE(State) case State: {
#define END_STATE } break;
#define IF_TOKEN(Token,Actions) case Token: { Actions } break;
#define BEGIN_DFA switch(top->state) {
#define END_DFA }
#define BEGIN_TOKEN switch(token) {
#define END_TOKEN }
#define IF_TRANSITION(Token,State) if( token == Token ) { self->state = State;
#define ELIF_TRANSITION(Token,State) } else IF_TRANSITION(Token,State)
#define ENDIF }
#define ELSE } else {
/**
 * Advance *pos until it indexes a carriage return or line feed in buffer.
 *
 * Returns 1 with *pos on the newline byte, or -1 with *pos == len (or
 * unchanged if it was already >= len) when no newline byte was found.
 */
static int
find_newline( const webvtt_byte *buffer, webvtt_uint *pos, webvtt_uint len )
{
  for( ; *pos < len; ++( *pos ) ) {
    const webvtt_byte ch = buffer[ *pos ];
    if( ch == UTF8_CARRIAGE_RETURN || ch == UTF8_LINE_FEED ) {
      return 1;
    }
  }
  return -1;
}
/**
 * Advance *pos past every byte for which webvtt_iswhite() is true, stopping
 * at len.
 *
 * Returns WEBVTT_INVALID_PARAM when buffer or pos is NULL, otherwise
 * WEBVTT_SUCCESS.
 */
static webvtt_status
webvtt_skipwhite( const webvtt_byte *buffer, webvtt_uint *pos, webvtt_uint len )
{
  if( !buffer || !pos ) {
    return WEBVTT_INVALID_PARAM;
  }

  while( *pos < len && webvtt_iswhite( buffer[ *pos ] ) ) {
    ++( *pos );
  }
  return WEBVTT_SUCCESS;
}
/**
 * Move *ppos forward to the first carriage return, line feed, space or tab
 * at or after its current value, or to len if none is found.
 */
static void
find_next_whitespace( const webvtt_byte *buffer, webvtt_uint *ppos, webvtt_uint len )
{
  webvtt_uint i;

  for( i = *ppos; i < len; ++i ) {
    const webvtt_byte ch = buffer[i];
    if( ch == UTF8_CARRIAGE_RETURN || ch == UTF8_LINE_FEED
        || ch == UTF8_SPACE || ch == UTF8_TAB ) {
      break;
    }
  }
  *ppos = i;
}
/**
 * Search the first len bytes of buffer for the slen-byte sequence sbytes.
 *
 * The scan also stops at the first zero byte in buffer. Returns 1 when the
 * sequence is found, 0 otherwise (including for NULL or empty arguments).
 */
static int
find_bytes( const webvtt_byte *buffer, webvtt_uint len,
            const webvtt_byte *sbytes, webvtt_uint slen )
{
  const webvtt_byte *cursor;
  webvtt_uint remaining;

  /* Reject missing or empty inputs up front. */
  if( !buffer || !sbytes || len < 1 || slen < 1 ) {
    return 0;
  }

  /* Only positions with at least slen bytes left can hold a match. */
  for( cursor = buffer, remaining = len;
       remaining >= slen && *cursor;
       ++cursor, --remaining ) {
    if( cursor[0] == sbytes[0]
        && memcmp( cursor + 1, sbytes + 1, slen - 1 ) == 0 ) {
      return 1;
    }
  }
  return 0;
}
/**
 * Helpers to figure out what state we're on. All of them expect a
 * `webvtt_parser self` to be in scope.
 *
 * SP          the top frame of the state stack
 * AT_BOTTOM   true when the top frame is the base frame
 * ON_HEAP     true when the stack no longer lives in the parser's embedded
 *             array (previously this named an undeclared `p` and compared
 *             capacities, which was true for the embedded array instead)
 * STACK_SIZE  number of frames above the base frame
 * FRAME(i)    the frame i entries below the top
 * FRAMEUP(i)  the frame i entries above the top
 * RECHECK     re-run the state switch with the current token
 * BACK        the `back` count of the top frame
 */
#define SP (self->top)
#define AT_BOTTOM (self->top == self->stack)
#define ON_HEAP (self->stack != self->astack)
#define STACK_SIZE ((webvtt_uint)(self->top - self->stack))
#define FRAME(i) (self->top - (i))
#define FRAMEUP(i) (self->top + (i))
#define RECHECK goto _recheck;
#define BACK (SP->back)
/**
 * Push a new frame onto the parser's state stack.
 *
 * When the stack is about to run out of room, a zeroed block of twice the
 * current capacity is allocated, the existing frames are copied over, and
 * the previous block is freed unless it was the parser's embedded array.
 * The recorded capacity is updated to match, so that later growth checks
 * and copies see the real size of the stack.
 *
 * token, back, state, type, line and column are stored on the new frame;
 * data is stored through the frame's `v.cue` union member.
 *
 * Returns WEBVTT_OUT_OF_MEMORY when growing the stack fails (the ERROR macro,
 * defined outside this file, is invoked first), otherwise WEBVTT_SUCCESS.
 */
static webvtt_status
do_push( webvtt_parser self, webvtt_uint token, webvtt_uint back, webvtt_uint state, void *data, webvtt_state_value_type type, webvtt_uint line, webvtt_uint column )
{
  if( STACK_SIZE + 1 >= self->stack_alloc ) {
    webvtt_uint new_alloc = self->stack_alloc << 1;
    webvtt_state *stack = ( webvtt_state * )webvtt_alloc0( sizeof( webvtt_state ) * new_alloc ), *tmp;
    if( !stack ) {
      ERROR( WEBVTT_ALLOCATION_FAILED );
      return WEBVTT_OUT_OF_MEMORY;
    }
    memcpy( stack, self->stack, sizeof( webvtt_state ) * self->stack_alloc );
    tmp = self->stack;
    self->stack = stack;
    self->top = stack + ( self->top - tmp );
    /* Record the new capacity; without this every later push would
       reallocate at the old size and eventually write past the block. */
    self->stack_alloc = new_alloc;
    if( tmp != self->astack ) {
      webvtt_free( tmp );
    }
  }
  ++self->top;
  self->top->state = state;
  self->top->type = type;
  self->top->token = ( webvtt_token )token;
  self->top->line = line;
  self->top->back = back;
  self->top->column = column;
  self->top->v.cue = ( webvtt_cue * )data;
  return WEBVTT_SUCCESS;
}
/**
 * Pop as many frames as the top frame's `back` count says, clear the `back`
 * count of the frame that becomes the new top, and flag the parser as having
 * just popped. Returns the number of frames popped.
 */
static int
do_pop( webvtt_parser self )
{
  webvtt_state *frame = self->top;
  int distance = frame->back;

  frame -= distance;
  frame->back = 0;

  self->top = frame;
  self->popped = 1;
  return distance;
}
/**
 * Stack manipulation shorthands. They expect `self`, `token`, `last_line`
 * and `last_column` to be in scope at the point of use.
 *
 * PUSH0(S,V,T)   push a frame with state S, value V and value-type T whose
 *                `back` count is one more than the current top frame's
 * PUSH(S,B,V,T)  same, with an explicit `back` count B
 * POP()          drop exactly one frame
 * POPBACK()      drop `back` frames via do_pop()
 *
 * Both PUSH variants clear `self->popped` and make the ENCLOSING function
 * return WEBVTT_OUT_OF_MEMORY when do_push() reports it, so anything the
 * caller allocated just before the push is not released on that path.
 */
#define PUSH0(S,V,T) \
do { \
self->popped = 0; \
if( do_push(self,token,BACK+1,(S),(void*)(V),T,last_line, last_column) \
== WEBVTT_OUT_OF_MEMORY ) \
return WEBVTT_OUT_OF_MEMORY; \
} while(0)
#define PUSH(S,B,V,T) \
do { \
self->popped = 0; \
if( do_push(self,token,(B),(S),(void*)(V),T,last_line, last_column) \
== WEBVTT_OUT_OF_MEMORY ) \
return WEBVTT_OUT_OF_MEMORY; \
} while(0)
#define POP() \
do \
{ \
--(self->top); \
self->popped = 1; \
} while(0)
#define POPBACK() do_pop(self)
/**
 * Parse one "cue times and settings" line into cue.
 *
 * buffer/len hold the text of the line. The line is tokenised with
 * webvtt_lex() and run through a small state machine:
 *
 *   start time, whitespace, "-->", whitespace, end time,
 *   then zero or more  keyword ':' value  settings
 *   (vertical, position, align, size, line).
 *
 * Recoverable problems are reported through ERROR / ERROR_AT_COLUMN and
 * parsing continues; unrecoverable problems in the cue times return -1.
 * Returns 0 when the line was consumed.
 *
 * NOTE(review): ERROR and ERROR_AT_COLUMN are defined outside this file.
 * parse_webvtt() checks this function's result against WEBVTT_PARSE_ERROR,
 * so presumably those macros can return that value from here -- confirm.
 */
WEBVTT_INTERN int
parse_cueparams( webvtt_parser self, const webvtt_byte *buffer,
                 webvtt_uint len, webvtt_cue *cue )
{
  int digits;
  int have_ws = 0;
  int unexpected_whitespace = 0;
  webvtt_uint baddelim = 0;
  webvtt_uint pos = 0;
  webvtt_token last_token = 0;
  enum cp_state {
    CP_T1, CP_T2, CP_T3, CP_T4, CP_T5, /* 'start' cuetime, whitespace1,
                                          'separator', whitespace2, 'end' cuetime */
    CP_CS0, /* pre-cuesetting */
    CP_SD, /* cuesettings delimiter here */
    CP_V1, /* 'vertical' cuesetting */
    CP_P1, /* 'position' cuesetting */
    CP_A1, /* 'align' cuesetting */
    CP_S1, /* 'size' cuesetting */
    CP_L1, /* 'line' cuesetting */
    CP_SV, /* cuesettings value here */
    CP_V2,
    CP_P2,
    CP_A2,
    CP_S2,
    CP_L2,
  };
  enum cp_state last_state = CP_T1;
  enum cp_state state = CP_T1;
#define SETST(X) do { baddelim = 0; last_state = state; state = (X); } while( 0 )
  self->token_pos = 0;
  while( pos < len ) {
    webvtt_uint last_column = self->column;
    webvtt_token token = webvtt_lex( self, buffer, &pos, len, 1 );
_recheck:
    switch( state ) {
      /* start timestamp */
      case CP_T1:
        if( token == WHITESPACE && !unexpected_whitespace ) {
          /* Leading whitespace is reported once, then skipped. */
          ERROR_AT_COLUMN( WEBVTT_UNEXPECTED_WHITESPACE, self->column );
          unexpected_whitespace = 1;
        } else if( token == TIMESTAMP ) {
          if( !parse_timestamp( self->token, &cue->from ) ) {
            ERROR_AT_COLUMN(
              ( BAD_TIMESTAMP( cue->from )
                ? WEBVTT_EXPECTED_TIMESTAMP
                : WEBVTT_MALFORMED_TIMESTAMP ), last_column );
            /* If the token did not end on a digit, skip the rest of the
               word so the next token starts after the junk. */
            if( !webvtt_isdigit( self->token[self->token_pos - 1] ) ) {
              while( pos < len && buffer[pos] != 0x09 && buffer[pos] != 0x20 ) { ++pos; }
            }
            if( BAD_TIMESTAMP( cue->from ) )
            { return -1; }
            SETST( CP_T2 );
          } else {
            SETST( CP_T2 );
          }
        } else {
          ERROR_AT_COLUMN( WEBVTT_EXPECTED_TIMESTAMP, last_column );
          return -1;
        }
        break;
      /* end timestamp */
      case CP_T5:
        if( token == WHITESPACE ) {
          /* no problem, just ignore it and continue */
        } else if( token == TIMESTAMP ) {
          if( !parse_timestamp( self->token, &cue->until ) ) {
            ERROR_AT_COLUMN(
              ( BAD_TIMESTAMP( cue->until )
                ? WEBVTT_EXPECTED_TIMESTAMP
                : WEBVTT_MALFORMED_TIMESTAMP ), last_column );
            if( !webvtt_isdigit( self->token[self->token_pos - 1] ) ) {
              while( pos < len && buffer[pos] != 0x09 && buffer[pos] != 0x20 ) { ++pos; }
            }
            if( BAD_TIMESTAMP( cue->until ) )
            { return -1; }
            SETST( CP_CS0 );
          } else {
            SETST( CP_CS0 );
          }
        } else {
          ERROR_AT_COLUMN( WEBVTT_EXPECTED_TIMESTAMP, last_column );
          return -1;
        }
        break;
      /* whitespace 1 */
      case CP_T2:
        /* Any token other than these two is ignored in this state. */
        switch( token ) {
          case SEPARATOR:
            ERROR_AT_COLUMN( WEBVTT_EXPECTED_WHITESPACE, last_column );
            SETST( CP_T4 );
            break;
          case WHITESPACE:
            SETST( CP_T3 );
            break;
        }
        break;
      case CP_T3:
        switch( token ) {
          case WHITESPACE: /* ignore this whitespace */
            break;
          case SEPARATOR:
            SETST( CP_T4 );
            break;
          case TIMESTAMP:
            ERROR( WEBVTT_MISSING_CUETIME_SEPARATOR );
            SETST( CP_T5 );
            goto _recheck;
          default: /* some garbage */
            ERROR_AT_COLUMN( WEBVTT_EXPECTED_CUETIME_SEPARATOR, last_column );
            return -1;
        }
        break;
      case CP_T4:
        switch( token ) {
          case WHITESPACE:
            SETST( CP_T5 );
            break;
          case TIMESTAMP:
            /* Whitespace after "-->" is missing. Move on to the end
               timestamp state before re-examining the token; re-checking
               in this same state would report the error again and never
               make progress. */
            ERROR_AT_COLUMN( WEBVTT_EXPECTED_WHITESPACE, last_column );
            SETST( CP_T5 );
            goto _recheck;
          default:
            /* Same as above; the end timestamp state then rejects the
               token if it is not a timestamp. */
            ERROR_AT_COLUMN( WEBVTT_EXPECTED_WHITESPACE, last_column );
            SETST( CP_T5 );
            goto _recheck;
        }
        break;
#define CHKDELIM \
if( baddelim ) \
  ERROR_AT_COLUMN(WEBVTT_INVALID_CUESETTING_DELIMITER,baddelim); \
else if( !have_ws ) \
  ERROR_AT_COLUMN(WEBVTT_EXPECTED_WHITESPACE,last_column);
      /**
       * This section is "pre-cuesetting". We are expecting whitespace,
       * followed by a cuesetting keyword
       *
       * If we don't see a keyword, but have our whitespace, it is considered
       * a bad keyword (invalid cuesetting)
       *
       * Otherwise, if we don't have whitespace and have a bad token, it's an
       * invalid delimiter
       */
      case CP_CS0:
        switch( token ) {
          case WHITESPACE:
            have_ws = last_column;
            break;
          case COLON:
            ERROR_AT_COLUMN( WEBVTT_MISSING_CUESETTING_KEYWORD, last_column );
            break;
          case VERTICAL:
            CHKDELIM have_ws = 0;
            SETST( CP_V1 );
            break;
          case POSITION:
            CHKDELIM have_ws = 0;
            SETST( CP_P1 );
            break;
          case ALIGN:
            CHKDELIM have_ws = 0;
            SETST( CP_A1 );
            break;
          case SIZE:
            CHKDELIM have_ws = 0;
            SETST( CP_S1 );
            break;
          case LINE:
            CHKDELIM have_ws = 0;
            SETST( CP_L1 );
            break;
          default:
            if( have_ws ) {
              ERROR_AT_COLUMN( WEBVTT_INVALID_CUESETTING, last_column );
              while( pos < len && buffer[pos] != 0x09 && buffer[pos] != 0x20 ) { ++pos; }
            } else if( token == BADTOKEN ) {
              /* it was a bad delimiter... */
              if( !baddelim ) {
                baddelim = last_column;
              }
              ++pos;
            }
        }
        break;
#define CS1(S) \
if( token == COLON ) \
{ if(have_ws) { ERROR_AT_COLUMN(WEBVTT_UNEXPECTED_WHITESPACE,have_ws); } SETST((S)); have_ws = 0; } \
else if( token == WHITESPACE && !have_ws ) \
{ \
  have_ws = last_column; \
} \
else \
{ \
  switch(token) \
  { \
    case LR: case RL: case INTEGER: case PERCENTAGE: case START: case MIDDLE: case END: case LEFT: case RIGHT: \
      ERROR_AT_COLUMN(WEBVTT_MISSING_CUESETTING_DELIMITER,have_ws ? have_ws : last_column); break; \
    default: \
      ERROR_AT_COLUMN(WEBVTT_INVALID_CUESETTING_DELIMITER,last_column); \
      while( pos < len && buffer[pos] != 0x20 && buffer[pos] != 0x09 ) ++pos; \
      break; \
  } \
  have_ws = 0; \
}
      /**
       * If we get a COLON, we advance to the next state.
       * If we encounter whitespace first, fire an "unexpected whitespace"
       * error and continue. If we encounter a cue-setting value, fire a
       * "missing cuesetting delimiter" error otherwise (eg vertical;rl), fire
       * "invalid cuesetting delimiter" error
       *
       * this logic is performed by the CS1 macro, defined above
       */
      case CP_V1:
        CS1( CP_V2 );
        break;
      case CP_P1:
        CS1( CP_P2 );
        break;
      case CP_A1:
        CS1( CP_A2 );
        break;
      case CP_S1:
        CS1( CP_S2 );
        break;
      case CP_L1:
        CS1( CP_L2 );
        break;
#undef CS1
/* BV: emit the BAD_VALUE error for the appropriate setting, when required */
#define BV(T) \
ERROR_AT_COLUMN(WEBVTT_##T##_BAD_VALUE,last_column); \
while( pos < len && buffer[pos] != 0x20 && buffer[pos] != 0x09 ) ++pos; \
SETST(CP_CS0);
/* HV: emit the ALREADY_SET (have value) error for the appropriate setting, when required */
#define HV(T) \
if( cue->flags & CUE_HAVE_##T ) \
{ \
  ERROR_AT_COLUMN(WEBVTT_##T##_ALREADY_SET,last_column); \
}
/* WS: emit the WEBVTT_UNEXPECTED_WHITESPACE error when required. */
#define WS \
case WHITESPACE: \
  if( !have_ws ) \
  { \
    ERROR_AT_COLUMN(WEBVTT_UNEXPECTED_WHITESPACE,last_column); \
    have_ws = last_column; \
  } \
  break
/* set that the cue already has a value for this */
#define SV(T) cue->flags |= CUE_HAVE_##T
      case CP_V2:
        HV( VERTICAL );
        switch( token ) {
            WS;
          case LR:
            cue->settings.vertical = WEBVTT_VERTICAL_LR;
            have_ws = 0;
            SETST( CP_CS0 );
            SV( VERTICAL );
            break;
          case RL:
            cue->settings.vertical = WEBVTT_VERTICAL_RL;
            have_ws = 0;
            SETST( CP_CS0 );
            SV( VERTICAL );
            break;
          default:
            BV( VERTICAL );
        }
        break;
      case CP_P2:
        HV( POSITION );
        switch( token ) {
            WS;
          case PERCENTAGE: {
            const webvtt_byte *t = self->token;
            webvtt_int64 v = parse_int( &t, &digits );
            if( v < 0 ) {
              /* Report and skip; a rejected value must not be stored. */
              BV( POSITION );
            } else {
              cue->settings.position = ( webvtt_uint )v;
              have_ws = 0;
              SETST( CP_CS0 );
              SV( POSITION );
            }
          }
          break;
          default:
            BV( POSITION );
            break;
        }
        break;
      case CP_A2:
        HV( ALIGN );
        switch( token ) {
            WS;
          case START:
            cue->settings.align = WEBVTT_ALIGN_START;
            have_ws = 0;
            SETST( CP_CS0 );
            SV( ALIGN );
            break;
          case MIDDLE:
            cue->settings.align = WEBVTT_ALIGN_MIDDLE;
            have_ws = 0;
            SETST( CP_CS0 );
            SV( ALIGN );
            break;
          case END:
            cue->settings.align = WEBVTT_ALIGN_END;
            have_ws = 0;
            SETST( CP_CS0 );
            SV( ALIGN );
            break;
          case LEFT:
            cue->settings.align = WEBVTT_ALIGN_LEFT;
            have_ws = 0;
            SETST( CP_CS0 );
            SV( ALIGN );
            break;
          case RIGHT:
            cue->settings.align = WEBVTT_ALIGN_RIGHT;
            have_ws = 0;
            SETST( CP_CS0 );
            SV( ALIGN );
            break;
          default:
            BV( ALIGN );
            break;
        }
        break;
      case CP_S2:
        HV( SIZE );
        switch( token ) {
            WS;
          case PERCENTAGE: {
            const webvtt_byte *t = self->token;
            webvtt_int64 v = parse_int( &t, &digits );
            if( v < 0 ) {
              BV( SIZE );
            } else {
              cue->settings.size = ( webvtt_uint )v;
              have_ws = 0;
              SETST( CP_CS0 );
              SV( SIZE );
            }
          }
          break;
          default:
            BV( SIZE );
            break;
        }
        break;
      case CP_L2:
        HV( LINE );
        switch( token ) {
            WS;
          case INTEGER: {
            /* A bare integer is a line number (may be negative). */
            const webvtt_byte *t = self->token;
            webvtt_int64 v = parse_int( &t, &digits );
            cue->snap_to_lines = 1;
            cue->settings.line = ( int )v;
            have_ws = 0;
            SETST( CP_CS0 );
            SV( LINE );
          }
          break;
          case PERCENTAGE: {
            const webvtt_byte *t = self->token;
            webvtt_int64 v = parse_int( &t, &digits );
            if( v < 0 ) {
              /* This is the 'line' setting, so report LINE_BAD_VALUE. */
              BV( LINE );
            } else {
              cue->snap_to_lines = 0;
              cue->settings.line = ( int )v;
              have_ws = 0;
              SETST( CP_CS0 );
              SV( LINE );
            }
          }
          break;
          default:
            BV( LINE );
            break;
        }
        break;
#undef BV
#undef HV
#undef SV
#undef WS
    }
    self->token_pos = 0;
    last_token = token;
  }
  /**
   * If we didn't finish in a good state...
   */
  if( state != CP_CS0 ) {
    /* if we never made it to the cuesettings, we didn't finish the cuetimes */
    if( state < CP_CS0 ) {
      ERROR( WEBVTT_UNFINISHED_CUETIMES );
      return -1;
    } else {
      /* if we did, we should report an error but continue parsing. */
      webvtt_error e = WEBVTT_INVALID_CUESETTING;
      switch( state ) {
        case CP_V2:
          e = WEBVTT_VERTICAL_BAD_VALUE;
          break;
        case CP_P2:
          e = WEBVTT_POSITION_BAD_VALUE;
          break;
        case CP_A2:
          e = WEBVTT_ALIGN_BAD_VALUE;
          break;
        case CP_S2:
          e = WEBVTT_SIZE_BAD_VALUE;
          break;
        case CP_L2:
          e = WEBVTT_LINE_BAD_VALUE;
          break;
        default:
          /* Stopped on a keyword without a value: generic error. */
          break;
      }
      ERROR( e );
    }
  } else {
    if( baddelim ) {
      ERROR_AT_COLUMN( WEBVTT_INVALID_CUESETTING_DELIMITER, baddelim );
    }
  }
#undef SETST
  return 0;
}
/**
 * Drive the top-level WebVTT state machine over buffer[*ppos .. len).
 *
 * Handles the "WEBVTT" signature line, blank-line counting, and the start of
 * each cue (optional cue id, then the cue times/settings line). When a cue's
 * times line has been parsed, *mode is switched to M_CUETEXT (or to
 * M_SKIP_CUE when the line was unusable) and the function returns so that
 * webvtt_parse_chunk() can take over.
 *
 * finish  non-zero when no more input will follow; an UNFINISHED token is
 *         then treated as BADTOKEN
 *
 * *ppos is updated to the position reached. Returns WEBVTT_SUCCESS, or a
 * failure status (WEBVTT_PARSE_ERROR, WEBVTT_OUT_OF_MEMORY, or whatever the
 * cue/string helpers reported). On WEBVTT_OUT_OF_MEMORY the state stack is
 * cleaned before returning.
 */
static webvtt_status
parse_webvtt( webvtt_parser self, const webvtt_byte *buffer, webvtt_uint *ppos,
              webvtt_uint len, webvtt_parse_mode *mode, int finish )
{
  webvtt_status status = WEBVTT_SUCCESS;
  webvtt_token token;
  webvtt_uint pos = *ppos;
  int skip_error = 0;
  while( pos < len ) {
    /* last_line / last_column are consumed by the PUSH macros. */
    webvtt_uint last_column, last_line;
    skip_error = 0;
    last_column = self->column;
    last_line = self->line;
    /**
     * If we're in certain states, we don't want to get a token and just
     * want to read text instead.
     */
    if( SP->state == T_CUEREAD ) {
      int v;
      if( ( v = webvtt_string_getline( &SP->v.text, buffer, &pos,
                                       len, 0, finish, 0 ) ) ) {
        if( v < 0 ) {
          webvtt_release_string( &SP->v.text );
          SP->type = V_NONE;
          POP();
          ERROR( WEBVTT_ALLOCATION_FAILED );
          status = WEBVTT_OUT_OF_MEMORY;
          goto _finish;
        }
        /* POP the stack and let the previous frame deal with it */
        POP();
      }
    }
    /**
     * Get the next token from the stream
     *
     * If the token is 'UNFINISHED', but we are at the end of our input
     * data, change it to BADTOKEN because it will never be finished.
     *
     * Otherwise, if we are expecting further data at some point, and have
     * an unfinished token, return and let the next chunk deal with it.
     */
    if( SP->state != T_CUE || !( self->popped && FRAMEUP( 1 )->state == T_CUEREAD ) ) {
      /**
       * We don't tokenize in certain states
       */
      token = webvtt_lex( self, buffer, &pos, len, finish );
      if( token == UNFINISHED ) {
        if( finish ) {
          token = BADTOKEN;
        } else if( pos == len ) {
          goto _finish;
        }
      }
    }
_recheck:
    switch( SP->state ) {
      case T_INITIAL:
        /**
         * In the initial state:
         * We should have WEBVTT as the first token returned,
         * otherwise this isn't really a valid file.
         *
         * If we get 'WEBVTT', push us into the TAG state, where we
         * check for a tag comment (arbitrary text following a whitespace
         * after the WEBVTT token) until a newline
         *
         * If WEBVTT is not the first token, then report error and
         * abort parsing.
         */
        if( token == WEBVTT ) {
          PUSH0( T_TAG, 0, V_NONE );
          break;
        } else {
          if( pos != len ) {
            if( !skip_error ) {
              ERROR_AT_COLUMN( WEBVTT_MALFORMED_TAG, last_column );
              skip_error = 1;
            }
            status = WEBVTT_PARSE_ERROR;
            goto _finish;
          }
        }
        break;
      case T_TAG:
        /**
         * If we have a WHITESPACE following the WEBVTT token,
         * switch to T_TAGCOMMENT state and skip the comment.
         * Otherwise, if it's a NEWLINE, we can just skip to
         * the T_BODY state.
         *
         * Otherwise, we didn't actually have a WEBVTT token,
         * and should feel ashamed.
         */
        if( token == WHITESPACE ) {
          /* switch to comment skipper */
          PUSH0( T_TAGCOMMENT, 0, V_NONE );
        } else if( token == NEWLINE ) {
          /* switch to NEWLINE counter */
          POPBACK();
          self->popped = 0;
          SP->state = T_BODY;
          PUSH0( T_EOL, 1, V_INTEGER );
          break;
        } else {
          /**
           * This wasn't preceded by an actual WEBVTT token, it's more
           * like WEBVTTasdasd, which is not valid. Report an error,
           * which should be considered fatal.
           */
          if( !skip_error ) {
            ERROR_AT_COLUMN( WEBVTT_MALFORMED_TAG, FRAME( 1 )->column );
            skip_error = 1;
            status = WEBVTT_PARSE_ERROR;
            goto _finish;
          }
        }
        break;
      /**
       * COMMENT -- Read until EOL, ignore everything else
       */
      case T_TAGCOMMENT:
        switch( token ) {
          case NEWLINE:
            /**
             * If we encounter a newline, switch to NEWLINE mode,
             * and set up so that when we POPBACK() we are in the
             * T_BODY state.
             */
            POPBACK();
            PUSH0( T_EOL, 1, V_INTEGER );
            break;
          default:
            find_newline( buffer, &pos, len );
            continue;
        }
        break;
      case T_CUEID:
        switch( token ) {
            /**
             * We're only really expecting a newline here --
             * The cue id should have been read already
             */
          case NEWLINE:
            SP->state = T_FROM;
            break;
        }
        /* fallthrough */
        /* NOTE(review): there is no break here in the original either, so
           control continues into T_EOL; nothing in this file pushes
           T_CUEID, so it is unclear whether that is intended. */
      /**
       * Count EOLs, POP when finished
       */
      case T_EOL:
        switch( token ) {
          case NEWLINE:
            SP->v.value++;
            break;
          default:
            POPBACK();
            RECHECK
        }
        break;
      case T_BODY:
        if( self->popped && FRAMEUP( 1 )->state == T_EOL ) {
          /* We just came back from counting newlines; fewer than two
             means the blank separator line is missing. */
          if( FRAMEUP( 1 )->v.value < 2 ) {
            ERROR_AT_COLUMN( WEBVTT_EXPECTED_EOL, 1 );
          }
          FRAMEUP( 1 )->state = 0;
          FRAMEUP( 1 )->v.cue = NULL;
        }
        if( token == NOTE ) {
          PUSH0( T_COMMENT, 0, V_NONE );
        } else if( token != NEWLINE ) {
          /* Anything else starts a cue: create it, and seed the line
             reader with the token text already consumed by the lexer. */
          webvtt_cue *cue = 0;
          webvtt_string tk = { 0 };
          if( WEBVTT_FAILED( status = webvtt_create_cue( &cue ) ) ) {
            if( status == WEBVTT_OUT_OF_MEMORY ) {
              ERROR( WEBVTT_ALLOCATION_FAILED );
            }
            goto _finish;
          }
          if( WEBVTT_FAILED( status = webvtt_create_string_with_text( &tk,
                                      self->token, self->token_pos ) ) ) {
            if( status == WEBVTT_OUT_OF_MEMORY ) {
              ERROR( WEBVTT_ALLOCATION_FAILED );
            }
            webvtt_release_cue( &cue );
            goto _finish;
          }
          PUSH0( T_CUE, cue, V_CUE );
          PUSH0( T_CUEREAD, 0, V_TEXT );
          SP->v.text.d = tk.d;
        }
        break;
      case T_CUE:
        if( self->popped && FRAMEUP( 1 )->state == T_CUEREAD ) {
          /**
           * We're expecting either cue-id (contains '-->') or cue
           * params
           */
          webvtt_cue *cue = SP->v.cue;
          webvtt_state *st = FRAMEUP( 1 );
          /* Take ownership of the line that was just read; the frame is
             emptied so that only `text` refers to the string. */
          webvtt_string text = st->v.text;
          st->type = V_NONE;
          st->v.cue = NULL;
          /**
           * The type should be V_TEXT. If it's not, somethings wrong.
           *
           * TODO: Add debug assertion
           */
          if( find_bytes( webvtt_string_text( &text ), webvtt_string_length( &text ), separator,
                          sizeof( separator ) ) ) {
            /* It's not a cue id, we found '-->'. It can't be a second
               cueparams line, because if we had it, we would be in
               a different state. */
            int v;
            /* backup the column */
            self->column = 1;
            if( ( v = parse_cueparams( self, webvtt_string_text( &text ),
                                       webvtt_string_length( &text ), cue ) ) < 0 ) {
              /* Release the line on every failure path, including the
                 parse-error one, which previously leaked it. */
              webvtt_release_string( &text );
              if( v == WEBVTT_PARSE_ERROR ) {
                status = WEBVTT_PARSE_ERROR;
                goto _finish;
              }
              *mode = M_SKIP_CUE;
              goto _finish;
            } else {
              webvtt_release_string( &text );
              cue->flags |= CUE_HAVE_CUEPARAMS;
              *mode = M_CUETEXT;
              goto _finish;
            }
          } else {
            /* It is a cue-id */
            if( cue->flags & CUE_HAVE_ID ) {
              /**
               * This isn't actually a cue-id, because we already
               * have one. It seems to be cuetext, which is occurring
               * before cue-params
               */
              webvtt_release_string( &text );
              ERROR( WEBVTT_CUE_INCOMPLETE );
              *mode = M_SKIP_CUE;
              goto _finish;
            } else {
              self->column += webvtt_string_length( &text );
              if( WEBVTT_FAILED( status = webvtt_string_append(
                                            &cue->id, webvtt_string_text( &text ), webvtt_string_length( &text ) ) ) ) {
                webvtt_release_string( &text );
                ERROR( WEBVTT_ALLOCATION_FAILED );
                /* Stop here with the failure status instead of carrying
                   on with a cue whose id could not be stored. */
                goto _finish;
              }
              cue->flags |= CUE_HAVE_ID;
            }
          }
          webvtt_release_string( &text );
          self->popped = 0;
        } else {
          webvtt_cue *cue = SP->v.cue;
          /* If we have a newline, it might be the end of the cue. */
          if( token == NEWLINE ) {
            if( cue->flags & CUE_HAVE_CUEPARAMS ) {
              *mode = M_CUETEXT;
            } else if( cue->flags & CUE_HAVE_ID ) {
              /* The frame will accumulate a string, so tag it V_TEXT
                 (as the first T_CUEREAD push does) to let cleanup_stack()
                 release it if parsing stops mid-line. */
              PUSH0( T_CUEREAD, 0, V_TEXT );
            } else {
              /* I don't think this should ever happen? */
              POPBACK();
            }
          }
        }
        break;
    }
    /**
     * reset token pos
     */
    self->token_pos = 0;
  }
_finish:
  if( status == WEBVTT_OUT_OF_MEMORY ) {
    cleanup_stack( self );
  }
  *ppos = pos;
  return status;
}
/**
 * Accumulate cue text lines from b[*ppos .. len) into self->line_buffer.
 *
 * Each completed line gets a line feed appended and its line terminator
 * (CR, LF or CRLF) in the input is stepped over. Reading stops once the
 * buffer was found to already end in a line feed when a line completed
 * (i.e. an empty line followed), or when the input is exhausted.
 *
 * mode is not used by this function.
 *
 * *ppos is updated to the position reached. Returns WEBVTT_OUT_OF_MEMORY
 * when webvtt_string_getline() reports a negative result, WEBVTT_UNFINISHED
 * when the input ran out before the end of the cue text was seen, and
 * WEBVTT_SUCCESS otherwise.
 */
static webvtt_status
read_cuetext( webvtt_parser self, const webvtt_byte *b, webvtt_uint
              *ppos, webvtt_uint len, webvtt_parse_mode *mode, webvtt_bool finish )
{
  webvtt_status result = WEBVTT_SUCCESS;
  webvtt_uint cursor = *ppos;
  int saw_end = 0;

  for( ;; ) {
    int got = webvtt_string_getline( &self->line_buffer, b, &cursor, len,
                                     &self->truncate, finish, 1 );
    if( got < 0 ) {
      result = WEBVTT_OUT_OF_MEMORY;
      break;
    }
    if( got ) {
      /* A buffer that already ends in a line feed means the line just
         read was empty: the cue text is complete. */
      if( self->line_buffer.d->length > 1
          && self->line_buffer.d->text[ self->line_buffer.d->length - 1 ] == UTF8_LINE_FEED ) {
        saw_end = 1;
      }
      webvtt_string_putc( &self->line_buffer, UTF8_LINE_FEED );

      /* Step over the terminator: two bytes for CRLF, one otherwise. */
      if( cursor < len ) {
        if( b[cursor] == UTF8_CARRIAGE_RETURN
            && len - cursor >= 2 && b[cursor + 1] == UTF8_LINE_FEED ) {
          cursor += 2;
        } else {
          cursor += 1;
        }
      }
    }
    if( !( cursor < len && !saw_end ) ) {
      break;
    }
  }

  *ppos = cursor;
  /**
   * If we didn't encounter 2 successive EOLs, and it's not the final buffer in
   * the file, notify the caller.
   */
  if( cursor >= len && !WEBVTT_FAILED( result ) && !saw_end ) {
    result = WEBVTT_UNFINISHED;
  }
  return result;
}
/**
 * Feed one chunk of input to the parser.
 *
 * The chunk is consumed by whichever sub-parser matches the current mode
 * (header/cue-start state machine, cue text reader, cue skipper, or line
 * reader); whitespace between steps is skipped. Completed cues are delivered
 * through the parser's read callback.
 *
 * Returns WEBVTT_INVALID_PARAM when self is NULL, or when buffer is NULL
 * while len is non-zero; the first failure status reported by a sub-parser;
 * otherwise WEBVTT_SUCCESS. Running out of input in the middle of a cue's
 * text is not a failure: WEBVTT_SUCCESS is returned and the text continues
 * with the next chunk.
 *
 * NOTE(review): in M_SKIP_CUE mode the same "ran out of input" result from
 * read_cuetext() (WEBVTT_UNFINISHED) IS returned to the caller, unlike in
 * M_CUETEXT mode -- confirm whether that asymmetry is intended.
 */
WEBVTT_EXPORT webvtt_status
webvtt_parse_chunk( webvtt_parser self, const void *buffer, webvtt_uint len )
{
  webvtt_status status;
  webvtt_uint pos = 0;
  const webvtt_byte *b = ( const webvtt_byte * )buffer;
  if( !self || ( !buffer && len > 0 ) ) {
    return WEBVTT_INVALID_PARAM;
  }
  while( pos < len ) {
    switch( self->mode ) {
      case M_WEBVTT:
        if( WEBVTT_FAILED( status = parse_webvtt( self, b, &pos, len, &self->mode, self->finished ) ) ) {
          return status;
        }
        break;
      case M_CUETEXT:
        /**
         * read in cuetext
         */
        if( WEBVTT_FAILED( status = read_cuetext( self, b, &pos, len, &self->mode, self->finished ) ) ) {
          if( status == WEBVTT_UNFINISHED ) {
            /* Make an exception here, because this isn't really a failure. */
            return WEBVTT_SUCCESS;
          }
          return status;
        }
        /**
         * Once we've successfully read the cuetext into line_buffer, call the
         * cuetext parser from cuetext.c
         */
        status = webvtt_parse_cuetext( self, SP->v.cue, &self->line_buffer, self->finished );
        /**
         * return the cue to the user, if possible.
         */
        finish_cue( self, &SP->v.cue );
        /**
         * return to our typical parsing mode now.
         */
        SP->type = V_NONE;
        webvtt_release_string( &self->line_buffer );
        self->mode = M_WEBVTT;
        /* If we failed to parse cuetext, return the error */
        if( WEBVTT_FAILED( status ) ) {
          return status;
        }
        break;
      case M_SKIP_CUE:
        /* Read the unusable cue's text like normal cue text, then throw
           it away. */
        if( WEBVTT_FAILED( status = read_cuetext( self, b, &pos, len, &self->mode, self->finished ) ) ) {
          return status;
        }
        webvtt_release_string( &self->line_buffer );
        self->mode = M_WEBVTT;
        break;
      case M_READ_LINE: {
        /**
         * Read a line of text into the line buffer; once a full line has
         * been read, switch back to M_WEBVTT mode.
         */
        int ret;
        if( ( ret = webvtt_string_getline( &self->line_buffer, b, &pos, len, &self->truncate, self->finished, 0 ) ) ) {
          if( ret < 0 ) {
            ERROR( WEBVTT_ALLOCATION_FAILED );
            return WEBVTT_OUT_OF_MEMORY;
          }
          self->mode = M_WEBVTT;
        }
        break;
      }
    }
    if( WEBVTT_FAILED( status = webvtt_skipwhite( b, &pos, len ) ) ) {
      return status;
    }
  }
  return WEBVTT_SUCCESS;
}
/* Retire the stack helper macros now that the functions using them are done.
   FRAMEUP, RECHECK, BACK, PUSH0 and POPBACK are not undefined here and stay
   defined for the remainder of the file. */
#undef SP
#undef AT_BOTTOM
#undef ON_HEAP
#undef STACK_SIZE
#undef FRAME
#undef PUSH
#undef POP
/**
* Get an integer value from a series of digits.
*/
static webvtt_int64
parse_int( const webvtt_byte **pb, int *pdigits )
{
int digits = 0;
webvtt_int64 result = 0;
webvtt_int64 mul = 1;
const webvtt_byte *b = *pb;
while( *b ) {
webvtt_byte ch = *b;
if( webvtt_isdigit( ch ) ) {
/**
* Digit character, carry on
*/
result = result * 10 + ( ch - UTF8_DIGIT_ZERO );
++digits;
} else if( mul == 1 && digits == 0 && ch == UTF8_HYPHEN_MINUS ) {
mul = -1;
} else {
break;
}
++b;
}
*pb = b;
if( pdigits ) {
*pdigits = digits;
}
return result * mul;
}
/**
 * Turn the text of a TIMESTAMP token into a millisecond value.
 *
 * b       zero-terminated token text, e.g. "mm:ss.ttt" or "hh:mm:ss.ttt"
 * result  receives the timestamp
 *
 * Returns 1 when the text is a well-formed timestamp, 0 otherwise.
 *
 * There are two kinds of failure:
 *  - structural (no leading digit, a required ':' or '.' missing): *result
 *    is set to 0xFFFFFFFFFFFFFFFF;
 *  - recoverable (wrong digit counts, or a component out of range): *result
 *    still receives a time computed from the values read, with overflowing
 *    components carried into the next larger unit.
 */
WEBVTT_INTERN int
parse_timestamp( const webvtt_byte *b, webvtt_timestamp *result )
{
webvtt_int64 tmp;
int have_hours = 0;
int digits;
int malformed = 0;
/* v[0] = hours, v[1] = minutes, v[2] = seconds, v[3] = milliseconds */
webvtt_int64 v[4];
if ( !webvtt_isdigit( *b ) ) {
goto _malformed;
}
/* get sequence of digits */
v[0] = parse_int( &b, &digits );
/**
* assume v[0] contains hours if more or less than 2 digits, or value is
* greater than 59
*/
if ( digits != 2 || v[0] > 59 ) {
have_hours = 1;
}
/* flag as malformed if the colon ':' character is missing; note that a
   non-zero byte that is not a colon is still stepped over by the b++ */
if ( !*b || *b++ != UTF8_COLON ) {
malformed = 1;
}
/* flag as malformed if end of data reached, or byte is not an ASCII digit */
if ( !*b || !webvtt_isdigit( *b ) ) {
malformed = 1;
}
/* get another integer value, flag as malformed if digits is not equal to 2 */
v[1] = parse_int( &b, &digits );
if( digits != 2 ) {
malformed = 1;
}
/* if we already know there's an hour component, or if the next byte is a
colon ':', read the next value */
if ( have_hours || ( *b == UTF8_COLON ) ) {
/* an hour component was assumed but no second ':' follows: give up */
if( *b++ != UTF8_COLON ) {
goto _malformed;
}
if( !*b || !webvtt_isdigit( *b ) ) {
malformed = 1;
}
v[2] = parse_int( &b, &digits );
if( digits != 2 ) {
malformed = 1;
}
} else {
/* Otherwise, if there is no hour component, shift everything over */
v[2] = v[1];
v[1] = v[0];
v[0] = 0;
}
/* collect the mandatory seconds-frac component. fail if there is no FULL_STOP
'.' or if there is no ascii digit following it */
if( *b++ != UTF8_FULL_STOP || !webvtt_isdigit( *b ) ) {
goto _malformed;
}
v[3] = parse_int( &b, &digits );
if( digits != 3 ) {
malformed = 1;
}
/* Ensure that milliseconds, seconds and minutes are acceptable values;
   excess is carried into the next larger unit and the timestamp is
   flagged as malformed */
if( v[3] > 999 ) {
#define MILLIS_PER_SEC (1000)
tmp = v[3];
v[2] += tmp / MILLIS_PER_SEC;
v[3] = tmp % MILLIS_PER_SEC;
malformed = 1;
}
if( v[2] > 59 ) {
#define SEC_PER_MIN (60)
tmp = v[2];
v[1] += tmp / SEC_PER_MIN;
v[2] = tmp % SEC_PER_MIN;
malformed = 1;
}
if( v[1] > 59 ) {
#define MIN_PER_HOUR (60)
tmp = v[1];
v[0] += tmp / MIN_PER_HOUR;
v[1] = tmp % MIN_PER_HOUR;
malformed = 1;
}
/* combine the components into milliseconds; the cast applies to the hours
   term only */
*result = ( webvtt_timestamp )( v[0] * MSECS_PER_HOUR )
+ ( v[1] * MSECS_PER_MINUTE )
+ ( v[2] * MSECS_PER_SECOND )
+ ( v[3] );
if( malformed ) {
return 0;
}
return 1;
_malformed:
/* structural failure: store the all-ones marker value */
*result = 0xFFFFFFFFFFFFFFFF;
return 0;
}