Rockbox.org home
release
dev builds
extras
themes manual
wiki
device status forums
mailing lists
IRC bugs
patches
dev guide
translations



Rockbox mail archive

Subject: Source to state machine for parsing input

Source to state machine for parsing input

From: <postmaster_at_diffenbach.org>
Date: Fri, 10 Mar 2006 16:19:43 -0500

Enclosed please find source code to a simple state machine designed to
parse input. Your comments are requested.

The state machine can parse input from a refillable buffer of char,
maintaining state across buffer refills. The machine returns a pointer
  into the buffer, indicating where a match has been made, or
one-past-the-end of the buffer if buffer has been exhausted.

Files
state_machine.h: header file
state_machine.c: implementation
state_machine_client.c: demonstration of use

Each transition in the state machine allows only two alternatives, a
"match" and a "fail". It can match characters, newlines*, whitespace*,
or decimal digits. A match may be negated, e.g., NOT '#' matches any
character except '#'. (*Whitespace is here defined not to include
newlines; newlines are defined as '\n' or '\r'.)


Using the state machine requires the client programmer to provide an
array of state transitions. Each transition indicates a match
constraint, and the states to transition to on match or on fail. The
number or id of each transition is implied, by its position in the array.

To avoid having to use callback functions (and various casts on void
pointers), any transition can be marked as requiring a return to the
caller, on either of match or fail or both. The calling function can
then update any external state it holds, and then re-enter the state
machine for further parsing.


state_machine_client.c demonstrates using the state machine to parse
EXTINF formated files.

In particular, the state machine returns a pointers to
1) the first non-space character of lines not beginning with a "#"
character
or
2) the second character of lines beginning with a "#" character, if
the "#" character is not followed by an EXTINF formatted track length
or
3) the track name, on lines formatted using the EXTINF format
4) the EXTINF header line ("#EXTM3U") is ignored

This allows the finding of track names, or if no track name is
available, the track path, for input formatted in the EXTINF format,
or for input formatted as:
#Track name 1
track path 1
#Track Name 2
track path 3


Program output:

$ ./state_machine_client.exe
--Input follows on next line(s):
  path 1
#EXTM3U
#EXTINF:33333333, Name for path 2
   path 2

#random comment

#random comment 2
#Name for path 3
  path 3
#EXTIF,333Malformed
path 4
path 5
                 path 6
#crap
--end input

Showing what each pointer pointer, 0-5, points to in input. Newlines
are replaced with spaces:


0 path 1 #EXTM3U #EXTINF:33333333,
1 Name for path 2 path 2 #random comment #random comment 2 #
2 Name for path 3 path 3 #
3 EXTIF,333Malformed path 4
4 path 5
5 path 6 #crap


/*
  state_machine_client.c
  copyright 2006, Thomas Paul Diffenbach
*/

#include <stdio.h>
#include <string.h>

#include "state_machine.h"



int main() {
  static const struct state_transition states[] = {
    /* 0: */ { '#', 1, 17, MATCH },
    /* 1: */ { 'E', 2, 13, MATCH | RETURN_MATCH}, /* on sucess, maybe found start of name */
    /* 2: */ { 'X', 3, 14, MATCH },
    /* 3: */ { 'T', 4, 14, MATCH },
    
    /* 4: */ { 'M', 5, 7, MATCH },
    /* 5: */ { '3', 6, 14, MATCH },
    /* 6: */ { 'U', 15, 14, MATCH | RETURN_MATCH}, /* on sucess, wasn't name, was header */
    
    /* 7: */ { 'I', 8, 14, MATCH },
    /* 8: */ { 'N', 9, 14, MATCH },
    /* 9: */ { 'F', 10, 14, MATCH },
    /* 10: */ { ':', 11, 14, MATCH },
    
    /* 11: */ { '0', 11, 12, DIGIT },
    /* 12: */ { ',', 13, 14, MATCH },
    
    /* 13: */ { '0', 13, 14, WHITESPACE | RETURN_FAILURE }, /* on failure found start of name */
    /* 14: */ { '0', 14, 15, NEWLINE | NOT }, /* walk/loop until newline */
    
    /* 15: */ { '0', 15, 16, NEWLINE }, /* walk/loop until not newline */
    
    /* 16: */ { '#', 1, 17, MATCH }, /*re-start if next line begins with # */
    
    /* 17: */ { '0', 17, 18, WHITESPACE }, /*ignore leading whitespace before path */
    /* 18: */ { '0', 19, 15, NEWLINE | NOT | RETURN_MATCH},
    /* if this is not a newline, this is the path (following the name), increment the name/path list */
    
    /* 19: */ { '0', 19, 0, NEWLINE | NOT }, /* ignnore newlines until next non-newline, then restart */
  } ;
  
  
  const char* inputs[] = { "#EXTM3U\n#EXTINF:333,Name is here\npath is here\n#fake song\n#real song\n path2\npath3\npath4\n#name 2\npath5\n#EXM3name\npath\n#EXTINF:333,ZName is here\npath is here",
  "path 1\npath 2\npath 3" ,
  "path 1\n#coment1\npath 2\npath 3",
  "path 1\n#EXTM3U\npath 2\npath 3",
  "path 1\n#EXTM3\npath 2\npath 3",
  "path 1\n#EXTM3U\n#EXTINF\npath 2\npath 3",
  "path 1\n#EXTM3U\n#EXTINF:\npath 2\npath 3",
  "path 1\n#EXTM3U\n#EXTINF:33333333,\npath 2\npath 3",
  "path 1\n#EXTM3U\n#EXTINF:33333333, Name for path 2\npath 2\npath 3",
  "path 1\n#EXTM3U\n#EXTINF:33333333, Name for path 2\npath 2\n#Name for path 3\npath 3\npath 4",
  " path 1\n#EXTM3U\n#EXTINF:33333333, Name for path 2\n path 2\n\n#random comment\n\n#random comment 2\n#Name for path 3\n path 3\n#EXTIF,333Malformed\npath 4\npath 5\n path 6\n#crap",
  } ;
  
  const char* input = inputs[ sizeof( inputs ) / sizeof( inputs[ 0 ] ) - 1 ] ;
  const char* end = input + strlen( input ) + 1 ;
  
  printf( "--Input follows on next line(s):\n%s\n--end input\n\n", input ) ;
  
  struct state_machine_state state ;
  init_state_machine_state( &state, &input[0], end ) ;
  
  const char* offsets[ 10 ] ;
  int offset = 0 ;
  const char* p = &input[0];
  int found = 0 ;
  
  while( p != end ) {
    do {
      p = run_state_machine( &state, states ) ;
      /* printf( "current = %d, next = %d, s = %s\n", state.current_state, state.next_state, "" ) ; */
      if( p != end ) {
        if( state.current_state == 18 && state.next_state == 19 ) {
          if( !found ) {
            offsets[ offset ] = p ;
          } else {
            found = 0 ;
          }
          ++offset ;
        } else if( state.current_state == 6 ) {
          found = 0 ;
        } else {
          offsets[ offset ] = p ;
          found = 1 ;
        }
      }
    } while( p != state.end ) ;
    state.end = end - state.end >=4 ? state.end + 4 : end ;
  }
  
  offsets[ offset ] = end ;
 
  printf( "Showing what each pointer pointer, 0-%d, points to in input. Newlines are replaced with spaces:\n\n", offset - 1 ) ;
  
  int i ;
  for( i = 0 ; i < offset ; ++i ) {
    printf( "\n%d ", i ) ;
    const char* p = offsets[ i ] ;
    const char* e = offsets[ i + 1 ] ;
    while( p != e ) {
      printf( "%c", *p != '\n' ? *p : ' ' ) ;
      ++p ;
    }
  }
    
  return 1 ;
}



/*
  state_machine.c
  copyright 2006, Thomas Paul Diffenbach
*/

#include <ctype.h>

#include "state_machine.h"

typedef unsigned char uchar ;

static void set_state_machine_state( struct state_machine_state* s,
    uchar current_state, uchar next_state, const char* next_pos ) {
  s->current_state = current_state ;
  s->next_state = next_state ;
  s->next_pos = next_pos ;
}

void init_state_machine_state( struct state_machine_state* s, const char* start, const char* end ) {
  set_state_machine_state( s, 0, 0, start ) ;
  s->end = end ;
}

  
const char* run_state_machine( struct state_machine_state* state, const struct state_transition* transitions ) {

  const char* p = state->next_pos ;
  const char* end = state->end ;
  uchar current_state = state->next_state ;
  uchar next_state ;
  const char* n ;
  
  while( p != end ) {
    
    const struct state_transition* t = transitions + current_state ;
    uchar f = t->func ;
    uchar isnot = f & NOT ;
    uchar ismatch = 0 ;
    const int c = *p ;
    switch( f & FUNCS ) {
      case MATCH:
        ismatch = t->match_char == c ; break ;
      case NEWLINE:
        ismatch = c == '\r' || c== '\n' ; break ;
      case DIGIT:
        ismatch = isdigit( c ) ; break ;
      case WHITESPACE:
        ismatch = c != '\r' && c != '\n' && isspace( c ) ; break ;
      default:
        printf( "error, f & FUNCS = %d\n", f & FUNCS ) ;
    }
    
    if( isnot )
      ismatch = !ismatch ;
    
    if( ismatch ) {
      next_state = t->match_state ;
      n = p + 1 ;
    } else {
      next_state = t->fail_state ;
      n = p ;
    }
    
    //printf( "ismatch %d, isnot %d, current %d, next %d, incr %d, p\n", ismatch, isnot, current_state, next_state, n-p, *p ) ;
    
    if( ( ismatch && ( f & RETURN_MATCH) ) || ( ! ismatch && ( f & RETURN_FAILURE ) ) ) {
      set_state_machine_state( state, current_state, next_state, n ) ;
      return p ;
    }
    
    current_state = next_state ;
    p = n ;
  }
  
  set_state_machine_state( state, current_state, next_state, end ) ;
  return end ;
}


/*
  state_machine.h
  copyright 2006, Thomas Paul Diffenbach
*/

#ifndef __STATE_MACHINE_H__
#define __STATE_MACHINE_H__ 1


enum match_func_name {
  MATCH, NEWLINE, DIGIT, WHITESPACE,
  NOT = 32, RETURN_MATCH= 64, RETURN_FAILURE = 128, MAX = 255, FUNCS = 31 } ;

 struct state_transition {
  /*unsigned char number ;*/
  char match_char ;
  unsigned char match_state ;
  unsigned char fail_state ;
  enum match_func_name func;
} ;

struct state_machine_state {
  unsigned char current_state ;
  unsigned char next_state ;
  const char* next_pos ;
  const char* end ;
} ;

void init_state_machine_state( struct state_machine_state* s, const char* start, const char* end ) ;

const char* run_state_machine( struct state_machine_state* state, const struct state_transition* transitions ) ;

#endif
Received on 2006-03-10

Page template was last modified "Tue Sep 7 00:00:02 2021" The Rockbox Crew -- Privacy Policy