Browse Source

Integrate further with framework; unescape UTF-8.

Stefano Cossu 3 years ago
parent
commit
c27f43a402
13 changed files with 1275 additions and 67 deletions
  1. 6 0
      .gitignore
  2. 8 5
      Makefile
  3. 72 0
      include/core.h
  4. 17 7
      include/term.h
  5. 16 0
      src/codec/Makefile
  6. 1074 0
      src/codec/lempar.c
  7. 0 0
      src/codec/nt_grammar.y
  8. 74 38
      src/codec/nt_lexer.re
  9. 0 15
      src/codec/test.nt
  10. 1 0
      src/codec/test2.nt
  11. 1 0
      src/codec/test2.txt
  12. 4 0
      src/core.c
  13. 2 2
      src/term.c

+ 6 - 0
.gitignore

@@ -104,6 +104,12 @@ venv.bak/
 
 sandbox.c
 
+# Lexer and parser artifacts.
+
+src/codec/*_grammar.c
+src/codec/*_grammar.h
+src/codec/*_parser.c
+
 # IDE
 .syntastic*
 .vimrc

+ 8 - 5
Makefile

@@ -1,3 +1,4 @@
+CODEC_DIR=src/codec
 CC=gcc
 CFLAGS+= -Wall -D_XOPEN_SOURCE=500
 INCLUDE=-Iinclude -Iext/xxHash -Iext/openldap/libraries/liblmdb -Iext/uthash/src
@@ -9,6 +10,8 @@ SRC=ext/xxHash/xxhash.c ext/openldap/libraries/liblmdb/mdb.c \
 
 default: test
 
+build_parsers:; $(MAKE) -C $(CODEC_DIR)
+
 build:
 	$(CC) \
 		$(CFLAGS) -Werror
@@ -35,15 +38,15 @@ test:
 
 
 test_lexer:
-	cd src/codec; \
-	lemon -T/usr/share/lemon/lempar.c nt_grammar && \
-	re2c nt_lexer.re -o nt_lexer.c -T --case-ranges && \
-	cd ../../; \
+	cd src/codec && \
+	lemon -T/usr/share/lemon/lempar.c nt_grammar.y && \
+	re2c nt_lexer.re -o nt_codec.c -T --case-ranges && \
+	cd ../../
 	$(CC) \
 		$(CFLAGS) -g3 -DDEBUG \
 		$(INCLUDE) -I. \
 		$(LIB) \
-		$(SRC) src/codec/nt_lexer.c src/codec/nt_grammar.c \
+		$(SRC) src/codec/nt_codec.c src/codec/nt_grammar.c \
 		-o bin/test_lexer
 
 

+ 72 - 0
include/core.h

@@ -104,6 +104,78 @@ LSUP_rc mkdir_p(const char *path, mode_t mode);
 LSUP_rc
 rm_r (const char *path);
 
+/** @brief Convert a Unicode (UTF-8) code point to 4-byte sequence.
+ */
+/*
+inline uint16_t utf8_to_bytes (uint16_t c) {
+    unsigned char b[4] = {0};
+
+    if (c<0x80) *b++=c;
+    else if (c<0x800) *b++=192+c/64, *b++=128+c%64;
+    else if (c-0xd800u<0x800) goto error;
+    else if (c<0x10000) *b++=224+c/4096, *b++=128+c/64%64, *b++=128+c%64;
+    else if (c<0x110000)
+        *b++=240+c/262144, *b++=128+c/4096%64, *b++=128+c/64%64, *b++=128+c%64;
+    else return NULL;
+
+    return (uint16_t)b;
+}
+*/
+
+
+/**
+ * Encode a code point using UTF-8
+ *
+ * @author Ondřej Hruška <ondra@ondrovo.com>
+ * @license MIT
+ * https://gist.github.com/MightyPork/52eda3e5677b4b03524e40c9f0ab1da5
+ *
+ * @param out - output buffer (min 5 characters), will be 0-terminated
+ * @param utf - code point 0-0x10FFFF
+ * @return number of bytes on success, 0 on failure (also produces U+FFFD, which uses 3 bytes)
+ */
+inline int utf8_encode(const uint32_t utf, unsigned char *out)
+{
+  if (utf <= 0x7F) {
+    // Plain ASCII
+    out[0] = (char) utf;
+    out[1] = 0;
+    return 1;
+  }
+  else if (utf <= 0x07FF) {
+    // 2-byte unicode
+    out[0] = (char) (((utf >> 6) & 0x1F) | 0xC0);
+    out[1] = (char) (((utf >> 0) & 0x3F) | 0x80);
+    out[2] = 0;
+    return 2;
+  }
+  else if (utf <= 0xFFFF) {
+    // 3-byte unicode
+    out[0] = (char) (((utf >> 12) & 0x0F) | 0xE0);
+    out[1] = (char) (((utf >>  6) & 0x3F) | 0x80);
+    out[2] = (char) (((utf >>  0) & 0x3F) | 0x80);
+    out[3] = 0;
+    return 3;
+  }
+  else if (utf <= 0x10FFFF) {
+    // 4-byte unicode
+    out[0] = (char) (((utf >> 18) & 0x07) | 0xF0);
+    out[1] = (char) (((utf >> 12) & 0x3F) | 0x80);
+    out[2] = (char) (((utf >>  6) & 0x3F) | 0x80);
+    out[3] = (char) (((utf >>  0) & 0x3F) | 0x80);
+    out[4] = 0;
+    return 4;
+  }
+  else {
+    // error - use replacement character
+    out[0] = (char) 0xEF;
+    out[1] = (char) 0xBF;
+    out[2] = (char) 0xBD;
+    out[3] = 0;
+    return 0;
+  }
+}
+
 
 // Error handling via goto.
 #define CHECK(exp, rc, marker) (rc) = (exp); if ((rc) != LSUP_OK) goto marker

+ 17 - 7
include/term.h

@@ -25,14 +25,24 @@ typedef char LSUP_term_type;
 #define LSUP_TERM_BNODE          2
 #define LSUP_TERM_LITERAL        3
 
-typedef struct LSUP_Term {
-    char *data;
-    char *datatype;
+/** @brief Default data type for untyped literals.
+ */
+#define DEFAULT_DTYPE           "http://www.w3.org/2001/XMLSchema#string"
+
+
+typedef struct term_t {
+    char *              data;       // URI or literal value, or BNode label.
+    char *              datatype;   // Data type for literals.
     // This language variable currently supports a 2-digit ISO 639 language
-    // code and a 2-character ISO 3166-1 country code, separated by a hyphen.
-    // See https://tools.ietf.org/html/bcp47#section-2.1
-    langtag lang;
-    LSUP_term_type type;
+    union {
+        langtag         lang;       // Language tag. This variable currently
+                                    // supports a 2-digit ISO 639 language
+                                    // code and a 2-character ISO 3166-1
+                                    // country code, separated by a hyphen. See
+                                    // https://tools.ietf.org/html/bcp47#section-2.1
+        uint64_t        bnode_id;   // Blank node ID. TODO implement.
+    };
+    LSUP_term_type      type;       // Term type.
 } LSUP_Term;
 
 

+ 16 - 0
src/codec/Makefile

@@ -0,0 +1,16 @@
+LEXER = /usr/bin/re2c
+PARSER = /usr/bin/lemon
+
+# List of codec to be generated. Each must be prefixed with "_codec".
+codecs = nt_codec# ttl_codec […]
+
+parsers := $(codecs:_codec=_parser.c)
+
+all: $(parsers)
+
+$(parsers): %_parser.c: %_lexer.re %_grammar.c
+	$(LEXER) $< -o $@ -T --case-ranges
+
+%_grammar.c: %_grammar.y
+	$(PARSER) $< -c -Tlempar.c
+

+ 1074 - 0
src/codec/lempar.c

@@ -0,0 +1,1074 @@
+/*
+** 2000-05-29
+**
+** The author disclaims copyright to this source code.  In place of
+** a legal notice, here is a blessing:
+**
+**    May you do good and not evil.
+**    May you find forgiveness for yourself and forgive others.
+**    May you share freely, never taking more than you give.
+**
+*************************************************************************
+** Driver template for the LEMON parser generator.
+**
+** The "lemon" program processes an LALR(1) input grammar file, then uses
+** this template to construct a parser.  The "lemon" program inserts text
+** at each "%%" line.  Also, any "P-a-r-s-e" identifer prefix (without the
+** interstitial "-" characters) contained in this template is changed into
+** the value of the %name directive from the grammar.  Otherwise, the content
+** of this template is copied straight through into the generate parser
+** source file.
+**
+** The following is the concatenation of all %include directives from the
+** input grammar file:
+*/
+/************ Begin %include sections from the grammar ************************/
+%%
+/**************** End of %include directives **********************************/
+/* These constants specify the various numeric values for terminal symbols.
+***************** Begin token definitions *************************************/
+%%
+/**************** End token definitions ***************************************/
+
+/* The next sections is a series of control #defines.
+** various aspects of the generated parser.
+**    YYCODETYPE         is the data type used to store the integer codes
+**                       that represent terminal and non-terminal symbols.
+**                       "unsigned char" is used if there are fewer than
+**                       256 symbols.  Larger types otherwise.
+**    YYNOCODE           is a number of type YYCODETYPE that is not used for
+**                       any terminal or nonterminal symbol.
+**    YYFALLBACK         If defined, this indicates that one or more tokens
+**                       (also known as: "terminal symbols") have fall-back
+**                       values which should be used if the original symbol
+**                       would not parse.  This permits keywords to sometimes
+**                       be used as identifiers, for example.
+**    YYACTIONTYPE       is the data type used for "action codes" - numbers
+**                       that indicate what to do in response to the next
+**                       token.
+**    ParseTOKENTYPE     is the data type used for minor type for terminal
+**                       symbols.  Background: A "minor type" is a semantic
+**                       value associated with a terminal or non-terminal
+**                       symbols.  For example, for an "ID" terminal symbol,
+**                       the minor type might be the name of the identifier.
+**                       Each non-terminal can have a different minor type.
+**                       Terminal symbols all have the same minor type, though.
+**                       This macros defines the minor type for terminal 
+**                       symbols.
+**    YYMINORTYPE        is the data type used for all minor types.
+**                       This is typically a union of many types, one of
+**                       which is ParseTOKENTYPE.  The entry in the union
+**                       for terminal symbols is called "yy0".
+**    YYSTACKDEPTH       is the maximum depth of the parser's stack.  If
+**                       zero the stack is dynamically sized using realloc()
+**    ParseARG_SDECL     A static variable declaration for the %extra_argument
+**    ParseARG_PDECL     A parameter declaration for the %extra_argument
+**    ParseARG_PARAM     Code to pass %extra_argument as a subroutine parameter
+**    ParseARG_STORE     Code to store %extra_argument into yypParser
+**    ParseARG_FETCH     Code to extract %extra_argument from yypParser
+**    ParseCTX_*         As ParseARG_ except for %extra_context
+**    YYERRORSYMBOL      is the code number of the error symbol.  If not
+**                       defined, then do no error processing.
+**    YYNSTATE           the combined number of states.
+**    YYNRULE            the number of rules in the grammar
+**    YYNTOKEN           Number of terminal symbols
+**    YY_MAX_SHIFT       Maximum value for shift actions
+**    YY_MIN_SHIFTREDUCE Minimum value for shift-reduce actions
+**    YY_MAX_SHIFTREDUCE Maximum value for shift-reduce actions
+**    YY_ERROR_ACTION    The yy_action[] code for syntax error
+**    YY_ACCEPT_ACTION   The yy_action[] code for accept
+**    YY_NO_ACTION       The yy_action[] code for no-op
+**    YY_MIN_REDUCE      Minimum value for reduce actions
+**    YY_MAX_REDUCE      Maximum value for reduce actions
+*/
+#ifndef INTERFACE
+# define INTERFACE 1
+#endif
+/************* Begin control #defines *****************************************/
+%%
+/************* End control #defines *******************************************/
+#define YY_NLOOKAHEAD ((int)(sizeof(yy_lookahead)/sizeof(yy_lookahead[0])))
+
+/* Define the yytestcase() macro to be a no-op if is not already defined
+** otherwise.
+**
+** Applications can choose to define yytestcase() in the %include section
+** to a macro that can assist in verifying code coverage.  For production
+** code the yytestcase() macro should be turned off.  But it is useful
+** for testing.
+*/
+#ifndef yytestcase
+# define yytestcase(X)
+#endif
+
+
+/* Next are the tables used to determine what action to take based on the
+** current state and lookahead token.  These tables are used to implement
+** functions that take a state number and lookahead value and return an
+** action integer.  
+**
+** Suppose the action integer is N.  Then the action is determined as
+** follows
+**
+**   0 <= N <= YY_MAX_SHIFT             Shift N.  That is, push the lookahead
+**                                      token onto the stack and goto state N.
+**
+**   N between YY_MIN_SHIFTREDUCE       Shift to an arbitrary state then
+**     and YY_MAX_SHIFTREDUCE           reduce by rule N-YY_MIN_SHIFTREDUCE.
+**
+**   N == YY_ERROR_ACTION               A syntax error has occurred.
+**
+**   N == YY_ACCEPT_ACTION              The parser accepts its input.
+**
+**   N == YY_NO_ACTION                  No such action.  Denotes unused
+**                                      slots in the yy_action[] table.
+**
+**   N between YY_MIN_REDUCE            Reduce by rule N-YY_MIN_REDUCE
+**     and YY_MAX_REDUCE
+**
+** The action table is constructed as a single large table named yy_action[].
+** Given state S and lookahead X, the action is computed as either:
+**
+**    (A)   N = yy_action[ yy_shift_ofst[S] + X ]
+**    (B)   N = yy_default[S]
+**
+** The (A) formula is preferred.  The B formula is used instead if
+** yy_lookahead[yy_shift_ofst[S]+X] is not equal to X.
+**
+** The formulas above are for computing the action when the lookahead is
+** a terminal symbol.  If the lookahead is a non-terminal (as occurs after
+** a reduce action) then the yy_reduce_ofst[] array is used in place of
+** the yy_shift_ofst[] array.
+**
+** The following are the tables generated in this section:
+**
+**  yy_action[]        A single table containing all actions.
+**  yy_lookahead[]     A table containing the lookahead for each entry in
+**                     yy_action.  Used to detect hash collisions.
+**  yy_shift_ofst[]    For each state, the offset into yy_action for
+**                     shifting terminals.
+**  yy_reduce_ofst[]   For each state, the offset into yy_action for
+**                     shifting non-terminals after a reduce.
+**  yy_default[]       Default action for each state.
+**
+*********** Begin parsing tables **********************************************/
+%%
+/********** End of lemon-generated parsing tables *****************************/
+
+/* The next table maps tokens (terminal symbols) into fallback tokens.  
+** If a construct like the following:
+** 
+**      %fallback ID X Y Z.
+**
+** appears in the grammar, then ID becomes a fallback token for X, Y,
+** and Z.  Whenever one of the tokens X, Y, or Z is input to the parser
+** but it does not parse, the type of the token is changed to ID and
+** the parse is retried before an error is thrown.
+**
+** This feature can be used, for example, to cause some keywords in a language
+** to revert to identifiers if they keyword does not apply in the context where
+** it appears.
+*/
+#ifdef YYFALLBACK
+static const YYCODETYPE yyFallback[] = {
+%%
+};
+#endif /* YYFALLBACK */
+
+/* The following structure represents a single element of the
+** parser's stack.  Information stored includes:
+**
+**   +  The state number for the parser at this level of the stack.
+**
+**   +  The value of the token stored at this level of the stack.
+**      (In other words, the "major" token.)
+**
+**   +  The semantic value stored at this level of the stack.  This is
+**      the information used by the action routines in the grammar.
+**      It is sometimes called the "minor" token.
+**
+** After the "shift" half of a SHIFTREDUCE action, the stateno field
+** actually contains the reduce action for the second half of the
+** SHIFTREDUCE.
+*/
+struct yyStackEntry {
+  YYACTIONTYPE stateno;  /* The state-number, or reduce action in SHIFTREDUCE */
+  YYCODETYPE major;      /* The major token value.  This is the code
+                         ** number for the token at this stack level */
+  YYMINORTYPE minor;     /* The user-supplied minor token value.  This
+                         ** is the value of the token  */
+};
+typedef struct yyStackEntry yyStackEntry;
+
+/* The state of the parser is completely contained in an instance of
+** the following structure */
+struct yyParser {
+  yyStackEntry *yytos;          /* Pointer to top element of the stack */
+#ifdef YYTRACKMAXSTACKDEPTH
+  int yyhwm;                    /* High-water mark of the stack */
+#endif
+#ifndef YYNOERRORRECOVERY
+  int yyerrcnt;                 /* Shifts left before out of the error */
+#endif
+  ParseARG_SDECL                /* A place to hold %extra_argument */
+  ParseCTX_SDECL                /* A place to hold %extra_context */
+#if YYSTACKDEPTH<=0
+  int yystksz;                  /* Current side of the stack */
+  yyStackEntry *yystack;        /* The parser's stack */
+  yyStackEntry yystk0;          /* First stack entry */
+#else
+  yyStackEntry yystack[YYSTACKDEPTH];  /* The parser's stack */
+  yyStackEntry *yystackEnd;            /* Last entry in the stack */
+#endif
+};
+typedef struct yyParser yyParser;
+
+#ifndef NDEBUG
+#include <stdio.h>
+#include <assert.h>
+static FILE *yyTraceFILE = 0;
+static char *yyTracePrompt = 0;
+#endif /* NDEBUG */
+
+#ifndef NDEBUG
+/* 
+** Turn parser tracing on by giving a stream to which to write the trace
+** and a prompt to preface each trace message.  Tracing is turned off
+** by making either argument NULL 
+**
+** Inputs:
+** <ul>
+** <li> A FILE* to which trace output should be written.
+**      If NULL, then tracing is turned off.
+** <li> A prefix string written at the beginning of every
+**      line of trace output.  If NULL, then tracing is
+**      turned off.
+** </ul>
+**
+** Outputs:
+** None.
+*/
+void ParseTrace(FILE *TraceFILE, char *zTracePrompt){
+  yyTraceFILE = TraceFILE;
+  yyTracePrompt = zTracePrompt;
+  if( yyTraceFILE==0 ) yyTracePrompt = 0;
+  else if( yyTracePrompt==0 ) yyTraceFILE = 0;
+}
+#endif /* NDEBUG */
+
+#if defined(YYCOVERAGE) || !defined(NDEBUG)
+/* For tracing shifts, the names of all terminals and nonterminals
+** are required.  The following table supplies these names */
+static const char *const yyTokenName[] = { 
+%%
+};
+#endif /* defined(YYCOVERAGE) || !defined(NDEBUG) */
+
+#ifndef NDEBUG
+/* For tracing reduce actions, the names of all rules are required.
+*/
+static const char *const yyRuleName[] = {
+%%
+};
+#endif /* NDEBUG */
+
+
+#if YYSTACKDEPTH<=0
+/*
+** Try to increase the size of the parser stack.  Return the number
+** of errors.  Return 0 on success.
+*/
+static int yyGrowStack(yyParser *p){
+  int newSize;
+  int idx;
+  yyStackEntry *pNew;
+
+  newSize = p->yystksz*2 + 100;
+  idx = p->yytos ? (int)(p->yytos - p->yystack) : 0;
+  if( p->yystack==&p->yystk0 ){
+    pNew = malloc(newSize*sizeof(pNew[0]));
+    if( pNew ) pNew[0] = p->yystk0;
+  }else{
+    pNew = realloc(p->yystack, newSize*sizeof(pNew[0]));
+  }
+  if( pNew ){
+    p->yystack = pNew;
+    p->yytos = &p->yystack[idx];
+#ifndef NDEBUG
+    if( yyTraceFILE ){
+      fprintf(yyTraceFILE,"%sStack grows from %d to %d entries.\n",
+              yyTracePrompt, p->yystksz, newSize);
+    }
+#endif
+    p->yystksz = newSize;
+  }
+  return pNew==0; 
+}
+#endif
+
+/* Datatype of the argument to the memory allocated passed as the
+** second argument to ParseAlloc() below.  This can be changed by
+** putting an appropriate #define in the %include section of the input
+** grammar.
+*/
+#ifndef YYMALLOCARGTYPE
+# define YYMALLOCARGTYPE size_t
+#endif
+
+/* Initialize a new parser that has already been allocated.
+*/
+void ParseInit(void *yypRawParser ParseCTX_PDECL){
+  yyParser *yypParser = (yyParser*)yypRawParser;
+  ParseCTX_STORE
+#ifdef YYTRACKMAXSTACKDEPTH
+  yypParser->yyhwm = 0;
+#endif
+#if YYSTACKDEPTH<=0
+  yypParser->yytos = NULL;
+  yypParser->yystack = NULL;
+  yypParser->yystksz = 0;
+  if( yyGrowStack(yypParser) ){
+    yypParser->yystack = &yypParser->yystk0;
+    yypParser->yystksz = 1;
+  }
+#endif
+#ifndef YYNOERRORRECOVERY
+  yypParser->yyerrcnt = -1;
+#endif
+  yypParser->yytos = yypParser->yystack;
+  yypParser->yystack[0].stateno = 0;
+  yypParser->yystack[0].major = 0;
+#if YYSTACKDEPTH>0
+  yypParser->yystackEnd = &yypParser->yystack[YYSTACKDEPTH-1];
+#endif
+}
+
+#ifndef Parse_ENGINEALWAYSONSTACK
+/* 
+** This function allocates a new parser.
+** The only argument is a pointer to a function which works like
+** malloc.
+**
+** Inputs:
+** A pointer to the function used to allocate memory.
+**
+** Outputs:
+** A pointer to a parser.  This pointer is used in subsequent calls
+** to Parse and ParseFree.
+*/
+void *ParseAlloc(void *(*mallocProc)(YYMALLOCARGTYPE) ParseCTX_PDECL){
+  yyParser *yypParser;
+  yypParser = (yyParser*)(*mallocProc)( (YYMALLOCARGTYPE)sizeof(yyParser) );
+  if( yypParser ){
+    ParseCTX_STORE
+    ParseInit(yypParser ParseCTX_PARAM);
+  }
+  return (void*)yypParser;
+}
+#endif /* Parse_ENGINEALWAYSONSTACK */
+
+
+/* The following function deletes the "minor type" or semantic value
+** associated with a symbol.  The symbol can be either a terminal
+** or nonterminal. "yymajor" is the symbol code, and "yypminor" is
+** a pointer to the value to be deleted.  The code used to do the 
+** deletions is derived from the %destructor and/or %token_destructor
+** directives of the input grammar.
+*/
+static void yy_destructor(
+  yyParser *yypParser,    /* The parser */
+  YYCODETYPE yymajor,     /* Type code for object to destroy */
+  YYMINORTYPE *yypminor   /* The object to be destroyed */
+){
+  ParseARG_FETCH
+  ParseCTX_FETCH
+  switch( yymajor ){
+    /* Here is inserted the actions which take place when a
+    ** terminal or non-terminal is destroyed.  This can happen
+    ** when the symbol is popped from the stack during a
+    ** reduce or during error processing or when a parser is 
+    ** being destroyed before it is finished parsing.
+    **
+    ** Note: during a reduce, the only symbols destroyed are those
+    ** which appear on the RHS of the rule, but which are *not* used
+    ** inside the C code.
+    */
+/********* Begin destructor definitions ***************************************/
+%%
+/********* End destructor definitions *****************************************/
+    default:  break;   /* If no destructor action specified: do nothing */
+  }
+}
+
+/*
+** Pop the parser's stack once.
+**
+** If there is a destructor routine associated with the token which
+** is popped from the stack, then call it.
+*/
+static void yy_pop_parser_stack(yyParser *pParser){
+  yyStackEntry *yytos;
+  assert( pParser->yytos!=0 );
+  assert( pParser->yytos > pParser->yystack );
+  yytos = pParser->yytos--;
+#ifndef NDEBUG
+  if( yyTraceFILE ){
+    fprintf(yyTraceFILE,"%sPopping %s\n",
+      yyTracePrompt,
+      yyTokenName[yytos->major]);
+  }
+#endif
+  yy_destructor(pParser, yytos->major, &yytos->minor);
+}
+
+/*
+** Clear all secondary memory allocations from the parser
+*/
+void ParseFinalize(void *p){
+  yyParser *pParser = (yyParser*)p;
+  while( pParser->yytos>pParser->yystack ) yy_pop_parser_stack(pParser);
+#if YYSTACKDEPTH<=0
+  if( pParser->yystack!=&pParser->yystk0 ) free(pParser->yystack);
+#endif
+}
+
+#ifndef Parse_ENGINEALWAYSONSTACK
+/* 
+** Deallocate and destroy a parser.  Destructors are called for
+** all stack elements before shutting the parser down.
+**
+** If the YYPARSEFREENEVERNULL macro exists (for example because it
+** is defined in a %include section of the input grammar) then it is
+** assumed that the input pointer is never NULL.
+*/
+void ParseFree(
+  void *p,                    /* The parser to be deleted */
+  void (*freeProc)(void*)     /* Function used to reclaim memory */
+){
+#ifndef YYPARSEFREENEVERNULL
+  if( p==0 ) return;
+#endif
+  ParseFinalize(p);
+  (*freeProc)(p);
+}
+#endif /* Parse_ENGINEALWAYSONSTACK */
+
+/*
+** Return the peak depth of the stack for a parser.
+*/
+#ifdef YYTRACKMAXSTACKDEPTH
+int ParseStackPeak(void *p){
+  yyParser *pParser = (yyParser*)p;
+  return pParser->yyhwm;
+}
+#endif
+
+/* This array of booleans keeps track of the parser statement
+** coverage.  The element yycoverage[X][Y] is set when the parser
+** is in state X and has a lookahead token Y.  In a well-tested
+** systems, every element of this matrix should end up being set.
+*/
+#if defined(YYCOVERAGE)
+static unsigned char yycoverage[YYNSTATE][YYNTOKEN];
+#endif
+
+/*
+** Write into out a description of every state/lookahead combination that
+**
+**   (1)  has not been used by the parser, and
+**   (2)  is not a syntax error.
+**
+** Return the number of missed state/lookahead combinations.
+*/
+#if defined(YYCOVERAGE)
+int ParseCoverage(FILE *out){
+  int stateno, iLookAhead, i;
+  int nMissed = 0;
+  for(stateno=0; stateno<YYNSTATE; stateno++){
+    i = yy_shift_ofst[stateno];
+    for(iLookAhead=0; iLookAhead<YYNTOKEN; iLookAhead++){
+      if( yy_lookahead[i+iLookAhead]!=iLookAhead ) continue;
+      if( yycoverage[stateno][iLookAhead]==0 ) nMissed++;
+      if( out ){
+        fprintf(out,"State %d lookahead %s %s\n", stateno,
+                yyTokenName[iLookAhead],
+                yycoverage[stateno][iLookAhead] ? "ok" : "missed");
+      }
+    }
+  }
+  return nMissed;
+}
+#endif
+
+/*
+** Find the appropriate action for a parser given the terminal
+** look-ahead token iLookAhead.
+*/
+static YYACTIONTYPE yy_find_shift_action(
+  YYCODETYPE iLookAhead,    /* The look-ahead token */
+  YYACTIONTYPE stateno      /* Current state number */
+){
+  int i;
+
+  if( stateno>YY_MAX_SHIFT ) return stateno;
+  assert( stateno <= YY_SHIFT_COUNT );
+#if defined(YYCOVERAGE)
+  yycoverage[stateno][iLookAhead] = 1;
+#endif
+  do{
+    i = yy_shift_ofst[stateno];
+    assert( i>=0 );
+    assert( i<=YY_ACTTAB_COUNT );
+    assert( i+YYNTOKEN<=(int)YY_NLOOKAHEAD );
+    assert( iLookAhead!=YYNOCODE );
+    assert( iLookAhead < YYNTOKEN );
+    i += iLookAhead;
+    assert( i<(int)YY_NLOOKAHEAD );
+    if( yy_lookahead[i]!=iLookAhead ){
+#ifdef YYFALLBACK
+      YYCODETYPE iFallback;            /* Fallback token */
+      assert( iLookAhead<sizeof(yyFallback)/sizeof(yyFallback[0]) );
+      iFallback = yyFallback[iLookAhead];
+      if( iFallback!=0 ){
+#ifndef NDEBUG
+        if( yyTraceFILE ){
+          fprintf(yyTraceFILE, "%sFALLBACK %s => %s\n",
+             yyTracePrompt, yyTokenName[iLookAhead], yyTokenName[iFallback]);
+        }
+#endif
+        assert( yyFallback[iFallback]==0 ); /* Fallback loop must terminate */
+        iLookAhead = iFallback;
+        continue;
+      }
+#endif
+#ifdef YYWILDCARD
+      {
+        int j = i - iLookAhead + YYWILDCARD;
+        assert( j<(int)(sizeof(yy_lookahead)/sizeof(yy_lookahead[0])) );
+        if( yy_lookahead[j]==YYWILDCARD && iLookAhead>0 ){
+#ifndef NDEBUG
+          if( yyTraceFILE ){
+            fprintf(yyTraceFILE, "%sWILDCARD %s => %s\n",
+               yyTracePrompt, yyTokenName[iLookAhead],
+               yyTokenName[YYWILDCARD]);
+          }
+#endif /* NDEBUG */
+          return yy_action[j];
+        }
+      }
+#endif /* YYWILDCARD */
+      return yy_default[stateno];
+    }else{
+      assert( i>=0 && i<(int)(sizeof(yy_action)/sizeof(yy_action[0])) );
+      return yy_action[i];
+    }
+  }while(1);
+}
+
+/*
+** Find the appropriate action for a parser given the non-terminal
+** look-ahead token iLookAhead.
+*/
+static YYACTIONTYPE yy_find_reduce_action(
+  YYACTIONTYPE stateno,     /* Current state number */
+  YYCODETYPE iLookAhead     /* The look-ahead token */
+){
+  int i;
+#ifdef YYERRORSYMBOL
+  if( stateno>YY_REDUCE_COUNT ){
+    return yy_default[stateno];
+  }
+#else
+  assert( stateno<=YY_REDUCE_COUNT );
+#endif
+  i = yy_reduce_ofst[stateno];
+  assert( iLookAhead!=YYNOCODE );
+  i += iLookAhead;
+#ifdef YYERRORSYMBOL
+  if( i<0 || i>=YY_ACTTAB_COUNT || yy_lookahead[i]!=iLookAhead ){
+    return yy_default[stateno];
+  }
+#else
+  assert( i>=0 && i<YY_ACTTAB_COUNT );
+  assert( yy_lookahead[i]==iLookAhead );
+#endif
+  return yy_action[i];
+}
+
+/*
+** The following routine is called if the stack overflows.
+*/
+static void yyStackOverflow(yyParser *yypParser){
+   ParseARG_FETCH
+   ParseCTX_FETCH
+#ifndef NDEBUG
+   if( yyTraceFILE ){
+     fprintf(yyTraceFILE,"%sStack Overflow!\n",yyTracePrompt);
+   }
+#endif
+   while( yypParser->yytos>yypParser->yystack ) yy_pop_parser_stack(yypParser);
+   /* Here code is inserted which will execute if the parser
+   ** stack every overflows */
+/******** Begin %stack_overflow code ******************************************/
+%%
+/******** End %stack_overflow code ********************************************/
+   ParseARG_STORE /* Suppress warning about unused %extra_argument var */
+   ParseCTX_STORE
+}
+
+/*
+** Print tracing information for a SHIFT action
+*/
+#ifndef NDEBUG
+static void yyTraceShift(yyParser *yypParser, int yyNewState, const char *zTag){
+  if( yyTraceFILE ){
+    if( yyNewState<YYNSTATE ){
+      fprintf(yyTraceFILE,"%s%s '%s', go to state %d\n",
+         yyTracePrompt, zTag, yyTokenName[yypParser->yytos->major],
+         yyNewState);
+    }else{
+      fprintf(yyTraceFILE,"%s%s '%s', pending reduce %d\n",
+         yyTracePrompt, zTag, yyTokenName[yypParser->yytos->major],
+         yyNewState - YY_MIN_REDUCE);
+    }
+  }
+}
+#else
+# define yyTraceShift(X,Y,Z)
+#endif
+
+/*
+** Perform a shift action.
+*/
+static void yy_shift(
+  yyParser *yypParser,          /* The parser to be shifted */
+  YYACTIONTYPE yyNewState,      /* The new state to shift in */
+  YYCODETYPE yyMajor,           /* The major token to shift in */
+  ParseTOKENTYPE yyMinor        /* The minor token to shift in */
+){
+  yyStackEntry *yytos;
+  yypParser->yytos++;
+#ifdef YYTRACKMAXSTACKDEPTH
+  if( (int)(yypParser->yytos - yypParser->yystack)>yypParser->yyhwm ){
+    yypParser->yyhwm++;
+    assert( yypParser->yyhwm == (int)(yypParser->yytos - yypParser->yystack) );
+  }
+#endif
+#if YYSTACKDEPTH>0 
+  if( yypParser->yytos>yypParser->yystackEnd ){
+    yypParser->yytos--;
+    yyStackOverflow(yypParser);
+    return;
+  }
+#else
+  if( yypParser->yytos>=&yypParser->yystack[yypParser->yystksz] ){
+    if( yyGrowStack(yypParser) ){
+      yypParser->yytos--;
+      yyStackOverflow(yypParser);
+      return;
+    }
+  }
+#endif
+  if( yyNewState > YY_MAX_SHIFT ){
+    yyNewState += YY_MIN_REDUCE - YY_MIN_SHIFTREDUCE;
+  }
+  yytos = yypParser->yytos;
+  yytos->stateno = yyNewState;
+  yytos->major = yyMajor;
+  yytos->minor.yy0 = yyMinor;
+  yyTraceShift(yypParser, yyNewState, "Shift");
+}
+
+/* For rule J, yyRuleInfoLhs[J] contains the symbol on the left-hand side
+** of that rule */
+static const YYCODETYPE yyRuleInfoLhs[] = {
+%%
+};
+
+/* For rule J, yyRuleInfoNRhs[J] contains the negative of the number
+** of symbols on the right-hand side of that rule. */
+static const signed char yyRuleInfoNRhs[] = {
+%%
+};
+
+static void yy_accept(yyParser*);  /* Forward Declaration */
+
+/*
+** Perform a reduce action and the shift that must immediately
+** follow the reduce.
+**
+** The yyLookahead and yyLookaheadToken parameters provide reduce actions
+** access to the lookahead token (if any).  The yyLookahead will be YYNOCODE
+** if the lookahead token has already been consumed.  As this procedure is
+** only called from one place, optimizing compilers will in-line it, which
+** means that the extra parameters have no performance impact.
+*/
+static YYACTIONTYPE yy_reduce(
+  yyParser *yypParser,         /* The parser */
+  unsigned int yyruleno,       /* Number of the rule by which to reduce */
+  int yyLookahead,             /* Lookahead token, or YYNOCODE if none */
+  ParseTOKENTYPE yyLookaheadToken  /* Value of the lookahead token */
+  ParseCTX_PDECL                   /* %extra_context */
+){
+  int yygoto;                     /* The next state */
+  YYACTIONTYPE yyact;             /* The next action */
+  yyStackEntry *yymsp;            /* The top of the parser's stack */
+  int yysize;                     /* Amount to pop the stack */
+  ParseARG_FETCH
+  (void)yyLookahead;
+  (void)yyLookaheadToken;
+  yymsp = yypParser->yytos;
+  assert( yyruleno<(int)(sizeof(yyRuleName)/sizeof(yyRuleName[0])) );
+#ifndef NDEBUG
+  if( yyTraceFILE ){
+    yysize = yyRuleInfoNRhs[yyruleno];
+    if( yysize ){
+      fprintf(yyTraceFILE, "%sReduce %d [%s]%s, pop back to state %d.\n",
+        yyTracePrompt,
+        yyruleno, yyRuleName[yyruleno],
+        yyruleno<YYNRULE_WITH_ACTION ? "" : " without external action",
+        yymsp[yysize].stateno);
+    }else{
+      fprintf(yyTraceFILE, "%sReduce %d [%s]%s.\n",
+        yyTracePrompt, yyruleno, yyRuleName[yyruleno],
+        yyruleno<YYNRULE_WITH_ACTION ? "" : " without external action");
+    }
+  }
+#endif /* NDEBUG */
+
+  /* Check that the stack is large enough to grow by a single entry
+  ** if the RHS of the rule is empty.  This ensures that there is room
+  ** enough on the stack to push the LHS value */
+  if( yyRuleInfoNRhs[yyruleno]==0 ){
+#ifdef YYTRACKMAXSTACKDEPTH
+    if( (int)(yypParser->yytos - yypParser->yystack)>yypParser->yyhwm ){
+      yypParser->yyhwm++;
+      assert( yypParser->yyhwm == (int)(yypParser->yytos - yypParser->yystack));
+    }
+#endif
+#if YYSTACKDEPTH>0 
+    if( yypParser->yytos>=yypParser->yystackEnd ){
+      yyStackOverflow(yypParser);
+      /* The call to yyStackOverflow() above pops the stack until it is
+      ** empty, causing the main parser loop to exit.  So the return value
+      ** is never used and does not matter. */
+      return 0;
+    }
+#else
+    if( yypParser->yytos>=&yypParser->yystack[yypParser->yystksz-1] ){
+      if( yyGrowStack(yypParser) ){
+        yyStackOverflow(yypParser);
+        /* The call to yyStackOverflow() above pops the stack until it is
+        ** empty, causing the main parser loop to exit.  So the return value
+        ** is never used and does not matter. */
+        return 0;
+      }
+      yymsp = yypParser->yytos;
+    }
+#endif
+  }
+
+  switch( yyruleno ){
+  /* Beginning here are the reduction cases.  A typical example
+  ** follows:
+  **   case 0:
+  **  #line <lineno> <grammarfile>
+  **     { ... }           // User supplied code
+  **  #line <lineno> <thisfile>
+  **     break;
+  */
+/********** Begin reduce actions **********************************************/
+%%
+/********** End reduce actions ************************************************/
+  };
+  assert( yyruleno<sizeof(yyRuleInfoLhs)/sizeof(yyRuleInfoLhs[0]) );
+  yygoto = yyRuleInfoLhs[yyruleno];
+  yysize = yyRuleInfoNRhs[yyruleno];
+  yyact = yy_find_reduce_action(yymsp[yysize].stateno,(YYCODETYPE)yygoto);
+
+  /* There are no SHIFTREDUCE actions on nonterminals because the table
+  ** generator has simplified them to pure REDUCE actions. */
+  assert( !(yyact>YY_MAX_SHIFT && yyact<=YY_MAX_SHIFTREDUCE) );
+
+  /* It is not possible for a REDUCE to be followed by an error */
+  assert( yyact!=YY_ERROR_ACTION );
+
+  yymsp += yysize+1;
+  yypParser->yytos = yymsp;
+  yymsp->stateno = (YYACTIONTYPE)yyact;
+  yymsp->major = (YYCODETYPE)yygoto;
+  yyTraceShift(yypParser, yyact, "... then shift");
+  return yyact;
+}
+
+/*
+** The following code executes when the parse fails
+*/
+#ifndef YYNOERRORRECOVERY
+static void yy_parse_failed(
+  yyParser *yypParser           /* The parser */
+){
+  ParseARG_FETCH
+  ParseCTX_FETCH
+#ifndef NDEBUG
+  if( yyTraceFILE ){
+    fprintf(yyTraceFILE,"%sFail!\n",yyTracePrompt);
+  }
+#endif
+  while( yypParser->yytos>yypParser->yystack ) yy_pop_parser_stack(yypParser);
+  /* Here code is inserted which will be executed whenever the
+  ** parser fails */
+/************ Begin %parse_failure code ***************************************/
+%%
+/************ End %parse_failure code *****************************************/
+  ParseARG_STORE /* Suppress warning about unused %extra_argument variable */
+  ParseCTX_STORE
+}
+#endif /* YYNOERRORRECOVERY */
+
+/*
+** The following code executes when a syntax error first occurs.
+*/
+static void yy_syntax_error(
+  yyParser *yypParser,           /* The parser */
+  int yymajor,                   /* The major type of the error token */
+  ParseTOKENTYPE yyminor         /* The minor type of the error token */
+){
+  ParseARG_FETCH
+  ParseCTX_FETCH
+#define TOKEN yyminor
+/************ Begin %syntax_error code ****************************************/
+%%
+/************ End %syntax_error code ******************************************/
+  ParseARG_STORE /* Suppress warning about unused %extra_argument variable */
+  ParseCTX_STORE
+}
+
+/*
+** The following is executed when the parser accepts
+*/
+static void yy_accept(
+  yyParser *yypParser           /* The parser */
+){
+  ParseARG_FETCH
+  ParseCTX_FETCH
+#ifndef NDEBUG
+  if( yyTraceFILE ){
+    fprintf(yyTraceFILE,"%sAccept!\n",yyTracePrompt);
+  }
+#endif
+#ifndef YYNOERRORRECOVERY
+  yypParser->yyerrcnt = -1;
+#endif
+  assert( yypParser->yytos==yypParser->yystack );
+  /* Here code is inserted which will be executed whenever the
+  ** parser accepts */
+/*********** Begin %parse_accept code *****************************************/
+%%
+/*********** End %parse_accept code *******************************************/
+  ParseARG_STORE /* Suppress warning about unused %extra_argument variable */
+  ParseCTX_STORE
+}
+
+/* The main parser program.
+** The first argument is a pointer to a structure obtained from
+** "ParseAlloc" which describes the current state of the parser.
+** The second argument is the major token number.  The third is
+** the minor token.  The fourth optional argument is whatever the
+** user wants (and specified in the grammar) and is available for
+** use by the action routines.
+**
+** Inputs:
+** <ul>
+** <li> A pointer to the parser (an opaque structure.)
+** <li> The major token number.
+** <li> The minor token number.
+** <li> An option argument of a grammar-specified type.
+** </ul>
+**
+** Outputs:
+** None.
+*/
+void Parse(
+  void *yyp,                   /* The parser */
+  int yymajor,                 /* The major token code number */
+  ParseTOKENTYPE yyminor       /* The value for the token */
+  ParseARG_PDECL               /* Optional %extra_argument parameter */
+){
+  YYMINORTYPE yyminorunion;
+  YYACTIONTYPE yyact;   /* The parser action. */
+#if !defined(YYERRORSYMBOL) && !defined(YYNOERRORRECOVERY)
+  int yyendofinput;     /* True if we are at the end of input */
+#endif
+#ifdef YYERRORSYMBOL
+  int yyerrorhit = 0;   /* True if yymajor has invoked an error */
+#endif
+  yyParser *yypParser = (yyParser*)yyp;  /* The parser */
+  ParseCTX_FETCH
+  ParseARG_STORE
+
+  assert( yypParser->yytos!=0 );
+#if !defined(YYERRORSYMBOL) && !defined(YYNOERRORRECOVERY)
+  yyendofinput = (yymajor==0);
+#endif
+
+  yyact = yypParser->yytos->stateno;
+#ifndef NDEBUG
+  if( yyTraceFILE ){
+    if( yyact < YY_MIN_REDUCE ){
+      fprintf(yyTraceFILE,"%sInput '%s' in state %d\n",
+              yyTracePrompt,yyTokenName[yymajor],yyact);
+    }else{
+      fprintf(yyTraceFILE,"%sInput '%s' with pending reduce %d\n",
+              yyTracePrompt,yyTokenName[yymajor],yyact-YY_MIN_REDUCE);
+    }
+  }
+#endif
+
+  do{
+    assert( yyact==yypParser->yytos->stateno );
+    yyact = yy_find_shift_action((YYCODETYPE)yymajor,yyact);
+    if( yyact >= YY_MIN_REDUCE ){
+      yyact = yy_reduce(yypParser,yyact-YY_MIN_REDUCE,yymajor,
+                        yyminor ParseCTX_PARAM);
+    }else if( yyact <= YY_MAX_SHIFTREDUCE ){
+      yy_shift(yypParser,yyact,(YYCODETYPE)yymajor,yyminor);
+#ifndef YYNOERRORRECOVERY
+      yypParser->yyerrcnt--;
+#endif
+      break;
+    }else if( yyact==YY_ACCEPT_ACTION ){
+      yypParser->yytos--;
+      yy_accept(yypParser);
+      return;
+    }else{
+      assert( yyact == YY_ERROR_ACTION );
+      yyminorunion.yy0 = yyminor;
+#ifdef YYERRORSYMBOL
+      int yymx;
+#endif
+#ifndef NDEBUG
+      if( yyTraceFILE ){
+        fprintf(yyTraceFILE,"%sSyntax Error!\n",yyTracePrompt);
+      }
+#endif
+#ifdef YYERRORSYMBOL
+      /* A syntax error has occurred.
+      ** The response to an error depends upon whether or not the
+      ** grammar defines an error token "ERROR".  
+      **
+      ** This is what we do if the grammar does define ERROR:
+      **
+      **  * Call the %syntax_error function.
+      **
+      **  * Begin popping the stack until we enter a state where
+      **    it is legal to shift the error symbol, then shift
+      **    the error symbol.
+      **
+      **  * Set the error count to three.
+      **
+      **  * Begin accepting and shifting new tokens.  No new error
+      **    processing will occur until three tokens have been
+      **    shifted successfully.
+      **
+      */
+      if( yypParser->yyerrcnt<0 ){
+        yy_syntax_error(yypParser,yymajor,yyminor);
+      }
+      yymx = yypParser->yytos->major;
+      if( yymx==YYERRORSYMBOL || yyerrorhit ){
+#ifndef NDEBUG
+        if( yyTraceFILE ){
+          fprintf(yyTraceFILE,"%sDiscard input token %s\n",
+             yyTracePrompt,yyTokenName[yymajor]);
+        }
+#endif
+        yy_destructor(yypParser, (YYCODETYPE)yymajor, &yyminorunion);
+        yymajor = YYNOCODE;
+      }else{
+        while( yypParser->yytos >= yypParser->yystack
+            && (yyact = yy_find_reduce_action(
+                        yypParser->yytos->stateno,
+                        YYERRORSYMBOL)) > YY_MAX_SHIFTREDUCE
+        ){
+          yy_pop_parser_stack(yypParser);
+        }
+        if( yypParser->yytos < yypParser->yystack || yymajor==0 ){
+          yy_destructor(yypParser,(YYCODETYPE)yymajor,&yyminorunion);
+          yy_parse_failed(yypParser);
+#ifndef YYNOERRORRECOVERY
+          yypParser->yyerrcnt = -1;
+#endif
+          yymajor = YYNOCODE;
+        }else if( yymx!=YYERRORSYMBOL ){
+          yy_shift(yypParser,yyact,YYERRORSYMBOL,yyminor);
+        }
+      }
+      yypParser->yyerrcnt = 3;
+      yyerrorhit = 1;
+      if( yymajor==YYNOCODE ) break;
+      yyact = yypParser->yytos->stateno;
+#elif defined(YYNOERRORRECOVERY)
+      /* If the YYNOERRORRECOVERY macro is defined, then do not attempt to
+      ** do any kind of error recovery.  Instead, simply invoke the syntax
+      ** error routine and continue going as if nothing had happened.
+      **
+      ** Applications can set this macro (for example inside %include) if
+      ** they intend to abandon the parse upon the first syntax error seen.
+      */
+      yy_syntax_error(yypParser,yymajor, yyminor);
+      yy_destructor(yypParser,(YYCODETYPE)yymajor,&yyminorunion);
+      break;
+#else  /* YYERRORSYMBOL is not defined */
+      /* This is what we do if the grammar does not define ERROR:
+      **
+      **  * Report an error message, and throw away the input token.
+      **
+      **  * If the input token is $, then fail the parse.
+      **
+      ** As before, subsequent error messages are suppressed until
+      ** three input tokens have been successfully shifted.
+      */
+      if( yypParser->yyerrcnt<=0 ){
+        yy_syntax_error(yypParser,yymajor, yyminor);
+      }
+      yypParser->yyerrcnt = 3;
+      yy_destructor(yypParser,(YYCODETYPE)yymajor,&yyminorunion);
+      if( yyendofinput ){
+        yy_parse_failed(yypParser);
+#ifndef YYNOERRORRECOVERY
+        yypParser->yyerrcnt = -1;
+#endif
+      }
+      break;
+#endif
+    }
+  }while( yypParser->yytos>yypParser->yystack );
+#ifndef NDEBUG
+  if( yyTraceFILE ){
+    yyStackEntry *i;
+    char cDiv = '[';
+    fprintf(yyTraceFILE,"%sReturn. Stack=",yyTracePrompt);
+    for(i=&yypParser->yystack[1]; i<=yypParser->yytos; i++){
+      fprintf(yyTraceFILE,"%c%s", cDiv, yyTokenName[i->major]);
+      cDiv = ' ';
+    }
+    fprintf(yyTraceFILE,"]\n");
+  }
+#endif
+  return;
+}
+
+/*
+** Return the fallback token corresponding to canonical token iToken, or
+** 0 if iToken has no fallback.
+*/
+int ParseFallback(int iToken){
+#ifdef YYFALLBACK
+  assert( iToken<(int)(sizeof(yyFallback)/sizeof(yyFallback[0])) );
+  return yyFallback[iToken];
+#else
+  (void)iToken;
+  return 0;
+#endif
+}

+ 0 - 0
src/codec/nt_grammar → src/codec/nt_grammar.y


+ 74 - 38
src/codec/nt_lexer.re

@@ -31,26 +31,20 @@
 
 typedef struct {
     FILE *          file;           // Input file handle.
-    char *          buf,            // Start of buffer.
-         *          lim,            // Position after the last
+    YYCTYPE *       buf,            // Start of buffer.
+            *       lim,            // Position after the last
                                     //   available input character
                                     //   (YYLIMIT)
-         *          cur,            // Next input character to be read
+            *       cur,            // Next input character to be read
                                     //   (YYCURSOR)
-         *          mar,            // Most recent match (YYMARKER)
-         *          tok;            // Start of current token.
+            *       mar,            // Most recent match (YYMARKER)
+            *       tok;            // Start of current token.
     size_t          ct;             // Number of parsed triples.
     int             eof;            // if we have reached EOF (T|F)
-    /*!stags:re2c format = "char *@@;"; */
+    /*!stags:re2c format = "YYCTYPE *@@;"; */
 } Input;
 
 
-typedef struct token_t {
-    int             type;           // Token type (enum).
-    LSUP_Term *     term;           // Token data (e.g. char*,  Term*)
-} Token;
-
-
 static int fill(Input *in)
 {
     if (in->eof) {
@@ -90,7 +84,7 @@ static void init(Input *in, FILE *file)
 static int __attribute__((unused)) extend (Input *in)
 {
     size_t delta = YYLIMIT - in->buf + CHUNK_SIZE;
-    char *tmp = realloc (in->buf, delta);
+    YYCTYPE *tmp = realloc (in->buf, delta);
     if (!tmp) return ENOMEM;
 
     in->lim += delta;
@@ -105,6 +99,55 @@ static void done (Input *in)
 { free (in->buf); }
 
 
+/** @brief Replace \uxxxx and \Uxxxxxxxx with Unicode bytes.
+ */
+static YYCTYPE *unescape_unicode (const YYCTYPE *esc_str, size_t size)
+{
+    YYCTYPE *uc_str = malloc (size + 1);
+
+    size_t j = 0;
+    YYCTYPE tmp_chr[5];
+    for (size_t i = 0; i < size;) {
+        if (memcmp (esc_str + i, "\\u", 2) == 0) {
+            i += 2; // backslash + 'u'
+
+            // Use tmp_chr to hold the hex string representing the code point.
+            memcpy(tmp_chr, esc_str + i, sizeof (tmp_chr) - 1);
+            tmp_chr[4] = '\0';
+
+            uint32_t tmp_val = strtol ((char*)tmp_chr, NULL, 16);
+            TRACE ("tmp_val: %d\n", tmp_val);
+
+            // Reuse tmp_chr to hold the byte values for the code point.
+            int nbytes = utf8_encode (tmp_val, tmp_chr);
+
+            // Copy bytes into destination.
+            memcpy (uc_str + j, tmp_chr, nbytes);
+            TRACE ("UC byte value: %x %x\n", uc_str[j], uc_str[j + 1]);
+
+            j += nbytes;
+            i += 4;
+
+        } else if (memcmp (esc_str + i, "\\U", 2) == 0) {
+            fprintf (
+                    stderr,
+                    "UTF-16 sequence unescaping not yet implemented.\n");
+            return NULL; // TODO encode UTF-16
+        } else {
+            // Copy ASCII char verbatim.
+            uc_str[j++] = esc_str[i++];
+        }
+    }
+
+    YYCTYPE *tmp = realloc (uc_str, j + 1);
+    if (UNLIKELY (!tmp)) return NULL;
+    uc_str = tmp;
+    uc_str[j] = '\0';
+
+    return uc_str;
+}
+
+
 // Parser interface.
 
 void *ParseAlloc();
@@ -116,7 +159,7 @@ void ParseFree();
 
 static int lex (Input *in, LSUP_Term **term)
 {
-    const char *lit_data_e, *dtype_s, *lang_s;
+    const YYCTYPE *lit_data_e, *dtype_s, *lang_s;
 
 loop:
 
@@ -165,45 +208,41 @@ loop:
     }
 
     IRIREF {
-        size_t size = YYCURSOR - in->tok - 1;
-        char *data = malloc (size);
-        memcpy (data, in->tok + 1, size);
-        data [size - 1] = '\0';
+        YYCTYPE *data = unescape_unicode (in->tok + 1, YYCURSOR - in->tok - 2);
 
-        printf ("URI data (%lu): %s\n", strlen (data), data);
+        printf ("URI data: %s\n", data);
 
-        *term = LSUP_uri_new (data);
+        *term = LSUP_uri_new ((char*)data);
         free (data);
 
         return T_IRIREF;
     }
 
     LITERAL {
-        size_t size = lit_data_e - in->tok - 1;
-        char *data = malloc (size);
-        memcpy (data, in->tok + 1, size);
-        data [size -1] = '\0';
-        printf ("Literal data (%lu): %s\n", strlen(data), data);
+        // Only unescape Unicode from data.
+        size_t size = lit_data_e - in->tok - 2;
+        YYCTYPE *data = unescape_unicode (in->tok + 1, size);
+        printf ("Literal data: %s\n", data);
 
-        char *datatype = NULL, *lang = NULL;
+        YYCTYPE *datatype = NULL, *lang = NULL;
 
         if (dtype_s) {
             size = YYCURSOR - dtype_s - 1;
             datatype = malloc (size);
             memcpy (datatype, dtype_s + 1, size);
             datatype [size - 1] = '\0';
-            printf ("datatype (%lu): %s\n", strlen(datatype), datatype);
+            printf ("datatype: %s\n", datatype);
         }
 
         if (lang_s) {
-            size = YYCURSOR - lang_s + 1;
+            size = YYCURSOR - lang_s;
             lang = malloc (size);
-            memcpy (lang, lang_s, size);
+            memcpy (lang, lang_s + 1, size);
             lang [size - 1] = '\0';
-            printf ("lang (%lu): %s\n", strlen (lang), lang);
+            printf ("lang: %s\n", lang);
         }
 
-        *term = LSUP_term_new (LSUP_TERM_LITERAL, data, datatype, lang);
+        *term = LSUP_term_new (LSUP_TERM_LITERAL, (char*)data, (char*)datatype, (char*)lang);
 
         free (data);
         free (datatype);
@@ -213,14 +252,11 @@ loop:
     }
 
     BNODE {
-        size_t size = YYCURSOR - in->tok - 1;
-        char *data = malloc (size);
-        memcpy (data, in->tok + 2, size);
-        data [size - 1] = '\0';
+        YYCTYPE *data = unescape_unicode (in->tok + 2, YYCURSOR - in->tok - 1);
 
-        printf ("BNode data (%lu): %s\n", strlen(data), data);
+        printf ("BNode data: %s\n", data);
 
-        *term = LSUP_term_new (LSUP_TERM_BNODE, data, NULL, NULL);
+        *term = LSUP_term_new (LSUP_TERM_BNODE, (char*)data, NULL, NULL);
         free (data);
 
         return T_BNODE;
@@ -242,7 +278,7 @@ loop:
 
     COMMENT {
         size_t size = YYCURSOR - in->tok + 1;
-        char *data = malloc (size);
+        YYCTYPE *data = malloc (size);
         memcpy (data, in->tok, size);
         data [size - 1] = '\0';
         printf ("Comment: `%s`\n", data);

+ 0 - 15
src/codec/test.nt

@@ -1,15 +0,0 @@
-<http://example.org/show/218> <http://www.w3.org/2000/01/rdf-schema#label> "That Seventies Show"^^<http://www.w3.org/2001/XMLSchema#string> . # literal with XML Schema string datatype
-<http://example.org/show/218> <http://www.w3.org/2000/01/rdf-schema#label> "That Seventies Show" . # same as above
-<http://example.org/show/218> <http://example.org/show/localName> "That Seventies Show"@en . # literal with a language tag
-<http://example.org/show/218> <http://example.org/show/localName> "Cette Série des Années Septante"@fr-be .  # literal outside of ASCII range with a region subtag
-<http://example.org/#spiderman> <http://example.org/text> "This is a multi-line\nliteral with many quotes (\"\"\"\"\")\nand two apostrophes ('')." .
-<http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/atomicNumber> "2"^^<http://www.w3.org/2001/XMLSchema#integer> . # xsd:integer
-<http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/specificGravity> "1.663E-4"^^<http://www.w3.org/2001/XMLSchema#double> .     #xsd:double
-# A comment.
-# Another comment.
-<http://ex.org/longtext> <urn:myns:hasContent> "riverrun, past Eve and Adam's, from swerve of shore to bend of bay, brings us by a commodius vicus of recirculation back to Howth Castle and Environs.\nSir Tristram, violer d'amores, fr'over the short sea, had passencore rearrived from North Armorica on this side the scraggy isthmus of Europe Minor to wielderfight his penisolate war: nor had topsawyer's rocks by the stream Oconee exaggerated themselse to Laurens County's gorgios while they went doublin their mumper all the time: nor avoice from afire bellowsed mishe mishe to tauftauf thuartpeatrick: not yet, though venissoon after, had a kidscad buttended a bland old isaac: not yet, though all's fair in vanessy, were sosie sesthers wroth with twone nathandjoe. Rot a peck of pa's malt had Jhem or Shen brewed by arclight and rory end to the regginbrow was to be seen ringsome on the aquaface.\nThe fall (bababadalgharaghtakamminarronnkonnbronntonnerronntuonnthunntrovarrhounawnskawntoohoohoordenenthurnuk!) of a once wallstrait oldparr is retaled early in bed and later on life down through all christian minstrelsy. The great fall of the offwall entailed at such short notice the pftjschute of Finnegan, erse solid man, that the humptyhillhead of humself prumptly sends an unquiring one well to the west in quest of his tumptytumtoes: and their upturnpikepointandplace is at the knock out in the park where oranges have been laid to rust upon the green since devlinsfirst loved livvy." .
-#<http://example.org/utf8/ქართული ენის შესწავლა და სწავლება> <urn:utf8:語文教學・语文教学> "'læŋɡwidʒ 'lɘr:niŋ ænd 'ti:tʃiŋ" .
-#<http://example.org/utf8/\u202d\u202bללמוד וללמד את ה> <urn:utf8:말배우기와 가르치기> "'læŋɡwidʒ 'lɘr:niŋ ænd 'ti:tʃiŋ" .
-_:alice <http://xmlns.com/foaf/0.1/knows> _:bob .
-_:bob <http://xmlns.com/foaf/0.1/knows> _:alice .
-

+ 1 - 0
src/codec/test2.nt

@@ -2,6 +2,7 @@
 <http://example.org/show/218> <http://www.w3.org/2000/01/rdf-schema#label> "That Seventies Show" . # same as above
 <http://example.org/show/218> <http://example.org/show/localName> "That Seventies Show"@en . # literal with a language tag
 <http://example.org/show/218> <http://example.org/show/localName> "Cette Série des Années Septante"@fr-be .  # literal outside of ASCII range with a region subtag
+<http://example.org/show/\u0126\u014b\u015A> <http://example.org/show/localName> "Test Unicode \u0122 \u012c" .  # <http://example.org/show/ĦŋŚ> [...] "Test Unicode Ģ Ĭ"
 <http://example.org/#spiderman> <http://example.org/text> "This is a multi-line\nliteral with many quotes (\"\"\"\"\")\nand two apostrophes ('')."^^<http://www.w3.org/2001/XMLSchema#string> .
 <http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/atomicNumber> "2"^^<http://www.w3.org/2001/XMLSchema#integer> . # xsd:integer
 <http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/specificGravity> "1.663E-4"^^<http://www.w3.org/2001/XMLSchema#double> .     #xsd:double

+ 1 - 0
src/codec/test2.txt

@@ -0,0 +1 @@
+<http://example.org/show/218> <http://www.w3.org/2000/01/rdf-schema#label> "That Seventies Show"^^<http://www.w3.org/2001/XMLSchema#string> . # literal with XML Schema string datatype <http://example.org/show/218> <http://www.w3.org/2000/01/rdf-schema#label> "That Seventies Show" . # same as above <http://example.org/show/218> <http://example.org/show/localName> "That Seventies Show"@en . # literal with a language tag <http://example.org/show/218> <http://example.org/show/localName> "Cette Série des Années Septante"@fr-be .  # literal outside of ASCII range with a region subtag <http://example.org/#spiderman> <http://example.org/text> "This is a multi-line\nliteral with many quotes (\"\"\"\"\")\nand two apostrophes ('')." .  <http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/atomicNumber> "2"^^<http://www.w3.org/2001/XMLSchema#integer> . # xsd:integer <http://en.wikipedia.org/wiki/Helium> <http://example.org/elements/specificGravity> "1.663E-4"^^<http://www.w3.org/2001/XMLSchema#double> .     #xsd:double # A comment.  # Another comment.

+ 4 - 0
src/core.c

@@ -60,3 +60,7 @@ unlink_cb(
 int rm_r(const char *path)
 { return nftw(path, unlink_cb, 64, FTW_DEPTH | FTW_PHYS); }
 
+
+/* Inline extern functions. */
+
+int utf8_encode(const uint32_t utf, unsigned char *out);

+ 2 - 2
src/term.c

@@ -88,8 +88,8 @@ LSUP_term_init(
         // TODO Cheap fix. Should url-encode all invalid chars.
         if (strpbrk (data, INVALID_URI_CHARS) != NULL) {
             fprintf (
-                    stderr, "Characters %s are not allowed.\n",
-                    INVALID_URI_CHARS);
+                    stderr, "Characters %s are not allowed. Got: %s\n",
+                    INVALID_URI_CHARS, data);
 
             return LSUP_VALUE_ERR;
         }