grammar_ttl.y 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. %include {
  2. /** @brief Lemon parser grammar for Turtle.
  3. *
  4. * The `lemon' parser generator executable must be in your PATH:
  5. * https://sqlite.org/src/doc/trunk/doc/lemon.html
  6. *
  7. * To generate the parser, run: `make parsers'
  8. *
  9. * TTL EBNF: https://www.w3.org/TeamSubmission/turtle/#sec-grammar-grammar
  10. */
  11. #include "codec.h"
  12. }
  /* Name prefix used for the generated parser entry points. */
  %name TTLParse

  /* Parser stack exhausted: log it and flag a memory error on the state. */
  %stack_overflow {
      log_error ("Stack overflow in TTL parsing.");
      state->rc = LSUP_MEM_ERR;
  }

  /* Unrecoverable failure: log it and flag a parse error on the state. */
  %parse_failure {
      log_error ("TTL parse error. Cannot continue.");
      state->rc = LSUP_PARSE_ERR;
  }

  /* Syntax errors are only logged here; state->rc is left untouched. */
  %syntax_error {
      log_warn ("Syntax error. Attempting recovery.");
  }

  /* Parser stack depth; CHUNK_SIZE is defined outside this file. */
  %stack_size CHUNK_SIZE
  26. /*
  27. %syntax_error {
  28. if (TOKEN[0]) log_error ("near \"%T\": syntax error", &TOKEN);
  29. else log_error ("incomplete input");
  30. }
  31. */
  /* Token codes in the generated header get a T_ prefix. */
  %token_prefix "T_"

  /*
   * Every terminal carries a char * value that is released with free()
   * whenever the parser itself discards the token.
   */
  %token_type { char * }
  %token_destructor { free ($$); }

  /*
   * Object lists are NULL-terminated arrays of object term handles.
   *
   * The destructor is attached to `objectList`, which is the nonterminal the
   * rules actually use (its %type is declared next to its rules); it used to
   * be declared for `objList`, a symbol no rule references, so it never ran.
   * It releases every term and then the array itself when the parser drops
   * an object list that no action consumed (e.g. on a parse failure).
   */
  %destructor objectList {
      if ($$) {
          for (size_t i = 0; $$[i]; i++) {
              LSUP_term_free ($$[i]);
          }
          free ($$);
      }
  }

  /* Nonterminals without an explicit %type carry a string. */
  %default_type { char * }

  /* Parser state handed to every action as `state`. */
  %extra_argument { LSUP_TTLParserState *state }

  /* Precedence declarations, from lowest to highest. */
  %left WS .
  %left EOS .
  %left SEMICOLON .
  %left COMMA .
  %nonassoc PFX .
  %nonassoc COLON .
  50. /*
  51. * Rules.
  52. */
  /* Start symbol: a possibly empty run of statements followed by EOF. */
  turtleDoc   ::= statements EOF .

  /* Left-recursive statement list; the empty alternative ends the recursion. */
  statements  ::= statements statement .
  statements  ::= .

  /* A statement is either a directive or a block of triples. */
  statement   ::= directive .
  statement   ::= triples .

  /* Both directive forms must be followed by an EOS token. */
  directive   ::= prefixID EOS .
  directive   ::= base EOS .
  /*
   * Prefix declaration with a named prefix: register prefix P for
   * namespace N in the parser state's namespace map.
   *
   * Lemon does not run the token destructor on labelled symbols that the
   * action uses, so the token strings are released here.
   * NOTE(review): assumes LSUP_nsmap_add copies both strings (the sibling
   * rule passes a string literal as the prefix) - confirm.
   */
  prefixID ::= PREFIX PFX(P) IRIREF(N) . {
      LSUP_nsmap_add (state->nsm, P, N);
      free (P);
      free (N);
  }

  /* Prefix declaration with an empty prefix (a bare colon). */
  prefixID ::= PREFIX COLON IRIREF(N) . {
      LSUP_nsmap_add (state->nsm, "", N);
      free (N);
  }
  /*
   * Base directive: store the IRI as the base kept in the parser state.
   *
   * The token string is released after use, as the IRIREF resource rule
   * does after its own LSUP_iriref_new call.
   * NOTE(review): a base set by an earlier directive is overwritten without
   * being released here - confirm who owns state->base.
   */
  base ::= BASE IRIREF(D) . {
      state->base = LSUP_iriref_new (D, NULL);
      free (D);
  }
  /*
   * One subject with its predicate-object list, closed by EOS.
   *
   * Adds the triples through state->it, accumulates the running count in
   * state->ct, marks the state as OK, then releases the subject term and
   * the predicate-object list.
   */
  triples ::= subject(S) predObjList(L) EOS . {
      size_t ct = LSUP_spo_list_add_triples (state->it, S, L);
      state->ct += ct;
      state->rc = LSUP_OK;
      /* ct is a size_t, so it must be printed with %zu rather than %lu. */
      log_trace ("Added %zu triples.", ct);

      LSUP_term_free (S);
      LSUP_pred_obj_list_free (L);
  }
  %type predObjList { LSUP_PredObjList * }
  %destructor predObjList { LSUP_pred_obj_list_free ($$); }

  /* The first predicate / object-list pair opens a new list. */
  predObjList(A) ::= predicate(P) objectList(O) . [SEMICOLON] {
      A = LSUP_pred_obj_list_new();
      LSUP_pred_obj_list_add (A, P, O);
  }

  /* Each further pair, introduced by a semicolon, is added to the list. */
  predObjList(A) ::= predObjList(L) SEMICOLON predicate(P) objectList(O) . {
      LSUP_pred_obj_list_add (L, P, O);
      A = L;
  }

  /*
   * Trailing semicolon after the last pair, as allowed by the Turtle
   * predicateObjectList production.
   *
   * This replaces the former `predicate objectList SEMICOLON` rule: the
   * first rule above has the precedence of the left-associative SEMICOLON,
   * so after `predicate objectList` a SEMICOLON lookahead always resolved
   * to a reduce, and the trailing-semicolon rule could never be completed.
   */
  predObjList(A) ::= predObjList(L) SEMICOLON . {
      A = L;
  }
  %type objectList { LSUP_Term ** }

  /* Append one more comma-separated object to an existing list. */
  objectList(A) ::= objectList(L) COMMA object(O) . [COMMA] {
      A = LSUP_obj_list_add (L, O);
  }

  /*
   * A single object starts a new list: two zeroed slots, the first holding
   * the object and the second left NULL as the terminator.
   */
  objectList(A) ::= object(O) . [COMMA] {
      /* calloc takes the element count first, then the element size. */
      A = calloc (2, sizeof (*A));
      A[0] = O;
  }
  /* A subject is a resource or a blank node. */
  %type subject { LSUP_Term * }
  subject     ::= resource .
  subject     ::= blank .

  /* A predicate is a resource or the RDF_TYPE token. */
  %type predicate { LSUP_Term * }
  // Leading WS is counted as part of the RDF_TYPE ('a') token.
  predicate   ::= resource .
  predicate(A) ::= RDF_TYPE . {
      A = LSUP_iriref_new ("rdf:type", state->nsm);
  }

  /* An object is a resource, a blank node or a literal. */
  %type object { LSUP_Term * }
  object      ::= resource .
  object      ::= blank .
  object      ::= literal .
  /*
   * Literal terms.
   *
   * Lemon does not run the token destructor on labelled symbols that the
   * action uses, so every rule releases its token strings once the term
   * has been built.
   * NOTE(review): assumes LSUP_term_new copies its string arguments, as
   * LSUP_iriref_new evidently does in the IRIREF resource rule, which frees
   * its token right after the call - confirm.
   */
  %type literal { LSUP_Term * }

  /* Plain string literal. */
  literal(A) ::= STRING(D) . {
      A = LSUP_term_new (LSUP_TERM_LITERAL, D, NULL);
      free (D);
  }

  /* Language-tagged string literal. */
  literal(A) ::= STRING(D) LANGTAG(L) . {
      A = LSUP_term_new (LSUP_TERM_LT_LITERAL, D, L);
      free (D);
      free (L);
  }

  /* String literal with an explicit datatype resource. */
  literal(A) ::= STRING(D) DTYPE_MARKER resource(M) . {
      A = LSUP_term_new (LSUP_TERM_LITERAL, D, M);
      free (D);
  }

  /* Numeric and boolean shorthands, typed with the matching xsd datatype. */
  literal(A) ::= INTEGER(D) . {
      A = LSUP_term_new (
              LSUP_TERM_LITERAL, D,
              LSUP_iriref_new ("xsd:integer", state->nsm));
      free (D);
  }
  literal(A) ::= DOUBLE(D) . {
      A = LSUP_term_new (
              LSUP_TERM_LITERAL, D,
              LSUP_iriref_new ("xsd:double", state->nsm));
      free (D);
  }
  literal(A) ::= DECIMAL(D) . {
      A = LSUP_term_new (
              LSUP_TERM_LITERAL, D,
              LSUP_iriref_new ("xsd:decimal", state->nsm));
      free (D);
  }
  literal(A) ::= BOOLEAN(D) . {
      A = LSUP_term_new (
              LSUP_TERM_LITERAL, D,
              LSUP_iriref_new ("xsd:boolean", state->nsm));
      free (D);
  }
  %type blank { LSUP_Term * }

  /*
   * Labelled blank node. The token string is released after the term is
   * built, because Lemon leaves labelled tokens to the action.
   * NOTE(review): assumes LSUP_term_new copies the label - confirm.
   */
  blank(A) ::= BNODE_ID(D) . {
      A = LSUP_term_new (LSUP_TERM_BNODE, D, NULL);
      free (D);
  }

  /* Empty brackets: a blank node created without a label. */
  blank(A) ::= LBRACKET RBRACKET . {
      A = LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL);
  }

  /*
   * Bracketed predicate-object list: a new blank node becomes the subject
   * of the enclosed pairs, whose triples are added and counted right away.
   */
  blank(A) ::= LBRACKET predObjList(L) RBRACKET . {
      A = LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL);
      state->ct += LSUP_spo_list_add_triples (state->it, A, L);
      LSUP_pred_obj_list_free (L);
  }

  blank ::= collection .

  /* Empty collection: the rdf:nil IRI. */
  blank(A) ::= LPAREN RPAREN . {
      A = LSUP_iriref_new ("rdf:nil", state->nsm);
  }
  // "collection" is the subject of the first collection item.
  %type collection { LSUP_Term * }
  // Collection triples are added here to the graph.
  collection(A) ::= LPAREN itemList(L) RPAREN . {
      A = LSUP_bnode_add_collection (state->it, L);
  }

  %type itemList { LSUP_Term ** }
  // Freed when the item list in the collection gets added to the graph.
  %destructor itemList {}

  /* Append one more object to an existing item list. */
  itemList(A) ::= itemList(L) object(O) . {
      A = LSUP_obj_list_add (L, O);
  }

  /*
   * A single object starts a new item list: two zeroed slots, the first
   * holding the object and the second left NULL.
   */
  itemList(A) ::= object(O) . {
      /* calloc takes the element count first, then the element size. */
      A = calloc (2, sizeof (*A));
      A[0] = O;
  }
  %type resource { LSUP_Term * }

  /*
   * IRI reference: build an IRI term from the token, then, when a base is
   * set in the parser state, replace it with its absolute form.
   */
  resource(A) ::= IRIREF(D) . {
      LSUP_Term *rel_iri = LSUP_iriref_new (D, NULL);
      free (D);
      if (state->base) {
          A = LSUP_iriref_absolute (rel_iri, state->base);
          LSUP_term_free (rel_iri);
      } else {
          A = rel_iri;
      }
  }

  /*
   * Qualified name: build an IRI term using the namespace map. The qname
   * string is heap-allocated by the qname rules and has no destructor, so
   * it is released here, as the rule above does with its token.
   */
  resource(A) ::= qname(D) . {
      A = LSUP_iriref_new (D, state->nsm);
      free (D);
  }
  /*
   * Qualified names are assembled into a newly allocated string.
   * The lexer tokens are labelled and used by the actions, so Lemon does
   * not destroy them; each rule frees them once they have been copied.
   */

  /* Prefix and local name joined by a colon; +2 covers the colon and NUL. */
  qname(A) ::= PFX(P) IDNAME(D) . {
      A = malloc (strlen (P) + strlen (D) + 2);
      sprintf (A, "%s:%s", P, D);
      free (P);
      free (D);
  }

  /* Local name under the empty prefix; +2 covers the colon and NUL. */
  qname(A) ::= COLON IDNAME(D) . {
      A = malloc (strlen (D) + 2);
      sprintf (A, ":%s", D);
      free (D);
  }

  /* A bare colon: the empty prefix with an empty local name. */
  qname(A) ::= COLON . { A = strndup (":", 2); }
  191. /*
  192. * This has been adapted from
  193. * https://www.w3.org/TeamSubmission/turtle/#sec-grammar-grammar :
  194. [1] turtleDoc ::= statement*
  195. [2] statement ::= directive '.' | triples '.' | ws+
  196. [3] directive ::= prefixID | base
  197. [4] prefixID ::= '@prefix' ws+ prefixName? ':' uriref
  198. [5] base ::= '@base' ws+ uriref
  199. [6] triples ::= subject predicateObjectList
  200. [7] predicateObjectList ::= verb objectList ( ';' verb objectList )* ( ';')?
  201. [8] objectList ::= object ( ',' object)*
  202. [9] verb ::= predicate | 'a'
  203. [10] comment ::= '#' ( [^#xA#xD] )*
  204. [11] subject ::= resource | blank
  205. [12] predicate ::= resource
  206. [13] object ::= resource | blank | literal
  207. [14] literal ::= quotedString ( '@' language )? | datatypeString | integer | double | decimal | boolean
  208. [15] datatypeString ::= quotedString '^^' resource
  209. [16] integer ::= ('-' | '+') ? [0-9]+
  210. [17] double ::= ('-' | '+') ? ( [0-9]+ '.' [0-9]* exponent | '.' ([0-9])+ exponent | ([0-9])+ exponent )
  211. [18] decimal ::= ('-' | '+')? ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ )
  212. [19] exponent ::= [eE] ('-' | '+')? [0-9]+
  213. [20] boolean ::= 'true' | 'false'
  214. [21] blank ::= nodeID | '[]' | '[' predicateObjectList ']' | collection
  215. [22] itemList ::= object+
  216. [23] collection ::= '(' itemList? ')'
  217. [24] ws ::= #x9 | #xA | #xD | #x20 | comment
  218. [25] resource ::= uriref | qname
  219. [26] nodeID ::= '_:' name
  220. [27] qname ::= prefixName? ':' name?
  221. [28] uriref ::= '<' relativeURI '>'
  222. [29] language ::= [a-z]+ ('-' [a-z0-9]+ )*
  223. [30] nameStartChar ::= [A-Z] | "_" | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
  224. [31] nameChar ::= nameStartChar | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
  225. [32] name ::= nameStartChar nameChar*
  226. [33] prefixName ::= ( nameStartChar - '_' ) nameChar*
  227. [34] relativeURI ::= ucharacter*
  228. [35] quotedString ::= string | longString
  229. [36] string ::= #x22 scharacter* #x22
  230. [37] longString ::= #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
  231. [38] character ::= '\u' hex hex hex hex |
  232. '\U' hex hex hex hex hex hex hex hex |
  233. '\\' |
  234. [#x20-#x5B] | [#x5D-#x10FFFF]
  235. [39] echaracter ::= character | '\t' | '\n' | '\r'
  236. [40] hex ::= [#x30-#x39] | [#x41-#x46]
  237. [41] ucharacter ::= ( character - #x3E ) | '\>'
  238. [42] scharacter ::= ( echaracter - #x22 ) | '\"'
  239. [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD
  240. */