grammar_ttl.y 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. %include {
  2. /** @brief Lemon parser grammar for Turtle.
  3. *
  4. * The `lemon' parser generator executable must be in your PATH:
  5. * https://sqlite.org/src/doc/trunk/doc/lemon.html
  6. *
  7. * To generate the parser, run: `make parsers'
  8. *
  9. * TTL EBNF: https://www.w3.org/TeamSubmission/turtle/#sec-grammar-grammar
  10. */
  11. #include "codec.h"
  12. }
  13. %name TTLParse
  14. %stack_overflow {
  15. log_error ("Stack oveflow in TTL parsing. Please jettison the parser.");
  16. }
  17. %parse_failure {
  18. log_error ("TTL parse error. Cannot continue.");
  19. }
  20. %stack_size CHUNK_SIZE
  21. /*
  22. %syntax_error {
  23. if (TOKEN[0]) log_error ("near \"%T\": syntax error", &TOKEN);
  24. else log_error ("incomplete input");
  25. }
  26. */
  27. %token_prefix "T_"
  28. %token_type { char * }
  29. %token_destructor { free ($$); }
  30. /* NULL-terminated array of object term handles. */
  31. %type objList { LSUP_Term ** }
  32. %destructor objList {
  33. for (size_t i = 0; $$[i]; i++) {
  34. LSUP_term_free ($$[i]);
  35. }
  36. }
  37. %default_type { char * }
  38. %extra_argument { LSUP_TTLParserState *state }
  39. %left WS .
  40. %left EOS .
  41. %left SEMICOLON .
  42. %left COMMA .
  43. %nonassoc PFX .
  44. %nonassoc COLON .
  45. /*
  46. * Rules.
  47. */
  48. turtleDoc ::= statements EOF .
  49. statements ::= statements statement .
  50. statements ::= .
  51. statement ::= directive ows EOS .
  52. statement ::= triples ows EOS .
  53. directive ::= prefixID .
  54. directive ::= base .
  55. prefixID ::= PREFIX WS PFX(P) ows IRIREF(N) . {
  56. LSUP_nsmap_add (state->nsm, P, N);
  57. }
  58. prefixID ::= PREFIX WS COLON ows IRIREF(N) . {
  59. LSUP_nsmap_add (state->nsm, "", N);
  60. }
  61. base ::= BASE WS IRIREF(D) . {
  62. state->base = LSUP_iriref_new (D, NULL);
  63. }
  64. // WS before predicate is optional because pred has leading WS already.
  65. triples ::= subject(S) predObjList(L) . [EOS] {
  66. size_t ct = LSUP_spo_list_add_triples (state->it, S, L);
  67. state->ct += ct;
  68. log_trace ("Added %lu triples.", ct);
  69. LSUP_term_free (S);
  70. LSUP_pred_obj_list_free (L);
  71. }
  72. %type predObjList { LSUP_PredObjList * }
  73. %destructor predObjList { LSUP_pred_obj_list_free ($$); }
  74. predObjList(A) ::= predicate(P) WS objectList(O) . [SEMICOLON] {
  75. A = LSUP_pred_obj_list_new();
  76. LSUP_pred_obj_list_add (A, P, O);
  77. }
  78. predObjList(A) ::= predObjList(L) SEMICOLON predicate(P) WS objectList(O) . [SEMICOLON] {
  79. LSUP_pred_obj_list_add (L, P, O);
  80. A = L;
  81. }
  82. %type objectList { LSUP_Term ** }
  83. objectList(A) ::= objectList(L) COMMA object(O) . [COMMA] {
  84. A = LSUP_obj_list_add (L, O);
  85. }
  86. objectList(A) ::= object(O) . [COMMA] {
  87. A = calloc (sizeof (*A), 2);
  88. A[0] = O;
  89. }
  90. %type subject { LSUP_Term * }
  91. subject ::= resource .
  92. subject ::= blank .
  93. %type predicate { LSUP_Term * }
  94. // Leading WS is counted as part of the RDF_TYPE ('a') token.
  95. predicate ::= resource .
  96. predicate(A)::= RDF_TYPE . { A = LSUP_iriref_new ("rdf:type", state->nsm); }
  97. %type object { LSUP_Term * }
  98. object ::= resource .
  99. object ::= blank .
  100. object ::= literal .
  101. %type literal { LSUP_Term * }
  102. literal(A) ::= STRING(D) . {
  103. A = LSUP_term_new (LSUP_TERM_LITERAL, D, NULL);
  104. }
  105. literal(A) ::= STRING(D) LANGTAG(L) . {
  106. A = LSUP_term_new (LSUP_TERM_LT_LITERAL, D, L);
  107. }
  108. literal(A) ::= STRING(D) DTYPE_MARKER resource(M) . {
  109. A = LSUP_term_new (LSUP_TERM_LITERAL, D, M);
  110. }
  111. literal(A) ::= INTEGER(D) . {
  112. A = LSUP_term_new (
  113. LSUP_TERM_LITERAL, D,
  114. LSUP_iriref_new ("xsd:integer", state->nsm));
  115. }
  116. literal(A) ::= DOUBLE(D) . {
  117. A = LSUP_term_new (
  118. LSUP_TERM_LITERAL, D,
  119. LSUP_iriref_new ("xsd:double", state->nsm));
  120. }
  121. literal(A) ::= DECIMAL(D) . {
  122. A = LSUP_term_new (
  123. LSUP_TERM_LITERAL, D,
  124. LSUP_iriref_new ("xsd:decimal", state->nsm));
  125. }
  126. literal(A) ::= BOOLEAN(D) . {
  127. A = LSUP_term_new (
  128. LSUP_TERM_LITERAL, D,
  129. LSUP_iriref_new ("xsd:boolean", state->nsm));
  130. }
  131. %type blank { LSUP_Term * }
  132. blank(A) ::= nodeID(D) . {
  133. A = LSUP_term_new (LSUP_TERM_BNODE, D, NULL);
  134. }
  135. blank(A) ::= LBRACKET ows RBRACKET . {
  136. A = LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL);
  137. }
  138. blank(A) ::= LBRACKET ows predObjList(L) ows RBRACKET . {
  139. A = LSUP_term_new (LSUP_TERM_BNODE, NULL, NULL);
  140. state->ct += LSUP_spo_list_add_triples (state->it, A, L);
  141. LSUP_pred_obj_list_free (L);
  142. }
  143. blank ::= collection .
  144. blank(A) ::= LPAREN ows RPAREN . {
  145. A = LSUP_iriref_new ("rdf:nil", state->nsm);
  146. }
  147. // "collection" is the subject of the first collection item.
  148. %type collection { LSUP_Term * }
  149. // Collection triples are added here to the graph.
  150. collection(A) ::= LPAREN ows itemList(L) ows RPAREN . {
  151. A = LSUP_bnode_add_collection (state->it, L);
  152. }
  153. %type itemList { LSUP_Term ** }
  154. // Freed when the item list in the collection gets added to the graph.
  155. %destructor itemList {}
  156. itemList(A) ::= itemList(L) WS object(O) . { A = LSUP_obj_list_add (L, O); }
  157. itemList(A) ::= object(O) . {
  158. A = calloc (sizeof (*A), 2);
  159. A[0] = O;
  160. }
  161. %type resource { LSUP_Term * }
  162. resource(A) ::= IRIREF(D) . {
  163. LSUP_Term *rel_iri = LSUP_iriref_new (D, NULL);
  164. free (D);
  165. if (state->base) {
  166. A = LSUP_iriref_absolute (rel_iri, state->base);
  167. LSUP_term_free (rel_iri);
  168. } else {
  169. A = rel_iri;
  170. }
  171. }
  172. resource(A) ::= qname(D) . { A = LSUP_iriref_new (D, state->nsm); }
  173. qname(A) ::= PFX(P) IDNAME(D) . {
  174. A = malloc (strlen (P) + strlen (D) + 2);
  175. sprintf (A, "%s:%s", P, D);
  176. }
  177. qname(A) ::= COLON IDNAME(D) . {
  178. A = malloc (strlen (D) + 2);
  179. sprintf (A, ":%s", D);
  180. }
  181. qname(A) ::= COLON . { A = strndup (":", 2); }
  182. nodeID(A) ::= BNODE_PFX IDNAME(D) . { A = D; }
  183. ows ::= WS.
  184. ows ::=.
  185. /*
  186. * This has been adapted from
  187. * https://www.w3.org/TeamSubmission/turtle/#sec-grammar-grammar :
  188. [1] turtleDoc ::= statement*
  189. [2] statement ::= directive '.' | triples '.' | ws+
  190. [3] directive ::= prefixID | base
  191. [4] prefixID ::= '@prefix' ws+ prefixName? ':' uriref
  192. [5] base ::= '@base' ws+ uriref
  193. [6] triples ::= subject predicateObjectList
  194. [7] predicateObjectList ::= verb objectList ( ';' verb objectList )* ( ';')?
  195. [8] objectList ::= object ( ',' object)*
  196. [9] verb ::= predicate | 'a'
  197. [10] comment ::= '#' ( [^#xA#xD] )*
  198. [11] subject ::= resource | blank
  199. [12] predicate ::= resource
  200. [13] object ::= resource | blank | literal
  201. [14] literal ::= quotedString ( '@' language )? | datatypeString | integer | double | decimal | boolean
  202. [15] datatypeString ::= quotedString '^^' resource
  203. [16] integer ::= ('-' | '+') ? [0-9]+
  204. [17] double ::= ('-' | '+') ? ( [0-9]+ '.' [0-9]* exponent | '.' ([0-9])+ exponent | ([0-9])+ exponent )
  205. [18] decimal ::= ('-' | '+')? ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ )
  206. [19] exponent ::= [eE] ('-' | '+')? [0-9]+
  207. [20] boolean ::= 'true' | 'false'
  208. [21] blank ::= nodeID | '[]' | '[' predicateObjectList ']' | collection
  209. [22] itemList ::= object+
  210. [23] collection ::= '(' itemList? ')'
  211. [24] ws ::= #x9 | #xA | #xD | #x20 | comment
  212. [25] resource ::= uriref | qname
  213. [26] nodeID ::= '_:' name
  214. [27] qname ::= prefixName? ':' name?
  215. [28] uriref ::= '<' relativeURI '>'
  216. [29] language ::= [a-z]+ ('-' [a-z0-9]+ )*
  217. [30] nameStartChar ::= [A-Z] | "_" | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
  218. [31] nameChar ::= nameStartChar | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
  219. [32] name ::= nameStartChar nameChar*
  220. [33] prefixName ::= ( nameStartChar - '_' ) nameChar*
  221. [34] relativeURI ::= ucharacter*
  222. [35] quotedString ::= string | longString
  223. [36] string ::= #x22 scharacter* #x22
  224. [37] longString ::= #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
  225. [38] character ::= '\u' hex hex hex hex |
  226. '\U' hex hex hex hex hex hex hex hex |
  227. '\\' |
  228. [#x20-#x5B] | [#x5D-#x10FFFF]
  229. [39] echaracter ::= character | '\t' | '\n' | '\r'
  230. [40] hex ::= [#x30-#x39] | [#x41-#x46]
  231. [41] ucharacter ::= ( character - #x3E ) | '\>'
  232. [42] scharacter ::= ( echaracter - #x22 ) | '\"'
  233. [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD
  234. */