codec_ttl.c 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. #include "volksdata/codec/codec_ttl.h"
  2. /** @brief NT codec iterator.
  3. *
  4. * This iterator yields one or more triples at a time, one group per subject,
  5. * with the most compact form allowed by Turtle, e.g.
  6. *
  7. * ```
  8. * :s :p1 :o1, :o2, o3; :p2 :o4, :o5, <http://example.com/ext1> .
  9. * ```
  10. */
  11. typedef struct {
  12. const VOLK_Codec * codec; ///< Codec that generated this iterator.
  13. const VOLK_Graph * gr; ///< Graph being encoded.
  14. VOLK_TermSet * subjects; ///< All subjects in the graph.
  15. size_t s_cur; ///< Term set cursor.
  16. VOLK_rc rc; ///< Internal return code.
  17. char * s_str; ///< Serialized subject block (output).
  18. char * p_str; ///< Serialized predicate block.
  19. char * o_str; ///< Serialized object block.
  20. } VOLK_TTLCodecIterator;
  21. /* * * Codec functions. * * */
  22. static VOLK_rc
  23. term_to_ttl (const VOLK_Term *term, char **out_p)
  24. {
  25. VOLK_rc rc;
  26. char
  27. *tmp = NULL,
  28. *out = NULL,
  29. *metadata = NULL;
  30. size_t buf_len;
  31. VOLK_rc md_rc = VOLK_NORESULT;
  32. switch (term->type) {
  33. case VOLK_TERM_IRIREF:
  34. if (strcmp (term->data, VOLK_RDF_TYPE) == 0) {
  35. // Shorten RDF type
  36. buf_len = 2;
  37. out = realloc (*out_p, 2);
  38. if (UNLIKELY (!out)) return VOLK_MEM_ERR;
  39. out[0] = 'a';
  40. out[1] = '\0';
  41. } else {
  42. md_rc = VOLK_nsmap_denormalize_uri (term->data, &tmp);
  43. PRCCK (md_rc);
  44. if (md_rc == VOLK_NORESULT) {
  45. // If URI counld not be shortened, add `<>`
  46. // and copy term from the original.
  47. out = realloc (*out_p, strlen (term->data) + 3);
  48. if (UNLIKELY (!out)) return VOLK_MEM_ERR;
  49. sprintf (out, "<%s>", term->data);
  50. } else {
  51. // If URI was shortened, write it out without `<>` and
  52. // use previously allocated data from denormalization.
  53. // Free previous output pointer
  54. free (*out_p);
  55. out = tmp;
  56. }
  57. }
  58. rc = VOLK_OK;
  59. break;
  60. case VOLK_TERM_LITERAL:
  61. // Calculate string length.
  62. if (escape_lit (term->data, &tmp) != VOLK_OK)
  63. return VOLK_ERROR;
  64. buf_len = strlen (tmp) + 3; // Room for "" and terminator
  65. // Data type.
  66. bool shorten = false;
  67. if (
  68. term->datatype != 0
  69. && term->datatype != VOLK_default_datatype
  70. ) {
  71. md_rc = VOLK_nsmap_denormalize_uri (
  72. term->datatype->data, &metadata);
  73. PRCCK (md_rc);
  74. unsigned padding = 0;
  75. // Shorten numeric and boolean types.
  76. if (strcmp (metadata, "xsd:integer") == 0) {
  77. // TODO check for valid format.
  78. shorten = true;
  79. } else if (strcmp (metadata, "xsd:double") == 0) {
  80. // TODO check for valid format.
  81. shorten = true;
  82. } else if (strcmp (metadata, "xsd:decimal") == 0) {
  83. // TODO check for valid format.
  84. shorten = true;
  85. } else if (strcmp (metadata, "xsd:boolean") == 0) {
  86. // TODO check for valid format.
  87. shorten = true;
  88. } else {
  89. // Room for `^^<>` for FQURI, `^^` for NS URI
  90. padding = md_rc == VOLK_NORESULT ? 4 : 2;
  91. }
  92. buf_len += strlen (metadata) + padding;
  93. }
  94. out = realloc (*out_p, buf_len);
  95. if (UNLIKELY (!out)) return VOLK_MEM_ERR;
  96. if (shorten) {
  97. strcpy (out, tmp);
  98. } else if (metadata) {
  99. char *fmt = (
  100. md_rc == VOLK_NORESULT ? "\"%s\"^^<%s>"
  101. : "\"%s\"^^%s");
  102. sprintf (out, fmt, tmp, metadata);
  103. }
  104. else {
  105. sprintf (out, "\"%s\"", tmp);
  106. }
  107. free (tmp);
  108. rc = VOLK_OK;
  109. break;
  110. case VOLK_TERM_LT_LITERAL:
  111. // Calculate string length.
  112. if (escape_lit (term->data, &tmp) != VOLK_OK)
  113. return VOLK_ERROR;
  114. buf_len = strlen (tmp) + 3; // Room for "" and terminator
  115. if (term->lang[0] != '\0') {
  116. metadata = strndup (term->lang, sizeof (VOLK_LangTag));
  117. buf_len += strlen (metadata) + 1; // Room for @
  118. }
  119. out = realloc (*out_p, buf_len);
  120. if (UNLIKELY (!out)) return VOLK_MEM_ERR;
  121. sprintf (out, "\"%s\"", tmp);
  122. free (tmp);
  123. // Add lang.
  124. if (metadata) out = strcat (strcat (out, "@"), metadata);
  125. rc = VOLK_OK;
  126. break;
  127. case VOLK_TERM_BNODE:
  128. out = realloc (*out_p, strlen (term->data) + 3);
  129. if (UNLIKELY (!out)) return VOLK_MEM_ERR;
  130. sprintf (out, "_:%s", term->data);
  131. rc = VOLK_OK;
  132. break;
  133. default:
  134. out = *out_p; // This is considered garbage.
  135. log_error ("Invalid term type: %d", term->type);
  136. rc = VOLK_PARSE_ERR;
  137. }
  138. free (metadata);
  139. *out_p = out;
  140. return rc;
  141. }
  142. static void *
  143. gr_to_ttl_init (const VOLK_Graph *gr)
  144. {
  145. VOLK_TTLCodecIterator *it;
  146. CALLOC_GUARD (it, NULL);
  147. it->codec = &ttl_codec;
  148. it->gr = gr;
  149. it->subjects = VOLK_graph_unique_terms (gr, TRP_POS_S);
  150. // Sets the condition to build the prolog on 1st iteration.
  151. it->rc = VOLK_NORESULT;
  152. return it;
  153. }
  154. /// Build header and prolog.
  155. static VOLK_rc
  156. build_prolog (VOLK_TTLCodecIterator *it, char **res_p)
  157. {
  158. char *res = fmt_header ("# ");
  159. const char ***nsm = VOLK_nsmap_dump ();
  160. char *ns_tpl = "@prefix %s: <%s> .\n";
  161. // Prefix map.
  162. for (size_t i = 0; nsm[i]; i++) {
  163. const char **ns = nsm[i];
  164. size_t old_len = strlen (res);
  165. size_t ns_len = strlen (ns[0]) + strlen (ns[1]) + strlen (ns_tpl);
  166. char *tmp = realloc (res, old_len + ns_len + 1);
  167. if (UNLIKELY (!tmp)) return VOLK_MEM_ERR;
  168. res = tmp;
  169. sprintf (res + old_len, ns_tpl, ns[0], ns[1]);
  170. free (ns);
  171. }
  172. free (nsm);
  173. // Base.
  174. char *base_uri_str = VOLK_graph_uri (it->gr)->data;
  175. char *base_stmt_tpl = "\n@base <%s> .\n\n";
  176. char *base_stmt = malloc (
  177. strlen (base_stmt_tpl) + strlen (base_uri_str) + 1);
  178. if (!UNLIKELY (base_stmt)) return VOLK_MEM_ERR;
  179. sprintf (base_stmt, base_stmt_tpl, base_uri_str);
  180. res = realloc (res, strlen (res) + strlen (base_stmt) + 1);
  181. if (!UNLIKELY (res)) return VOLK_MEM_ERR;
  182. res = strcat (res, base_stmt);
  183. free (base_stmt);
  184. *res_p = res;
  185. it->rc = VOLK_OK;
  186. return VOLK_OK;
  187. }
  188. /// Encode all the triples for a single subject.
  189. static VOLK_rc
  190. gr_to_ttl_iter (void *h, char **res_p) {
  191. VOLK_TTLCodecIterator *it = h;
  192. if (it->rc == VOLK_NORESULT) return build_prolog (it, res_p);
  193. VOLK_Term *s = NULL;
  194. char *res = *res_p; // Result string will be reallocated.
  195. VOLK_rc rc = VOLK_term_set_next (it->subjects, &it->s_cur, &s);
  196. if (rc == VOLK_END) return rc; // Return without logging error.
  197. RCCK (rc); // Log error or warning for anything else.
  198. term_to_ttl (s, &res);
  199. VOLK_LinkMap *lmap = VOLK_graph_connections (
  200. it->gr, s, VOLK_LINK_OUTBOUND);
  201. VOLK_LinkMapIterator *lmit = VOLK_link_map_iter_new (lmap);
  202. VOLK_Term *p = NULL;
  203. VOLK_TermSet *o_ts = NULL;
  204. char *p_join = "\n "; // Newline & indent after subject.
  205. // Begin predicate loop.
  206. while (VOLK_link_map_next (lmit, &p, &o_ts) != VOLK_END) {
  207. // Add predicate representation.
  208. RCCK (term_to_ttl (p, &it->p_str));
  209. char *tmp = realloc (
  210. res, strlen (res) + strlen (it->p_str) + strlen (p_join) + 1);
  211. if (UNLIKELY (!tmp)) goto memfail;
  212. res = strcat (strcat (tmp, p_join), it->p_str);
  213. p_join = " ;\n ";
  214. // Add objects for predicate.
  215. size_t i = 0;
  216. VOLK_Term *o = NULL;
  217. char *o_join = " ";
  218. while (VOLK_term_set_next (o_ts, &i, &o) != VOLK_END) {
  219. it->rc = term_to_ttl (o, &it->o_str);
  220. RCCK (it->rc);
  221. char *tmp = realloc (
  222. res, strlen (res) + strlen (it->o_str) + strlen (o_join) + 1);
  223. if (UNLIKELY (!tmp)) goto memfail;
  224. res = strcat (strcat (tmp, o_join), it->o_str);
  225. o_join = " ,\n "; // Double indent for objects.
  226. }
  227. }
  228. char *s_sep = "\n.\n\n"; // Period goes on its own line for visibility.
  229. char *tmp = realloc (res, strlen (res) + strlen (s_sep) + 1);
  230. if (UNLIKELY (!tmp)) goto memfail;
  231. *res_p = strcat (tmp, s_sep);
  232. VOLK_link_map_iter_free (lmit);
  233. VOLK_link_map_free (lmap);
  234. return it->rc;
  235. memfail:
  236. free (res);
  237. *res_p = NULL;
  238. return VOLK_MEM_ERR;
  239. }
  240. static void
  241. gr_to_ttl_done (void *h)
  242. {
  243. VOLK_TTLCodecIterator *it = h;
  244. VOLK_term_set_free (it->subjects);
  245. free (it->s_str);
  246. free (it->p_str);
  247. free (it->o_str);
  248. free (it);
  249. }
  250. const VOLK_Codec ttl_codec = {
  251. .name = "Turtle",
  252. .mimetype = "text/turtle",
  253. .extension = "ttl",
  254. .encode_term = term_to_ttl,
  255. .encode_graph_init = gr_to_ttl_init,
  256. .encode_graph_iter = gr_to_ttl_iter,
  257. .encode_graph_done = gr_to_ttl_done,
  258. //.decode_term = VOLK_ttl_parse_term,
  259. .decode_graph = VOLK_ttl_parse_doc,
  260. };