codec_nt.c 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. #include "codec_nt.h"
  2. #include "nt_parser.h"
  3. /** @brief List of characters to be escaped in serialized literals.
  4. *
  5. * https://www.w3.org/TR/n-triples/#grammar-production-ECHAR
  6. */
  7. #define LIT_ECHAR "\t\b\n\r\f\"\'\\"
  8. /** @brief Regex of characters to be escaped in serialized IRIs.
  9. *
  10. * https://www.w3.org/TR/n-triples/#grammar-production-IRIREF
  11. */
  12. #define IRI_ECHAR_PTN "[\x00-\x20<>\"\\{\\}\\|\\^`\\\\]"
  13. /** @brief Default NT literal type.
  14. */
  15. #define XSD_STRING "http://www.w3.org/2001/XMLSchema#string"
  16. /* * * Static prototypes. * * */
  17. static LSUP_rc escape_lit (const char *in, char **out_p);
  18. /* * * Codec functions. * * */
  19. static LSUP_rc
  20. term_to_nt (const LSUP_Term *term, const LSUP_NSMap *nsm, char **out_p)
  21. {
  22. LSUP_rc rc;
  23. char *out = NULL, *tmp, *escaped;
  24. size_t buf_len;
  25. // Free previous content if not NULL.
  26. if (*out_p != NULL) out = realloc (*out_p, 0);
  27. switch (term->type) {
  28. case LSUP_TERM_URI:
  29. tmp = realloc (out, strlen (term->data) + 3);
  30. if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
  31. out = tmp;
  32. sprintf (out, "<%s>", term->data);
  33. rc = LSUP_OK;
  34. break;
  35. case LSUP_TERM_LITERAL:
  36. // Calculate string length.
  37. if (escape_lit (term->data, &escaped) != LSUP_OK)
  38. return LSUP_ERROR;
  39. buf_len = strlen (escaped) + 3; // Room for "" and terminator
  40. if (term->datatype && strcmp (term->datatype, XSD_STRING) != 0)
  41. buf_len += strlen (term->datatype) + 4; // Room for ^^<>
  42. if (strlen (term->lang) > 0)
  43. buf_len += strlen(term->lang) + 1; // Room for @
  44. //TRACE ("nt rep length: %lu\n", buf_len);
  45. tmp = realloc (out, buf_len);
  46. if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
  47. out = tmp;
  48. sprintf (out, "\"%s\"", escaped);
  49. free (escaped);
  50. // Always suppress xsd:string data type.
  51. if (term->datatype && strcmp (term->datatype, XSD_STRING) != 0)
  52. out = strcat (strcat (strcat (out, "^^<"), term->datatype), ">");
  53. if (strlen (term->lang) > 0)
  54. out = strcat (strcat (out, "@"), term->lang);
  55. rc = LSUP_OK;
  56. break;
  57. case LSUP_TERM_BNODE:
  58. tmp = realloc (out, strlen (term->data) + 3);
  59. if (UNLIKELY (!tmp)) return LSUP_MEM_ERR;
  60. out = tmp;
  61. sprintf (out, "_:%s", term->data);
  62. rc = LSUP_OK;
  63. break;
  64. default:
  65. out = NULL;
  66. rc = LSUP_VALUE_ERR;
  67. }
  68. *out_p = out;
  69. return rc;
  70. }
  71. static LSUP_CodecIterator *
  72. gr_to_nt_init (const LSUP_Graph *gr);
  73. static LSUP_rc
  74. gr_to_nt_iter (LSUP_CodecIterator *it, unsigned char **res) {
  75. LSUP_rc rc = LSUP_graph_iter_next (it->gr_it, it->trp);
  76. if (rc != LSUP_OK) return rc;
  77. term_to_nt (it->trp->s, it->nsm, &it->str_s);
  78. term_to_nt (it->trp->p, it->nsm, &it->str_p);
  79. term_to_nt (it->trp->o, it->nsm, &it->str_o);
  80. // 3 term separators + dot + newline + terminal = 6
  81. unsigned char *tmp = realloc (
  82. *res, strlen (it->str_s) + strlen (it->str_p)
  83. + strlen (it->str_o) + 6);
  84. if (UNLIKELY (!tmp)) {
  85. *res = NULL;
  86. return LSUP_MEM_ERR;
  87. }
  88. sprintf ((char*)tmp, "%s %s %s .\n", it->str_s, it->str_p, it->str_o);
  89. *res = tmp;
  90. it->cur++;
  91. return LSUP_OK;
  92. }
  93. static void
  94. gr_to_nt_done (LSUP_CodecIterator *it)
  95. {
  96. LSUP_graph_iter_free (it->gr_it);
  97. LSUP_triple_free (it->trp);
  98. free (it->rep);
  99. free (it->str_s);
  100. free (it->str_p);
  101. free (it->str_o);
  102. free (it);
  103. }
  104. const LSUP_Codec nt_codec = {
  105. .name = "N-Triples",
  106. .mimetype = "application/n-triples",
  107. .extension = "nt",
  108. .encode_term = term_to_nt,
  109. .encode_graph_init = gr_to_nt_init,
  110. .encode_graph_iter = gr_to_nt_iter,
  111. .encode_graph_done = gr_to_nt_done,
  112. .decode_term = LSUP_nt_parse_term,
  113. .decode_graph = LSUP_nt_parse_doc,
  114. };
  115. /* * * Other internal functions. * * */
  116. /** Replace non-printable characters with their literal byte.
  117. *
  118. * Escape backslash is to be added separately.
  119. */
  120. static inline char replace_char(const char c) {
  121. switch (c) {
  122. case '\t': return 't';
  123. case '\b': return 'b';
  124. case '\n': return 'n';
  125. case '\r': return 'r';
  126. case '\f': return 'f';
  127. default: return c;
  128. }
  129. }
  130. static LSUP_CodecIterator *
  131. gr_to_nt_init (const LSUP_Graph *gr)
  132. {
  133. LSUP_CodecIterator *it;
  134. MALLOC_GUARD (it, NULL);
  135. LSUP_Triple lut = {NULL, NULL, NULL};
  136. it->codec = &nt_codec;
  137. it->gr_it = LSUP_graph_lookup(gr, &lut, &it->cur);
  138. it->nsm = LSUP_graph_namespace (gr);
  139. it->cur = 0;
  140. it->trp = LSUP_triple_new (TERM_DUMMY, TERM_DUMMY, TERM_DUMMY);
  141. it->rep = NULL;
  142. it->str_s = NULL;
  143. it->str_p = NULL;
  144. it->str_o = NULL;
  145. return it;
  146. }
  147. /** @brief Add escape character (backslash) to illegal literal characters.
  148. */
  149. static LSUP_rc
  150. escape_lit (const char *in, char **out_p)
  151. {
  152. size_t out_size = strlen (in) + 1;
  153. // Expand output string size to accommodate escape characters.
  154. for (
  155. size_t i = strcspn (in, LIT_ECHAR);
  156. i < strlen (in);
  157. i += strcspn (in + i + 1, LIT_ECHAR) + 1) {
  158. out_size ++;
  159. }
  160. char *out = calloc (1, out_size);
  161. if (UNLIKELY (!out)) return LSUP_MEM_ERR;
  162. size_t boundary;
  163. boundary = strcspn (in, LIT_ECHAR);
  164. for (size_t i = 0, j = 0;;) {
  165. out = strncat (out, in + i, boundary);
  166. i += boundary;
  167. j += boundary;
  168. if (i >= strlen (in)) break;
  169. out[j++] = '\\';
  170. out[j++] = replace_char (in[i++]);
  171. boundary = strcspn (in + i, LIT_ECHAR);
  172. }
  173. *out_p = out;
  174. return 0;
  175. }