eolian: add documentation tokenizer

This provides an API to tokenize Eolian docstrings. It does not yet switch the rest of the infra to it, but it does have tests. It doesn't verify correctness of references, as that's Eolian's job. Therefore it's also your job to provide it with strings that do not contain invalid references. Lua bindings are to come and a complete switch will be done later. @feature
2016-12-01 16:37:01 +01:00 · 2016-12-01 16:37:01 +01:00 · d2105f99d4
parent 7d7c17c22a
commit d2105f99d4
3 changed files with 408 additions and 0 deletions
--- a/src/lib/eolian/Eolian.h
+++ b/src/lib/eolian/Eolian.h
@ -336,6 +336,25 @@ typedef enum
   EOLIAN_DECL_VAR
 } Eolian_Declaration_Type;

+typedef enum
+{
+   EOLIAN_DOC_TOKEN_UNKNOWN = -1, /* no token: freshly initialized, end of input or failure */
+   EOLIAN_DOC_TOKEN_TEXT, /* plain text */
+   EOLIAN_DOC_TOKEN_REF, /* reference, written as @Name, @Name.member or @.member */
+   EOLIAN_DOC_TOKEN_REF_EVENT, /* event reference, written as @[Name.event,name] */
+   EOLIAN_DOC_TOKEN_MARK_NOTE, /* "Note: " mark at the start of a paragraph */
+   EOLIAN_DOC_TOKEN_MARK_WARNING, /* "Warning: " mark at the start of a paragraph */
+   EOLIAN_DOC_TOKEN_MARK_REMARK, /* "Remark: " mark at the start of a paragraph */
+   EOLIAN_DOC_TOKEN_MARK_TODO, /* "TODO: " mark at the start of a paragraph */
+   EOLIAN_DOC_TOKEN_MARKUP_MONOSPACE /* monospace markup, written as $name */
+} Eolian_Doc_Token_Type;
+
+typedef struct _Eolian_Doc_Token
+{
+   Eolian_Doc_Token_Type type; /* kind of token; also used by the tokenizer to tell whether it already started */
+   const char *text, *text_end; /* token range [text, text_end) pointing into the tokenized string */
+} Eolian_Doc_Token;
+
 /*
 * @brief Parse the given .eo or .eot file and fill the database.
 *
@ -2293,6 +2312,72 @@ EAPI Eina_Stringshare *eolian_documentation_description_get(const Eolian_Documen
 */
 EAPI Eina_Stringshare *eolian_documentation_since_get(const Eolian_Documentation *doc);

+/*
+ * @brief Split a documentation string into individual paragraphs.
+ *
+ * Paragraphs are separated by an empty line ("\n\n"). Each paragraph is
+ * trimmed and paragraphs that end up empty are left out of the result.
+ *
+ * The items of the resulting list are allocated strings that have to be
+ * freed with free().
+ *
+ * @param[in] doc the documentation string
+ * @return a list of allocated strings containing paragraphs, NULL if doc
+ * is NULL or contains no paragraphs
+ *
+ * @ingroup Eolian
+ */
+EAPI Eina_List *eolian_documentation_string_split(const char *doc);
+
+/*
+ * @brief Tokenize a documentation paragraph.
+ *
+ * This gradually splits the string into pieces (text, references, marks,
+ * monospace markup) so that it can be more easily turned into a
+ * representation you want. At the end of the input, or when doc is NULL,
+ * the token is set to EOLIAN_DOC_TOKEN_UNKNOWN and NULL is returned.
+ *
+ * The function never allocates any memory and doesn't hold any state, instead
+ * the returned continuation has to be passed as first param on next iteration
+ * and you have to make sure the input data stays valid until you're completely
+ * done.
+ *
+ * The input string is assumed to be a single paragraph with all unnecessary
+ * whitespace already trimmed.
+ *
+ * The token must not be NULL (NULL is returned right away in that case) and
+ * has to be initialized with eolian_doc_token_init() before the first call:
+ * marks such as "Note: " are only recognized while the token type is still
+ * EOLIAN_DOC_TOKEN_UNKNOWN.
+ *
+ * @param[in] doc the documentation string
+ * @param[in,out] ret the token
+ * @return a continuation of the input string or NULL when done
+ *
+ * @ingroup Eolian
+ */
+EAPI const char *eolian_documentation_tokenize(const char *doc, Eolian_Doc_Token *ret);
+
+/*
+ * @brief Initialize a documentation token into an empty state.
+ *
+ * The type is set to EOLIAN_DOC_TOKEN_UNKNOWN and the text range is cleared.
+ * Nothing happens if the token is NULL.
+ *
+ * @param[out] tok the token
+ */
+EAPI void eolian_doc_token_init(Eolian_Doc_Token *tok);
+
+/*
+ * @brief Get the type of a documentation token.
+ *
+ * @param[in] tok the token
+ * @return the token type, EOLIAN_DOC_TOKEN_UNKNOWN if the token is NULL
+ */
+EAPI Eolian_Doc_Token_Type eolian_doc_token_type_get(const Eolian_Doc_Token *tok);
+
+/*
+ * @brief Get the text of a documentation token.
+ *
+ * Works on every token type, but for unknown tokens (and for a NULL token)
+ * it returns NULL.
+ * You need to free the text once you're done using normal free().
+ * This makes sure all escapes in the original doc comments are properly
+ * removed so you can use the string as-is.
+ *
+ * @param[in] tok the token
+ * @return the newly allocated token text or NULL
+ */
+EAPI char *eolian_doc_token_text_get(const Eolian_Doc_Token *tok);
+
 #endif

 /**
--- a/src/lib/eolian/eolian_database.c
+++ b/src/lib/eolian/eolian_database.c
@ -2,6 +2,7 @@
 # include "config.h"
 #endif

+#include <ctype.h>
 #include <libgen.h>
 #include <Eina.h>
 #include "eo_parser.h"
@ -211,6 +212,246 @@ eolian_documentation_since_get(const Eolian_Documentation *doc)
   return doc->since;
 }

+/* Split doc on empty lines ("\n\n") into a list of trimmed, malloc'd
+ * paragraph strings; paragraphs that are empty after trimming are dropped.
+ */
+EAPI Eina_List *
+eolian_documentation_string_split(const char *doc)
+{
+   EINA_SAFETY_ON_NULL_RETURN_VAL(doc, NULL);
+
+   Eina_List *pars = NULL;
+   const char *cur = doc;
+
+   for (;;)
+     {
+        /* brk is the paragraph break ending the current paragraph, if any */
+        const char *brk = strstr(cur, "\n\n");
+        Eina_Strbuf *par = eina_strbuf_new();
+
+        if (!brk)
+          eina_strbuf_append(par, cur);
+        else
+          eina_strbuf_append_length(par, cur, brk - cur);
+        eina_strbuf_trim(par);
+
+        /* the list takes over the string, the (now empty) buffer is ours */
+        if (eina_strbuf_length_get(par))
+          pars = eina_list_append(pars, eina_strbuf_string_steal(par));
+        eina_strbuf_free(par);
+
+        /* that was the last paragraph */
+        if (!brk)
+          return pars;
+
+        cur = brk + 2;
+     }
+}
+
+/* Skip one reference word: a letter or underscore followed by any number
+ * of letters, digits and underscores.
+ *
+ * Returns EINA_TRUE with *doc moved just past the word when a word starts
+ * at *doc, EINA_FALSE with *doc left untouched otherwise.
+ */
+static Eina_Bool
+_skip_ref_word(const char **doc)
+{
+   const char *p = *doc;
+
+   /* <ctype.h> functions are only defined for values representable as
+    * unsigned char (or EOF); a plain char may be negative, so cast first
+    */
+   if ((p[0] != '_') && !isalpha((unsigned char)p[0]))
+     return EINA_FALSE;
+
+   while ((p[0] == '_') || isalnum((unsigned char)p[0]))
+     ++p;
+
+   *doc = p;
+   return EINA_TRUE;
+}
+
+/* Check whether a well-formed reference starts at doc.
+ *
+ * The accepted forms are "@Name", "@Name.space.member", "@.member" and,
+ * for events, "@[Name.space.event,name]". This makes sure the format is
+ * correct at least; it cannot verify the correctness of the reference
+ * itself (but Eolian will do it in its lexer, so there is nothing to worry
+ * about; all references are guaranteed to be right).
+ *
+ * On success the matching token type is returned and, if doc_end is not
+ * NULL, *doc_end is set to the first character after the reference.
+ * Otherwise EOLIAN_DOC_TOKEN_UNKNOWN is returned and *doc_end is left alone.
+ */
+static Eolian_Doc_Token_Type
+_get_ref_token(const char *doc, const char **doc_end)
+{
+   /* not a ref at all, for convenience */
+   if (doc[0] != '@')
+     return EOLIAN_DOC_TOKEN_UNKNOWN;
+
+   ++doc;
+
+   Eina_Bool is_event = (doc[0] == '[');
+   if (is_event)
+     ++doc;
+
+   /* optional leading dot, as in @.member */
+   if (doc[0] == '.')
+     ++doc;
+
+   /* at least one word is mandatory; without it a lone '@' (or "@[]")
+    * would be reported as an empty reference
+    */
+   if (!_skip_ref_word(&doc))
+     return EOLIAN_DOC_TOKEN_UNKNOWN;
+
+   /* further dot separated words; a dot that is not followed by a word
+    * (such as the end of a sentence) is not a part of the reference
+    */
+   while (doc[0] == '.')
+     {
+        ++doc;
+        if (!_skip_ref_word(&doc))
+          {
+             --doc;
+             break;
+          }
+     }
+
+   if (is_event)
+     {
+        /* event names may additionally contain comma separated words */
+        while (doc[0] == ',')
+          {
+             ++doc;
+             if (!_skip_ref_word(&doc))
+               {
+                  --doc;
+                  break;
+               }
+          }
+        /* and have to be terminated */
+        if (doc[0] != ']')
+          return EOLIAN_DOC_TOKEN_UNKNOWN;
+        ++doc;
+     }
+
+   if (doc_end)
+     *doc_end = doc;
+
+   /* got a reference */
+   return is_event ? EOLIAN_DOC_TOKEN_REF_EVENT : EOLIAN_DOC_TOKEN_REF;
+}
+
+/* Extract the next token of a documentation paragraph.
+ *
+ * doc is the paragraph on the first call and the previously returned
+ * continuation afterwards. ret receives the token; its type on entry tells
+ * whether this is the first call (EOLIAN_DOC_TOKEN_UNKNOWN) or not, so it
+ * has to be initialized before the first call and must not be NULL.
+ *
+ * Returns the position to continue from, or NULL (with ret reset to an
+ * unknown token) when doc is NULL or empty.
+ */
+EAPI const char *
+eolian_documentation_tokenize(const char *doc, Eolian_Doc_Token *ret)
+{
+   /* marks a paragraph can start with; len excludes the terminating NUL */
+   static const struct
+   {
+      const char *str;
+      size_t len;
+      Eolian_Doc_Token_Type type;
+   } marks[] =
+   {
+      { "Note: ", sizeof("Note: ") - 1, EOLIAN_DOC_TOKEN_MARK_NOTE },
+      { "Warning: ", sizeof("Warning: ") - 1, EOLIAN_DOC_TOKEN_MARK_WARNING },
+      { "Remark: ", sizeof("Remark: ") - 1, EOLIAN_DOC_TOKEN_MARK_REMARK },
+      { "TODO: ", sizeof("TODO: ") - 1, EOLIAN_DOC_TOKEN_MARK_TODO }
+   };
+
+   /* token is used for statekeeping, so force it */
+   EINA_SAFETY_ON_NULL_RETURN_VAL(ret, NULL);
+
+   /* we've reached the end or invalid input */
+   if (!doc || !doc[0])
+     {
+        ret->text = ret->text_end = NULL;
+        ret->type = EOLIAN_DOC_TOKEN_UNKNOWN;
+        return NULL;
+     }
+
+   /* we can only check notes etc at beginning of parsing, which is when
+    * the token does not hold anything yet
+    */
+   if (ret->type == EOLIAN_DOC_TOKEN_UNKNOWN)
+     {
+        for (size_t i = 0; i < (sizeof(marks) / sizeof(marks[0])); ++i)
+          {
+             if (strncmp(doc, marks[i].str, marks[i].len))
+               continue;
+             /* the token covers the mark including the trailing space */
+             ret->text = doc;
+             ret->text_end = doc + marks[i].len;
+             ret->type = marks[i].type;
+             return ret->text_end;
+          }
+     }
+
+   /* monospace markup ($foo); the casts keep the <ctype.h> calls defined
+    * for chars with negative values
+    */
+   if ((doc[0] == '$') &&
+       ((doc[1] == '_') || isalpha((unsigned char)doc[1])))
+     {
+        /* the token text does not include the '$' */
+        ret->text = ++doc;
+        ret->text_end = ret->text;
+        while ((ret->text_end[0] == '_') ||
+               isalnum((unsigned char)ret->text_end[0]))
+          ++ret->text_end;
+        ret->type = EOLIAN_DOC_TOKEN_MARKUP_MONOSPACE;
+        return ret->text_end;
+     }
+
+   /* references; the token text does not include the '@' */
+   Eolian_Doc_Token_Type rtp = _get_ref_token(doc, &ret->text_end);
+   if (rtp != EOLIAN_DOC_TOKEN_UNKNOWN)
+     {
+        ret->text = doc + 1;
+        ret->type = rtp;
+        return ret->text_end;
+     }
+
+   /* plain text: keep finding potential tokens until a suitable one is
+    * found and terminate the text token there (it also means next token
+    * can directly be tested for event/monospace)
+    */
+   const char *stop = NULL;
+   for (const char *p = strpbrk(doc, "@$"); p; p = strpbrk(p + 1, "@$"))
+     {
+        /* escape sequences stay a part of the text */
+        if ((p != doc) && (p[-1] == '\\'))
+          continue;
+        /* monospace markup or reference */
+        if (((p[0] == '$') &&
+             ((p[1] == '_') || isalpha((unsigned char)p[1]))) ||
+            (_get_ref_token(p, NULL) != EOLIAN_DOC_TOKEN_UNKNOWN))
+          {
+             stop = p;
+             break;
+          }
+        /* nothing, keep matching text from next char on */
+     }
+
+   /* figure out where we actually end */
+   ret->text = doc;
+   ret->text_end = stop ? stop : (doc + strlen(doc));
+   ret->type = EOLIAN_DOC_TOKEN_TEXT;
+   return ret->text_end;
+}
+
+/* Put a token into its empty state: unknown type and no text range.
+ * A NULL token is silently ignored.
+ */
+EAPI void
+eolian_doc_token_init(Eolian_Doc_Token *tok)
+{
+   if (tok)
+     {
+        tok->text = NULL;
+        tok->text_end = NULL;
+        tok->type = EOLIAN_DOC_TOKEN_UNKNOWN;
+     }
+}
+
+/* Return the type stored in the token; a NULL token goes through the
+ * safety check and yields EOLIAN_DOC_TOKEN_UNKNOWN.
+ */
+EAPI Eolian_Doc_Token_Type
+eolian_doc_token_type_get(const Eolian_Doc_Token *tok)
+{
+   EINA_SAFETY_ON_NULL_RETURN_VAL(tok, EOLIAN_DOC_TOKEN_UNKNOWN);
+
+   const Eolian_Doc_Token_Type kind = tok->type;
+   return kind;
+}
+
+/* Return a newly allocated copy of the token text with backslash escapes
+ * removed (the character following a backslash is copied literally).
+ *
+ * Returns NULL for a NULL token, for an unknown token and when the buffer
+ * cannot be allocated. The caller frees the result with free().
+ */
+EAPI char *
+eolian_doc_token_text_get(const Eolian_Doc_Token *tok)
+{
+   EINA_SAFETY_ON_NULL_RETURN_VAL(tok, NULL);
+   if (tok->type == EOLIAN_DOC_TOKEN_UNKNOWN)
+     return NULL;
+
+   Eina_Strbuf *buf = eina_strbuf_new();
+   if (!buf)
+     return NULL;
+
+   for (const char *p = tok->text; p != tok->text_end; ++p)
+     {
+        /* a backslash escapes the next character; when it is the very
+         * last character of the token there is nothing to escape and we
+         * have to stop here, otherwise the loop increment would step
+         * over text_end and keep reading past the token
+         */
+        if ((*p == '\\') && (++p == tok->text_end))
+          break;
+        eina_strbuf_append_char(buf, *p);
+     }
+
+   /* stealing only detaches the string, the buffer still has to go */
+   char *text = eina_strbuf_string_steal(buf);
+   eina_strbuf_free(buf);
+   return text;
+}
+
 #define EO_SUFFIX ".eo"
 #define EOT_SUFFIX ".eot"

--- a/src/tests/eolian/eolian_parsing.c
+++ b/src/tests/eolian/eolian_parsing.c
@ -1188,6 +1188,88 @@ START_TEST(eolian_docs)
   fail_if(strcmp(eolian_documentation_since_get(doc),
                  "1.66"));

+   const char *sdesc = eolian_documentation_description_get(doc);
+   Eina_List *sdoc = eolian_documentation_string_split(sdesc);
+
+   char *dpar = eina_list_data_get(sdoc);
+   fail_if(strcmp(dpar, "Note: This is a note."));
+   sdoc = eina_list_remove_list(sdoc, sdoc);
+   dpar = eina_list_data_get(sdoc);
+   fail_if(strcmp(dpar, "This is a longer description for struct Foo."));
+   EINA_LIST_FREE(sdoc, dpar)
+     free(dpar);
+
+   const char *tdoc = "Note: This is $something, see @Blah, @.bleh, "
+                      "@Foo.Bar.baz, \\@ref foo and @[Things.Stuffs.foo,bar].";
+
+   Eolian_Doc_Token tok;
+   eolian_doc_token_init(&tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_UNKNOWN);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_MARK_NOTE);
+   char *txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, "Note: "));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_TEXT);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, "This is "));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_MARKUP_MONOSPACE);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, "something"));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_TEXT);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, ", see "));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_REF);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, "Blah"));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_TEXT);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, ", "));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_REF);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, ".bleh"));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_TEXT);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, ", "));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_REF);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, "Foo.Bar.baz"));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_TEXT);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, ", @ref foo and "));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_REF_EVENT);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, "[Things.Stuffs.foo,bar]"));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(tdoc[0] != '\0');
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_TEXT);
+   txt = eolian_doc_token_text_get(&tok);
+   fail_if(strcmp(txt, "."));
+   free(txt);
+   tdoc = eolian_documentation_tokenize(tdoc, &tok);
+   fail_if(tdoc != NULL);
+   fail_if(eolian_doc_token_type_get(&tok) != EOLIAN_DOC_TOKEN_UNKNOWN);
+
   fail_if(!(sfl = eolian_typedecl_struct_field_get(tdl, "field1")));
   fail_if(!(doc = eolian_typedecl_struct_field_documentation_get(sfl)));
   fail_if(strcmp(eolian_documentation_summary_get(doc),