From e6b4ff4b126f3f3e3c1a89eb5afcc15408f1b73f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A0=D1=83=D1=81=D0=BB=D0=B0=D0=BD=20=D0=98=D0=B6=D0=B1=D1?=
 =?UTF-8?q?=83=D0=BB=D0=B0=D1=82=D0=BE=D0=B2?= <lrn1986@gmail.com>
Date: Sat, 24 Dec 2011 00:31:37 +0400
Subject: [PATCH] Extract keywords from filenames

---
 src/fs/fs_uri.c |  189 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 179 insertions(+), 10 deletions(-)

diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c
index 62fd513..7ca41ef 100644
--- a/src/fs/fs_uri.c
+++ b/src/fs/fs_uri.c
@@ -1577,6 +1577,156 @@ GNUNET_FS_uri_test_loc (const struct GNUNET_FS_Uri *uri)
   return uri->type == loc;
 }
 
+/* Put " <s>" (space marks 'non mandatory') into array[index]; returns 1. */
+static int
+insert_non_mandatory_keyword (const char *s, char **array, int index)
+{
+  /* format straight into the destination slot, no temporary needed */
+  GNUNET_asprintf (&array[index], " %s", s);
+  return 1;
+}
+
+/* Is s already in array[0..index-1]? Each entry's first char is skipped. */
+static int
+find_duplicate (const char *s, const char **array, int index)
+{
+  while (--index >= 0)
+    if (0 == strcmp (array[index] + 1, s))
+      return GNUNET_YES;
+  return GNUNET_NO;
+}
+
+/**
+ * Break the filename up by matching [], () and {} pairs to make
+ * keywords. In case of nesting parentheses only the inner pair counts:
+ * a closing paren pairs with the nearest opening paren before it.
+ * You can't escape parentheses to scan something like "[blah\{foo]" to
+ * make a "blah{foo" keyword, this function is only a heuristic!
+ *
+ * @param s string to break down; may be NULL (yields 0).
+ * @param array array to fill with enclosed tokens. If NULL, then tokens
+ *        are only counted.
+ * @param index index at which to start filling the array (entries prior
+ *        to it are used to check for duplicates). ignored if array == NULL.
+ * @return number of tokens counted (including duplicates), or number of
+ *         tokens extracted (excluding duplicates). 0 if there are no
+ *         matching parens in the string (when counting), or when all tokens
+ *         were duplicates (when extracting).
+ */
+static int
+get_keywords_from_parens (char *s, char **array, int index)
+{
+  int count = 0;
+  char *open_paren = NULL;
+  char *pos, *ss;
+  char expected;
+  if (NULL == s)
+    return 0;
+  /* extraction temporarily NUL-terminates tokens in place, so use a copy */
+  ss = (NULL != array) ? GNUNET_strdup (s) : s;
+  for (pos = strpbrk (ss, "[{(]})"); NULL != pos; pos = strpbrk (pos + 1, "[{(]})"))
+  {
+    if (NULL != strchr ("[{(", pos[0]))
+    {
+      /* remember only the most recent opener: inner pairs win */
+      open_paren = pos;
+      continue;
+    }
+    /* pos is a closer; it needs a pending opener of the same kind */
+    if (NULL == open_paren)
+      continue;
+    switch (open_paren[0])
+    {
+    case '[':
+      expected = ']';
+      break;
+    case '{':
+      expected = '}';
+      break;
+    default:
+      expected = ')';
+      break;
+    }
+    /* skip mismatched pairs and empty ones such as "()" */
+    if ((expected == pos[0]) && (pos - open_paren > 1))
+    {
+      if (NULL == array)
+        count += 1;
+      else
+      {
+        pos[0] = '\0';
+        if (GNUNET_NO == find_duplicate (&open_paren[1], (const char **) array, index + count))
+          count += insert_non_mandatory_keyword (&open_paren[1], array, index + count);
+        pos[0] = expected;
+      }
+    }
+    /* matched or not, this closer consumes the pending opener */
+    open_paren = NULL;
+  }
+  if (NULL != array)
+    GNUNET_free (ss);
+  return count;
+}
+
+/**
+ * Break the filename up by "_", " " and "." (any other separators?) to make
+ * keywords. The text after the last separator is a token too, but only
+ * when at least one non-empty token precedes it.
+ *
+ * @param s string to break down; may be NULL (yields 0).
+ * @param array array to fill with tokens. If NULL, then tokens are only
+ *        counted.
+ * @param index index at which to start filling the array (entries prior
+ *        to it are used to check for duplicates). ignored if array == NULL.
+ * @return number of tokens (>1) counted (including duplicates), or number of
+ *         tokens extracted (excluding duplicates). 0 if there are no
+ *         separators in the string (when counting), or when all tokens were
+ *         duplicates (when extracting).
+ */
+static int
+get_keywords_from_tokens (char *s, char **array, int index)
+{
+  char *p, *p_prev, *ss;
+  char tmp;
+  int found = 0;                /* non-empty tokens that end at a separator */
+  int added = 0;                /* keywords actually stored in array */
+
+  if (NULL == s)
+    return 0;
+  /* extraction temporarily NUL-terminates tokens in place, so use a copy */
+  ss = (NULL != array) ? GNUNET_strdup (s) : s;
+  for (p_prev = p = ss; NULL != (p = strpbrk (p, "_. ")); p_prev = p = p + 1)
+  {
+    /* don't count 0-length tokens */
+    if (p == p_prev)
+      continue;
+    found += 1;
+    if (NULL == array)
+      continue;
+    tmp = p[0];
+    p[0] = '\0';
+    if (GNUNET_NO == find_duplicate (p_prev, (const char **) array, index + added))
+      added += insert_non_mandatory_keyword (p_prev, array, index + added);
+    p[0] = tmp;
+  }
+  if (NULL == array)
+  {
+    /* Turn it into the number of keywords (1 separator == 2 keywords) */
+    return (found > 0) ? found + 1 : 0;
+  }
+  /*
+   * p_prev now points at whatever follows the last separator. Keep it only
+   * if the name really was split ('found', not 'added': earlier tokens may
+   * all have been duplicates) and the remainder is non-empty and new.
+   */
+  if ((found > 0) && ('\0' != p_prev[0]) &&
+      (GNUNET_NO == find_duplicate (p_prev, (const char **) array, index + added)))
+  {
+    added += insert_non_mandatory_keyword (p_prev, array, index + added);
+  }
+  GNUNET_free (ss);
+  return added;
+}
 
 /**
  * Function called on each value in the meta data.
@@ -1601,18 +1751,14 @@ gather_uri_data (void *cls, const char *plugin_name,
                  const char *data_mime_type, const char *data, size_t data_len)
 {
   struct GNUNET_FS_Uri *uri = cls;
-  char *nkword;
-  int j;
 
   if ((format != EXTRACTOR_METAFORMAT_UTF8) &&
       (format != EXTRACTOR_METAFORMAT_C_STRING))
     return 0;
-  for (j = uri->data.ksk.keywordCount - 1; j >= 0; j--)
-    if (0 == strcmp (&uri->data.ksk.keywords[j][1], data))
-      return GNUNET_OK;
-  GNUNET_asprintf (&nkword, " %s",      /* space to mark as 'non mandatory' */
-                   data);
-  uri->data.ksk.keywords[uri->data.ksk.keywordCount++] = nkword;
+  if (find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
+    return GNUNET_OK;
+  uri->data.ksk.keywordCount += insert_non_mandatory_keyword (data,
+      uri->data.ksk.keywords, uri->data.ksk.keywordCount);
   return 0;
 }
 
@@ -1630,7 +1776,9 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData
                                          *md)
 {
   struct GNUNET_FS_Uri *ret;
-  int ent;
+  char *filename, *full_name;
+  char *ss;
+  int ent, tok_keywords = 0, paren_keywords = 0;
 
   if (md == NULL)
     return NULL;
@@ -1639,9 +1787,30 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData
   ent = GNUNET_CONTAINER_meta_data_iterate (md, NULL, NULL);
   if (ent > 0)
   {
-    ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * ent);
+    full_name = GNUNET_CONTAINER_meta_data_get_first_by_types (md,
+        EXTRACTOR_METATYPE_FILENAME, -1);
+    if (NULL != full_name)
+    {
+      filename = full_name;
+      while (NULL != (ss = strstr (filename, DIR_SEPARATOR_STR)))
+        filename = ss + 1;
+      tok_keywords = get_keywords_from_tokens (filename, NULL, 0);
+      paren_keywords = get_keywords_from_parens (filename, NULL, 0);
+    }
+    ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * (ent
+        + tok_keywords + paren_keywords));
     GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret);
   }
+  if (tok_keywords > 0)
+    ret->data.ksk.keywordCount += get_keywords_from_tokens (filename,
+        ret->data.ksk.keywords,
+        ret->data.ksk.keywordCount);
+  if (paren_keywords > 0)
+    ret->data.ksk.keywordCount += get_keywords_from_parens (filename,
+        ret->data.ksk.keywords,
+        ret->data.ksk.keywordCount);
+  if (ent > 0)
+    GNUNET_free (full_name);
   return ret;
 }
 
-- 
1.7.4

