From e6b4ff4b126f3f3e3c1a89eb5afcc15408f1b73f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A0=D1=83=D1=81=D0=BB=D0=B0=D0=BD=20=D0=98=D0=B6=D0=B1=D1?=
 =?UTF-8?q?=83=D0=BB=D0=B0=D1=82=D0=BE=D0=B2?=
Date: Sat, 24 Dec 2011 00:31:37 +0400
Subject: [PATCH] Extract keywords from filenames

---
 src/fs/fs_uri.c |  189 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 179 insertions(+), 10 deletions(-)

diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c
index 62fd513..7ca41ef 100644
--- a/src/fs/fs_uri.c
+++ b/src/fs/fs_uri.c
@@ -1577,6 +1577,156 @@ GNUNET_FS_uri_test_loc (const struct GNUNET_FS_Uri *uri)
   return uri->type == loc;
 }
 
+/**
+ * Store a copy of a keyword in the keyword array, prefixed with a space
+ * that marks it as 'non mandatory'.
+ *
+ * @param s keyword to store.
+ * @param array array to store the newly allocated keyword in.
+ * @param index position in the array to fill.
+ * @return 1 (the number of keywords added).
+ */
+static int
+insert_non_mandatory_keyword (const char *s, char **array, int index)
+{
+  char *nkword;
+  GNUNET_asprintf (&nkword, " %s", /* space to mark as 'non mandatory' */ s);
+  array[index] = nkword;
+  return 1;
+}
+
+/**
+ * Check whether a keyword is already present in the keyword array; the
+ * first character of each stored keyword (its marker) is not compared.
+ *
+ * @param s keyword to look for (without marker character).
+ * @param array array of keywords gathered so far.
+ * @param index number of entries of the array to check.
+ * @return GNUNET_YES if the keyword was found, GNUNET_NO otherwise.
+ */
+static int
+find_duplicate (const char *s, const char **array, int index)
+{
+  int j;
+  for (j = index - 1; j >= 0; j--)
+    if (0 == strcmp (&array[j][1], s))
+      return GNUNET_YES;
+  return GNUNET_NO;
+}
+
+/**
+ * Break the filename up by matching [], () and {} pairs to make
+ * keywords. In case of nesting parentheses only the inner pair counts.
+ * You can't escape parentheses to scan something like "[blah\{foo]" to
+ * make a "blah{foo" keyword, this function is only a heuristic!
+ *
+ * @param s string to break down.
+ * @param array array to fill with enclosed tokens. If NULL, then tokens
+ *        are only counted.
+ * @param index index at which to start filling the array (entries prior
+ *        to it are used to check for duplicates). ignored if array == NULL.
+ * @return number of tokens counted (including duplicates), or number of
+ *         tokens extracted (excluding duplicates). 0 if there are no
+ *         matching parens in the string (when counting), or when all tokens
+ *         were duplicates (when extracting).
+ */
+static int
+get_keywords_from_parens (char *s, char **array, int index)
+{
+  int count = 0;
+  char *open_paren, *close_paren, *inner, *scan, *ss, tmp;
+  if (NULL == s)
+    return 0;
+  /* extracting temporarily modifies the string, so work on a copy */
+  ss = (NULL != array) ? GNUNET_strdup (s) : s;
+  for (scan = ss; NULL != (open_paren = strpbrk (scan, "[{("));
+       scan = close_paren + 1)
+  {
+    close_paren = strpbrk (open_paren + 1, "]})");
+    if (NULL == close_paren)
+      break;
+    /* only the inner pair counts: use the last opener before the closer */
+    while ((NULL != (inner = strpbrk (open_paren + 1, "[{("))) &&
+           (inner < close_paren))
+      open_paren = inner;
+    /* opener and closer must be of the same kind and enclose something */
+    if ((close_paren - open_paren <= 1) ||
+        ! ((('[' == open_paren[0]) && (']' == close_paren[0])) ||
+           (('{' == open_paren[0]) && ('}' == close_paren[0])) ||
+           (('(' == open_paren[0]) && (')' == close_paren[0]))))
+      continue;
+    if (NULL == array)
+    {
+      count += 1;
+      continue;
+    }
+    tmp = close_paren[0];
+    close_paren[0] = '\0';
+    if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1],
+                                     (const char **) array, index + count))
+      count += insert_non_mandatory_keyword ((const char *) &open_paren[1],
+                                             array, index + count);
+    close_paren[0] = tmp;
+  }
+  if (NULL != array)
+    GNUNET_free (ss);
+  return count;
+}
+
+/**
+ * Break the filename up by "_", " " and "." (any other separators?) to make
+ * keywords.
+ *
+ * @param s string to break down.
+ * @param array array to fill with tokens. If NULL, then tokens are only
+ *        counted.
+ * @param index index at which to start filling the array (entries prior
+ *        to it are used to check for duplicates). ignored if array == NULL.
+ * @return number of tokens (>1) counted (including duplicates), or number of
+ *         tokens extracted (excluding duplicates). 0 if there are no
+ *         separators in the string (when counting), or when all tokens were
+ *         duplicates (when extracting).
+ */
+static int
+get_keywords_from_tokens (char *s, char **array, int index)
+{
+  char *p, *p_prev, *ss, tmp;
+  int seps = 0, found = 0;
+  if (NULL == s)
+    return 0;
+  /* extracting temporarily modifies the string, so work on a copy */
+  ss = (NULL != array) ? GNUNET_strdup (s) : s;
+  for (p_prev = p = ss; NULL != (p = strpbrk (p, "_. ")); p_prev = p = p + 1)
+  {
+    /* don't count 0-length tokens */
+    if (p == p_prev)
+      continue;
+    found = 1; /* remembered separately: 'seps' skips duplicates below */
+    if (NULL == array)
+    {
+      seps += 1;
+      continue;
+    }
+    tmp = p[0];
+    p[0] = '\0';
+    if (GNUNET_NO == find_duplicate ((const char *) p_prev,
+                                     (const char **) array, index + seps))
+      seps += insert_non_mandatory_keyword ((const char *) p_prev, array,
+                                            index + seps);
+    p[0] = tmp;
+  }
+  if (NULL == array)
+    /* Turn it into the number of keywords (1 separator == 2 keywords) */
+    return (seps > 0) ? seps + 1 : 0;
+  /* the text after the last separator is a token as well */
+  if (found && ('\0' != p_prev[0]) &&
+      (GNUNET_NO == find_duplicate ((const char *) p_prev,
+                                    (const char **) array, index + seps)))
+    seps += insert_non_mandatory_keyword ((const char *) p_prev, array,
+                                          index + seps);
+  GNUNET_free (ss);
+  return seps;
+}
 
 /**
  * Function called on each value in the meta data.
@@ -1601,18 +1751,14 @@ gather_uri_data (void *cls, const char *plugin_name,
                  const char *data_mime_type, const char *data, size_t data_len)
 {
   struct GNUNET_FS_Uri *uri = cls;
-  char *nkword;
-  int j;
 
   if ((format != EXTRACTOR_METAFORMAT_UTF8) &&
       (format != EXTRACTOR_METAFORMAT_C_STRING))
     return 0;
-  for (j = uri->data.ksk.keywordCount - 1; j >= 0; j--)
-    if (0 == strcmp (&uri->data.ksk.keywords[j][1], data))
-      return GNUNET_OK;
-  GNUNET_asprintf (&nkword, " %s", /* space to mark as 'non mandatory' */
-                   data);
-  uri->data.ksk.keywords[uri->data.ksk.keywordCount++] = nkword;
+  if (find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
+    return GNUNET_OK;
+  uri->data.ksk.keywordCount += insert_non_mandatory_keyword (data,
+      uri->data.ksk.keywords, uri->data.ksk.keywordCount);
   return 0;
 }
 
@@ -1630,7 +1776,9 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData
                                          *md)
 {
   struct GNUNET_FS_Uri *ret;
-  int ent;
+  char *filename = NULL, *full_name = NULL; /* full_name owns the string */
+  char *ss;
+  int ent, tok_keywords = 0, paren_keywords = 0;
 
   if (md == NULL)
     return NULL;
@@ -1639,9 +1787,30 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData
   ent = GNUNET_CONTAINER_meta_data_iterate (md, NULL, NULL);
   if (ent > 0)
   {
-    ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * ent);
+    full_name = GNUNET_CONTAINER_meta_data_get_first_by_types (md,
+        EXTRACTOR_METATYPE_FILENAME, -1);
+    if (NULL != full_name)
+    {
+      filename = full_name; /* advanced past the directory part below */
+      while (NULL != (ss = strstr (filename, DIR_SEPARATOR_STR)))
+        filename = ss + 1;
+      tok_keywords = get_keywords_from_tokens (filename, NULL, 0);
+      paren_keywords = get_keywords_from_parens (filename, NULL, 0);
+    }
+    ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * (ent
+        + tok_keywords + paren_keywords));
     GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret);
   }
+  if (tok_keywords > 0)
+    ret->data.ksk.keywordCount += get_keywords_from_tokens (filename,
+        ret->data.ksk.keywords,
+        ret->data.ksk.keywordCount);
+  if (paren_keywords > 0)
+    ret->data.ksk.keywordCount += get_keywords_from_parens (filename,
+        ret->data.ksk.keywords,
+        ret->data.ksk.keywordCount);
+  if (NULL != full_name) /* stays NULL if the meta data has no file name */
+    GNUNET_free (full_name);
   return ret;
 }
 
-- 
1.7.4