View Issue Details
ID | Project | Category | View Status | Date Submitted | Last Update |
---|---|---|---|---|---|
0002052 | GNUnet | file-sharing service | public | 2011-12-30 19:47 | 2012-02-28 11:04 |
Reporter | LRN | Assigned To | Christian Grothoff | ||
Priority | low | Severity | feature | Reproducibility | N/A |
Status | closed | Resolution | fixed | ||
Product Version | Git master | ||||
Target Version | 0.9.2 | Fixed in Version | 0.9.2 | ||
Summary | 0002052: [patch] libunistring-based normalization | ||||
Description | Normalization is back, and it is now done by libunistring \o/ Currently, normalization means lowercase conversion. Planned improvements: 1) Use the libunistring character classification system to get rid of ALL punctuation characters (including Unicode ones) in all autogenerated keywords (right now only filename-generated keywords are stripped of punctuation characters, and only of the most basic ones; it doesn't work with Unicode). 2) As mentioned in (1), parentheses-based and delimiter-based keyword splitting is not Unicode-aware, but it can (hopefully) be made so with libunistring. Patch 0010-New-normalization-with-libunistring.patch is attached. | ||||
Tags | No tags attached. | ||||
Attached Files | 0010-New-normalization-with-libunistring.patch (6,705 bytes)
From 951a3316ecdc42f1fb6bbe9a02531a71d6423707 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A0=D1=83=D1=81=D0=BB=D0=B0=D0=BD=20=D0=98=D0=B6=D0=B1=D1?= =?UTF-8?q?=83=D0=BB=D0=B0=D1=82=D0=BE=D0=B2?= <lrn1986@gmail.com> Date: Thu, 29 Dec 2011 22:00:10 +0400 Subject: [PATCH 10/10] New normalization with libunistring --- src/fs/Makefile.am | 2 +- src/fs/fs_uri.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 98 insertions(+), 9 deletions(-) diff --git a/src/fs/Makefile.am b/src/fs/Makefile.am index bf355e9..f480f28 100644 --- a/src/fs/Makefile.am +++ b/src/fs/Makefile.am @@ -43,7 +43,7 @@ libgnunetfs_la_LIBADD = \ $(top_builddir)/src/datastore/libgnunetdatastore.la \ $(top_builddir)/src/util/libgnunetutil.la \ -lextractor \ - $(GN_LIBINTL) $(XLIB) + $(GN_LIBINTL) $(XLIB) -lunistring libgnunetfs_la_LDFLAGS = \ $(GN_LIB_LDFLAGS) $(WINFLAGS) \ diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c index a39eba8..93f9366 100644 --- a/src/fs/fs_uri.c +++ b/src/fs/fs_uri.c @@ -82,6 +82,11 @@ #include "gnunet_fs_service.h" #include "gnunet_signatures.h" #include "fs_api.h" +#include <unicase.h> +#include <unistr.h> +#include <unistdio.h> +#include <uniconv.h> + /** @@ -1493,6 +1498,48 @@ find_duplicate (const char *s, const char **array, int array_length) return GNUNET_NO; } +static char * +normalize_metadata (enum EXTRACTOR_MetaFormat format, const char *data, + size_t data_len) +{ + uint8_t *free_str = NULL; + uint8_t *str_to_normalize = (uint8_t *) data; + uint8_t *normalized; + size_t r_len; + if (str_to_normalize == NULL) + return NULL; + /* Don't trust libextractor */ + if (format == EXTRACTOR_METAFORMAT_UTF8) + { + free_str = (uint8_t *) u8_check ((const uint8_t *) data, data_len); + if (free_str == NULL) + free_str = NULL; + else + format = EXTRACTOR_METAFORMAT_C_STRING; + } + if (format == EXTRACTOR_METAFORMAT_C_STRING) + { + free_str = u8_strconv_from_encoding (data, locale_charset (), iconveh_escape_sequence); + if (free_str == NULL) + return 
NULL; + } + + normalized = u8_tolower (str_to_normalize, strlen ((char *) str_to_normalize), NULL, UNINORM_NFD, NULL, &r_len); + /* free_str is allocated by libunistring internally, use free() */ + if (free_str != NULL) + free (free_str); + if (normalized != NULL) + { + /* u8_tolower allocates a non-NULL-terminated string! */ + free_str = GNUNET_malloc (r_len + 1); + memcpy (free_str, normalized, r_len); + free_str[r_len] = '\0'; + free (normalized); + normalized = free_str; + } + return (char *) normalized; +} + /** * Break the filename up by matching [], () and {} pairs to make @@ -1551,14 +1598,29 @@ get_keywords_from_parens (const char *s, char **array, int index) { if (NULL != array) { + char *normalized; tmp = close_paren[0]; close_paren[0] = '\0'; - if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], (const char **) array, index + count)) + if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], + (const char **) array, index + count)) { insert_non_mandatory_keyword ((const char *) &open_paren[1], array, index + count); count++; } + normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8, + &open_paren[1], close_paren - &open_paren[1]); + if (normalized != NULL) + { + if (GNUNET_NO == find_duplicate ((const char *) normalized, + (const char **) array, index + count)) + { + insert_non_mandatory_keyword ((const char *) normalized, array, + index + count); + count++; + } + GNUNET_free (normalized); + } close_paren[0] = tmp; } else @@ -1601,12 +1663,26 @@ get_keywords_from_tokens (const char *s, char **array, int index) { if (NULL != array) { + char *normalized; if (GNUNET_NO == find_duplicate (p, (const char **) array, index + seps)) { insert_non_mandatory_keyword (p, array, index + seps); seps++; } + normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8, + p, strlen (p)); + if (normalized != NULL) + { + if (GNUNET_NO == find_duplicate ((const char *) normalized, + (const char **) array, index + seps)) + { + 
insert_non_mandatory_keyword ((const char *) normalized, array, + index + seps); + seps++; + } + GNUNET_free (normalized); + } } else seps++; @@ -1616,7 +1692,6 @@ get_keywords_from_tokens (const char *s, char **array, int index) } #undef TOKENS - /** * Function called on each value in the meta data. * Adds it to the URI. @@ -1640,15 +1715,28 @@ gather_uri_data (void *cls, const char *plugin_name, const char *data_mime_type, const char *data, size_t data_len) { struct GNUNET_FS_Uri *uri = cls; + char *normalized_data; if ((format != EXTRACTOR_METAFORMAT_UTF8) && (format != EXTRACTOR_METAFORMAT_C_STRING)) return 0; - if (find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) - return GNUNET_OK; - insert_non_mandatory_keyword (data, - uri->data.ksk.keywords, uri->data.ksk.keywordCount); - uri->data.ksk.keywordCount++; + normalized_data = normalize_metadata (format, data, data_len); + if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) + { + insert_non_mandatory_keyword (data, + uri->data.ksk.keywords, uri->data.ksk.keywordCount); + uri->data.ksk.keywordCount++; + } + if (normalized_data != NULL) + { + if (!find_duplicate (normalized_data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) + { + insert_non_mandatory_keyword (normalized_data, + uri->data.ksk.keywords, uri->data.ksk.keywordCount); + uri->data.ksk.keywordCount++; + } + GNUNET_free (normalized_data); + } return 0; } @@ -1690,8 +1778,9 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData tok_keywords = get_keywords_from_tokens (filename, NULL, 0); paren_keywords = get_keywords_from_parens (filename, NULL, 0); } + /* x2 because there might be a normalized variant of every keyword */ ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * (ent - + tok_keywords + paren_keywords)); + + tok_keywords + paren_keywords) * 2); GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, 
ret); } if (tok_keywords > 0) -- 1.7.4 | ||||
Date Modified | Username | Field | Change |
---|---|---|---|
2011-12-30 19:47 | LRN | New Issue | |
2011-12-30 19:47 | LRN | File Added: 0010-New-normalization-with-libunistring.patch | |
2011-12-30 19:48 | LRN | Summary | libunistring-based normalization => [patch] libunistring-based normalization |
2011-12-30 19:48 | LRN | Description Updated | |
2011-12-30 23:31 | Christian Grothoff | Note Added: 0005221 | |
2011-12-30 23:31 | Christian Grothoff | Status | new => resolved |
2011-12-30 23:31 | Christian Grothoff | Fixed in Version | => 0.9.2 |
2011-12-30 23:31 | Christian Grothoff | Resolution | open => fixed |
2011-12-30 23:31 | Christian Grothoff | Assigned To | => Christian Grothoff |
2011-12-31 00:15 | Christian Grothoff | Target Version | => 0.9.2 |
2012-02-28 11:04 | Christian Grothoff | Status | resolved => closed |