0002052: [patch] libunistring-based normalization - MantisBT

ID	Project	Category	View Status	Date Submitted	Last Update

0002052	GNUnet	file-sharing service	public	2011-12-30 19:47	2012-02-28 11:04

Reporter	LRN	Assigned To	Christian Grothoff
Priority	low	Severity	feature	Reproducibility	N/A
Status	closed	Resolution	fixed
Product Version	Git master
Target Version	0.9.2	Fixed in Version	0.9.2

Summary	0002052: [patch] libunistring-based normalization
Description	Normalization is back, and it is now done by libunistring \o/ Normalization now means lowercase conversion. Planned improvements: 1) Use the libunistring character classification system to get rid of ALL punctuation characters (including Unicode ones) in all autogenerated keywords. Right now only filename-generated keywords are stripped of punctuation characters, and only of the most basic ones; this does not work with Unicode. 2) As mentioned in (1), parentheses-based and delimiter-based keyword splitting is not Unicode-aware, but it can (hopefully) be made so with libunistring. Patch 0010-New-normalization-with-libunistring.patch is attached.
Tags	No tags attached.
Attached Files	0010-New-normalization-with-libunistring.patch (6,705 bytes) From 951a3316ecdc42f1fb6bbe9a02531a71d6423707 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A0=D1=83=D1=81=D0=BB=D0=B0=D0=BD=20=D0=98=D0=B6=D0=B1=D1?= =?UTF-8?q?=83=D0=BB=D0=B0=D1=82=D0=BE=D0=B2?= <lrn1986@gmail.com> Date: Thu, 29 Dec 2011 22:00:10 +0400 Subject: [PATCH 10/10] New normalization with libunistring --- src/fs/Makefile.am \| 2 +- src/fs/fs_uri.c \| 105 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 98 insertions(+), 9 deletions(-) diff --git a/src/fs/Makefile.am b/src/fs/Makefile.am index bf355e9..f480f28 100644 --- a/src/fs/Makefile.am +++ b/src/fs/Makefile.am @@ -43,7 +43,7 @@ libgnunetfs_la_LIBADD = \ $(top_builddir)/src/datastore/libgnunetdatastore.la \ $(top_builddir)/src/util/libgnunetutil.la \ -lextractor \ - $(GN_LIBINTL) $(XLIB) + $(GN_LIBINTL) $(XLIB) -lunistring libgnunetfs_la_LDFLAGS = \ $(GN_LIB_LDFLAGS) $(WINFLAGS) \ diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c index a39eba8..93f9366 100644 --- a/src/fs/fs_uri.c +++ b/src/fs/fs_uri.c @@ -82,6 +82,11 @@ #include "gnunet_fs_service.h" #include "gnunet_signatures.h" #include "fs_api.h" +#include <unicase.h> +#include <unistr.h> +#include <unistdio.h> +#include <uniconv.h> + /** @@ -1493,6 +1498,48 @@ find_duplicate (const char s, const char array, int array_length) return GNUNET_NO; } +static char +normalize_metadata (enum EXTRACTOR_MetaFormat format, const char data, + size_t data_len) +{ + uint8_t free_str = NULL; + uint8_t str_to_normalize = (uint8_t ) data; + uint8_t normalized; + size_t r_len; + if (str_to_normalize == NULL) + return NULL; + / Don't trust libextractor / + if (format == EXTRACTOR_METAFORMAT_UTF8) + { + free_str = (uint8_t ) u8_check ((const uint8_t ) data, data_len); + if (free_str == NULL) + free_str = NULL; + else + format = EXTRACTOR_METAFORMAT_C_STRING; + } + if (format == EXTRACTOR_METAFORMAT_C_STRING) + { + free_str = u8_strconv_from_encoding (data, locale_charset 
(), iconveh_escape_sequence); + if (free_str == NULL) + return NULL; + } + + normalized = u8_tolower (str_to_normalize, strlen ((char ) str_to_normalize), NULL, UNINORM_NFD, NULL, &r_len); + /* free_str is allocated by libunistring internally, use free() / + if (free_str != NULL) + free (free_str); + if (normalized != NULL) + { + / u8_tolower allocates a non-NULL-terminated string! / + free_str = GNUNET_malloc (r_len + 1); + memcpy (free_str, normalized, r_len); + free_str[r_len] = '\0'; + free (normalized); + normalized = free_str; + } + return (char ) normalized; +} + /** * Break the filename up by matching [], () and {} pairs to make @@ -1551,14 +1598,29 @@ get_keywords_from_parens (const char s, char array, int index) { if (NULL != array) { + char normalized; tmp = close_paren[0]; close_paren[0] = '\0'; - if (GNUNET_NO == find_duplicate ((const char ) &open_paren[1], (const char ) array, index + count)) + if (GNUNET_NO == find_duplicate ((const char ) &open_paren[1], + (const char *) array, index + count)) { insert_non_mandatory_keyword ((const char ) &open_paren[1], array, index + count); count++; } + normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8, + &open_paren[1], close_paren - &open_paren[1]); + if (normalized != NULL) + { + if (GNUNET_NO == find_duplicate ((const char ) normalized, + (const char ) array, index + count)) + { + insert_non_mandatory_keyword ((const char ) normalized, array, + index + count); + count++; + } + GNUNET_free (normalized); + } close_paren[0] = tmp; } else @@ -1601,12 +1663,26 @@ get_keywords_from_tokens (const char s, char array, int index) { if (NULL != array) { + char normalized; if (GNUNET_NO == find_duplicate (p, (const char *) array, index + seps)) { insert_non_mandatory_keyword (p, array, index + seps); seps++; } + normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8, + p, strlen (p)); + if (normalized != NULL) + { + if (GNUNET_NO == find_duplicate ((const char ) normalized, + (const char *) array, index + 
seps)) + { + insert_non_mandatory_keyword ((const char ) normalized, array, + index + seps); + seps++; + } + GNUNET_free (normalized); + } } else seps++; @@ -1616,7 +1692,6 @@ get_keywords_from_tokens (const char s, char array, int index) } #undef TOKENS - /* * Function called on each value in the meta data. * Adds it to the URI. @@ -1640,15 +1715,28 @@ gather_uri_data (void cls, const char plugin_name, const char data_mime_type, const char data, size_t data_len) { struct GNUNET_FS_Uri uri = cls; + char normalized_data; if ((format != EXTRACTOR_METAFORMAT_UTF8) && (format != EXTRACTOR_METAFORMAT_C_STRING)) return 0; - if (find_duplicate (data, (const char ) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) - return GNUNET_OK; - insert_non_mandatory_keyword (data, - uri->data.ksk.keywords, uri->data.ksk.keywordCount); - uri->data.ksk.keywordCount++; + normalized_data = normalize_metadata (format, data, data_len); + if (!find_duplicate (data, (const char ) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) + { + insert_non_mandatory_keyword (data, + uri->data.ksk.keywords, uri->data.ksk.keywordCount); + uri->data.ksk.keywordCount++; + } + if (normalized_data != NULL) + { + if (!find_duplicate (normalized_data, (const char *) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) + { + insert_non_mandatory_keyword (normalized_data, + uri->data.ksk.keywords, uri->data.ksk.keywordCount); + uri->data.ksk.keywordCount++; + } + GNUNET_free (normalized_data); + } return 0; } @@ -1690,8 +1778,9 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData tok_keywords = get_keywords_from_tokens (filename, NULL, 0); paren_keywords = get_keywords_from_parens (filename, NULL, 0); } + / x2 because there might be a normalized variant of every keyword / ret->data.ksk.keywords = GNUNET_malloc (sizeof (char ) * (ent - + tok_keywords + paren_keywords)); + + tok_keywords + paren_keywords) * 2); GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret); } 
if (tok_keywords > 0) -- 1.7.4 0010-New-normalization-with-libunistring.patch (6,705 bytes)

Date Modified	Username	Field	Change
2011-12-30 19:47	LRN	New Issue
2011-12-30 19:47	LRN	File Added: 0010-New-normalization-with-libunistring.patch
2011-12-30 19:48	LRN	Summary	libunistring-based normalization => [patch] libunistring-based normalization
2011-12-30 19:48	LRN	Description Updated
2011-12-30 23:31	Christian Grothoff	Note Added: 0005221
2011-12-30 23:31	Christian Grothoff	Status	new => resolved
2011-12-30 23:31	Christian Grothoff	Fixed in Version	=> 0.9.2
2011-12-30 23:31	Christian Grothoff	Resolution	open => fixed
2011-12-30 23:31	Christian Grothoff	Assigned To	=> Christian Grothoff
2011-12-31 00:15	Christian Grothoff	Target Version	=> 0.9.2
2012-02-28 11:04	Christian Grothoff	Status	resolved => closed