View Issue Details
| ID | Project | Category | View Status | Date Submitted | Last Update |
|---|---|---|---|---|---|
| 0002052 | GNUnet | file-sharing service | public | 2011-12-30 19:47 | 2012-02-28 11:04 |
| Reporter | LRN | Assigned To | Christian Grothoff | ||
| Priority | low | Severity | feature | Reproducibility | N/A |
| Status | closed | Resolution | fixed | ||
| Product Version | Git master | ||||
| Target Version | 0.9.2 | Fixed in Version | 0.9.2 | ||
| Summary | 0002052: [patch] libunistring-based normalization | ||||
| Description | Normalization is back, and it is now done by libunistring \o/ Normalization now means lowercase conversion. Planned improvements: 1) Use the libunistring character classification system to get rid of ALL punctuation characters (including Unicode ones) in all autogenerated keywords. (Right now only filename-generated keywords are stripped of punctuation characters, and only of the most basic ones; this does not work with Unicode.) 2) As mentioned in (1), parentheses-based and delimiter-based keyword splitting is not Unicode-aware, but it can (hopefully) be made so with libunistring. The patch 0010-New-normalization-with-libunistring.patch is attached. | ||||
| Tags | No tags attached. | ||||
| Attached Files | 0010-New-normalization-with-libunistring.patch (6,705 bytes)
From 951a3316ecdc42f1fb6bbe9a02531a71d6423707 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A0=D1=83=D1=81=D0=BB=D0=B0=D0=BD=20=D0=98=D0=B6=D0=B1=D1?=
=?UTF-8?q?=83=D0=BB=D0=B0=D1=82=D0=BE=D0=B2?= <lrn1986@gmail.com>
Date: Thu, 29 Dec 2011 22:00:10 +0400
Subject: [PATCH 10/10] New normalization with libunistring
---
src/fs/Makefile.am | 2 +-
src/fs/fs_uri.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 98 insertions(+), 9 deletions(-)
diff --git a/src/fs/Makefile.am b/src/fs/Makefile.am
index bf355e9..f480f28 100644
--- a/src/fs/Makefile.am
+++ b/src/fs/Makefile.am
@@ -43,7 +43,7 @@ libgnunetfs_la_LIBADD = \
$(top_builddir)/src/datastore/libgnunetdatastore.la \
$(top_builddir)/src/util/libgnunetutil.la \
-lextractor \
- $(GN_LIBINTL) $(XLIB)
+ $(GN_LIBINTL) $(XLIB) -lunistring
libgnunetfs_la_LDFLAGS = \
$(GN_LIB_LDFLAGS) $(WINFLAGS) \
diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c
index a39eba8..93f9366 100644
--- a/src/fs/fs_uri.c
+++ b/src/fs/fs_uri.c
@@ -82,6 +82,11 @@
#include "gnunet_fs_service.h"
#include "gnunet_signatures.h"
#include "fs_api.h"
+#include <unicase.h>
+#include <unistr.h>
+#include <unistdio.h>
+#include <uniconv.h>
+
/**
@@ -1493,6 +1498,48 @@ find_duplicate (const char *s, const char **array, int array_length)
return GNUNET_NO;
}
+static char *
+normalize_metadata (enum EXTRACTOR_MetaFormat format, const char *data,
+ size_t data_len)
+{
+ uint8_t *free_str = NULL;
+ uint8_t *str_to_normalize = (uint8_t *) data;
+ uint8_t *normalized;
+ size_t r_len;
+ if (str_to_normalize == NULL)
+ return NULL;
+ /* Don't trust libextractor */
+ if (format == EXTRACTOR_METAFORMAT_UTF8)
+ {
+ free_str = (uint8_t *) u8_check ((const uint8_t *) data, data_len);
+ if (free_str == NULL)
+ free_str = NULL;
+ else
+ format = EXTRACTOR_METAFORMAT_C_STRING;
+ }
+ if (format == EXTRACTOR_METAFORMAT_C_STRING)
+ {
+ free_str = u8_strconv_from_encoding (data, locale_charset (), iconveh_escape_sequence);
+ if (free_str == NULL)
+ return NULL;
+ }
+
+ normalized = u8_tolower (str_to_normalize, strlen ((char *) str_to_normalize), NULL, UNINORM_NFD, NULL, &r_len);
+ /* free_str is allocated by libunistring internally, use free() */
+ if (free_str != NULL)
+ free (free_str);
+ if (normalized != NULL)
+ {
+ /* u8_tolower allocates a non-NULL-terminated string! */
+ free_str = GNUNET_malloc (r_len + 1);
+ memcpy (free_str, normalized, r_len);
+ free_str[r_len] = '\0';
+ free (normalized);
+ normalized = free_str;
+ }
+ return (char *) normalized;
+}
+
/**
* Break the filename up by matching [], () and {} pairs to make
@@ -1551,14 +1598,29 @@ get_keywords_from_parens (const char *s, char **array, int index)
{
if (NULL != array)
{
+ char *normalized;
tmp = close_paren[0];
close_paren[0] = '\0';
- if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], (const char **) array, index + count))
+ if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1],
+ (const char **) array, index + count))
{
insert_non_mandatory_keyword ((const char *) &open_paren[1], array,
index + count);
count++;
}
+ normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8,
+ &open_paren[1], close_paren - &open_paren[1]);
+ if (normalized != NULL)
+ {
+ if (GNUNET_NO == find_duplicate ((const char *) normalized,
+ (const char **) array, index + count))
+ {
+ insert_non_mandatory_keyword ((const char *) normalized, array,
+ index + count);
+ count++;
+ }
+ GNUNET_free (normalized);
+ }
close_paren[0] = tmp;
}
else
@@ -1601,12 +1663,26 @@ get_keywords_from_tokens (const char *s, char **array, int index)
{
if (NULL != array)
{
+ char *normalized;
if (GNUNET_NO == find_duplicate (p, (const char **) array, index + seps))
{
insert_non_mandatory_keyword (p, array,
index + seps);
seps++;
}
+ normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8,
+ p, strlen (p));
+ if (normalized != NULL)
+ {
+ if (GNUNET_NO == find_duplicate ((const char *) normalized,
+ (const char **) array, index + seps))
+ {
+ insert_non_mandatory_keyword ((const char *) normalized, array,
+ index + seps);
+ seps++;
+ }
+ GNUNET_free (normalized);
+ }
}
else
seps++;
@@ -1616,7 +1692,6 @@ get_keywords_from_tokens (const char *s, char **array, int index)
}
#undef TOKENS
-
/**
* Function called on each value in the meta data.
* Adds it to the URI.
@@ -1640,15 +1715,28 @@ gather_uri_data (void *cls, const char *plugin_name,
const char *data_mime_type, const char *data, size_t data_len)
{
struct GNUNET_FS_Uri *uri = cls;
+ char *normalized_data;
if ((format != EXTRACTOR_METAFORMAT_UTF8) &&
(format != EXTRACTOR_METAFORMAT_C_STRING))
return 0;
- if (find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
- return GNUNET_OK;
- insert_non_mandatory_keyword (data,
- uri->data.ksk.keywords, uri->data.ksk.keywordCount);
- uri->data.ksk.keywordCount++;
+ normalized_data = normalize_metadata (format, data, data_len);
+ if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
+ {
+ insert_non_mandatory_keyword (data,
+ uri->data.ksk.keywords, uri->data.ksk.keywordCount);
+ uri->data.ksk.keywordCount++;
+ }
+ if (normalized_data != NULL)
+ {
+ if (!find_duplicate (normalized_data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
+ {
+ insert_non_mandatory_keyword (normalized_data,
+ uri->data.ksk.keywords, uri->data.ksk.keywordCount);
+ uri->data.ksk.keywordCount++;
+ }
+ GNUNET_free (normalized_data);
+ }
return 0;
}
@@ -1690,8 +1778,9 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData
tok_keywords = get_keywords_from_tokens (filename, NULL, 0);
paren_keywords = get_keywords_from_parens (filename, NULL, 0);
}
+ /* x2 because there might be a normalized variant of every keyword */
ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * (ent
- + tok_keywords + paren_keywords));
+ + tok_keywords + paren_keywords) * 2);
GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret);
}
if (tok_keywords > 0)
--
1.7.4
| ||||
| Date Modified | Username | Field | Change |
|---|---|---|---|
| 2011-12-30 19:47 | LRN | New Issue | |
| 2011-12-30 19:47 | LRN | File Added: 0010-New-normalization-with-libunistring.patch | |
| 2011-12-30 19:48 | LRN | Summary | libunistring-based normalization => [patch] libunistring-based normalization |
| 2011-12-30 19:48 | LRN | Description Updated | |
| 2011-12-30 23:31 | Christian Grothoff | Note Added: 0005221 | |
| 2011-12-30 23:31 | Christian Grothoff | Status | new => resolved |
| 2011-12-30 23:31 | Christian Grothoff | Fixed in Version | => 0.9.2 |
| 2011-12-30 23:31 | Christian Grothoff | Resolution | open => fixed |
| 2011-12-30 23:31 | Christian Grothoff | Assigned To | => Christian Grothoff |
| 2011-12-31 00:15 | Christian Grothoff | Target Version | => 0.9.2 |
| 2012-02-28 11:04 | Christian Grothoff | Status | resolved => closed |