View Issue Details

ID | Project | Category | View Status | Last Update
0002052 | GNUnet | file-sharing service | public | 2012-02-28 11:04
Reporter: LRN | Assigned To: Christian Grothoff
Priority: low | Severity: feature | Reproducibility: N/A
Status: closed | Resolution: fixed
Product Version: Git master
Target Version: 0.9.2 | Fixed in Version: 0.9.2
Summary: 0002052: [patch] libunistring-based normalization
Description: Normalization is back, and it is now done by libunistring \o/
Normalization now means lowercase conversion.
Planned improvements:
1) Use libunistring's character classification system to get rid of ALL punctuation characters (including Unicode ones) in all autogenerated keywords. Right now only filename-generated keywords are stripped of punctuation characters, and only of the most basic ones; the stripping doesn't work with Unicode.
2) As mentioned in (1), parentheses-based and delimiter-based keyword splitting is not Unicode-aware, but it can hopefully be made so with libunistring.

Patch 0010-New-normalization-with-libunistring.patch is attached.
Tags: No tags attached.
Attached Files
0010-New-normalization-with-libunistring.patch (6,705 bytes)   
From 951a3316ecdc42f1fb6bbe9a02531a71d6423707 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A0=D1=83=D1=81=D0=BB=D0=B0=D0=BD=20=D0=98=D0=B6=D0=B1=D1?=
 =?UTF-8?q?=83=D0=BB=D0=B0=D1=82=D0=BE=D0=B2?= <lrn1986@gmail.com>
Date: Thu, 29 Dec 2011 22:00:10 +0400
Subject: [PATCH 10/10] New normalization with libunistring

---
 src/fs/Makefile.am |    2 +-
 src/fs/fs_uri.c    |  105 ++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 98 insertions(+), 9 deletions(-)

diff --git a/src/fs/Makefile.am b/src/fs/Makefile.am
index bf355e9..f480f28 100644
--- a/src/fs/Makefile.am
+++ b/src/fs/Makefile.am
@@ -43,7 +43,7 @@ libgnunetfs_la_LIBADD = \
   $(top_builddir)/src/datastore/libgnunetdatastore.la \
   $(top_builddir)/src/util/libgnunetutil.la \
   -lextractor \
-  $(GN_LIBINTL) $(XLIB)
+  $(GN_LIBINTL) $(XLIB) -lunistring
 
 libgnunetfs_la_LDFLAGS = \
   $(GN_LIB_LDFLAGS)  $(WINFLAGS) \
diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c
index a39eba8..93f9366 100644
--- a/src/fs/fs_uri.c
+++ b/src/fs/fs_uri.c
@@ -82,6 +82,11 @@
 #include "gnunet_fs_service.h"
 #include "gnunet_signatures.h"
 #include "fs_api.h"
+#include <unicase.h>
+#include <unistr.h>
+#include <unistdio.h>
+#include <uniconv.h>
+
 
 
 /**
@@ -1493,6 +1498,48 @@ find_duplicate (const char *s, const char **array, int array_length)
   return GNUNET_NO;
 }
 
+static char *
+normalize_metadata (enum EXTRACTOR_MetaFormat format, const char *data,
+    size_t data_len)
+{
+  uint8_t *free_str = NULL;
+  uint8_t *str_to_normalize = (uint8_t *) data;
+  uint8_t *normalized;
+  size_t r_len;
+  if (str_to_normalize == NULL)
+    return NULL;
+  /* Don't trust libextractor */
+  if (format == EXTRACTOR_METAFORMAT_UTF8)
+  {
+    free_str = (uint8_t *) u8_check ((const uint8_t *) data, data_len);
+    if (free_str == NULL)
+      free_str = NULL;
+    else
+      format = EXTRACTOR_METAFORMAT_C_STRING;
+  }
+  if (format == EXTRACTOR_METAFORMAT_C_STRING)
+  {
+    free_str = u8_strconv_from_encoding (data, locale_charset (), iconveh_escape_sequence);
+    if (free_str == NULL)
+      return NULL;
+  }
+
+  normalized = u8_tolower (str_to_normalize, strlen ((char *) str_to_normalize), NULL, UNINORM_NFD, NULL, &r_len);
+  /* free_str is allocated by libunistring internally, use free() */
+  if (free_str != NULL)
+    free (free_str);
+  if (normalized != NULL)
+  {
+    /* u8_tolower allocates a non-NULL-terminated string! */
+    free_str = GNUNET_malloc (r_len + 1);
+    memcpy (free_str, normalized, r_len);
+    free_str[r_len] = '\0';
+    free (normalized);
+    normalized = free_str;
+  }
+  return (char *) normalized;
+}
+
 
 /**
  * Break the filename up by matching [], () and {} pairs to make
@@ -1551,14 +1598,29 @@ get_keywords_from_parens (const char *s, char **array, int index)
     {
       if (NULL != array)
       {
+        char *normalized;
         tmp = close_paren[0];
         close_paren[0] = '\0';
-        if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], (const char **) array, index + count))
+        if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1],
+            (const char **) array, index + count))
         {
 	  insert_non_mandatory_keyword ((const char *) &open_paren[1], array,
 					index + count);
           count++;
         }
+        normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8,
+            &open_paren[1], close_paren - &open_paren[1]);
+        if (normalized != NULL)
+        {
+          if (GNUNET_NO == find_duplicate ((const char *) normalized,
+              (const char **) array, index + count))
+          {
+	    insert_non_mandatory_keyword ((const char *) normalized, array,
+					  index + count);
+            count++;
+          }
+          GNUNET_free (normalized);
+        }
         close_paren[0] = tmp;
       }
       else
@@ -1601,12 +1663,26 @@ get_keywords_from_tokens (const char *s, char **array, int index)
   {
     if (NULL != array)
     {
+      char *normalized;
       if (GNUNET_NO == find_duplicate (p, (const char **) array, index + seps))
       {
         insert_non_mandatory_keyword (p, array,
 				      index + seps);
 	seps++;
       }
+      normalized = normalize_metadata (EXTRACTOR_METAFORMAT_UTF8,
+          p, strlen (p));
+      if (normalized != NULL)
+      {
+        if (GNUNET_NO == find_duplicate ((const char *) normalized,
+            (const char **) array, index + seps))
+        {
+          insert_non_mandatory_keyword ((const char *) normalized, array,
+				  index + seps);
+          seps++;
+        }
+        GNUNET_free (normalized);
+      }
     }
     else
       seps++;
@@ -1616,7 +1692,6 @@ get_keywords_from_tokens (const char *s, char **array, int index)
 }
 #undef TOKENS
 
-
 /**
  * Function called on each value in the meta data.
  * Adds it to the URI.
@@ -1640,15 +1715,28 @@ gather_uri_data (void *cls, const char *plugin_name,
                  const char *data_mime_type, const char *data, size_t data_len)
 {
   struct GNUNET_FS_Uri *uri = cls;
+  char *normalized_data;
 
   if ((format != EXTRACTOR_METAFORMAT_UTF8) &&
       (format != EXTRACTOR_METAFORMAT_C_STRING))
     return 0;
-  if (find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
-    return GNUNET_OK;
-  insert_non_mandatory_keyword (data,
-				uri->data.ksk.keywords, uri->data.ksk.keywordCount);
-  uri->data.ksk.keywordCount++;
+  normalized_data = normalize_metadata (format, data, data_len);
+  if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
+  {
+    insert_non_mandatory_keyword (data,
+				  uri->data.ksk.keywords, uri->data.ksk.keywordCount);
+    uri->data.ksk.keywordCount++;
+  }
+  if (normalized_data != NULL)
+  {
+    if (!find_duplicate (normalized_data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
+    {
+      insert_non_mandatory_keyword (normalized_data,
+				    uri->data.ksk.keywords, uri->data.ksk.keywordCount);
+      uri->data.ksk.keywordCount++;
+    }
+    GNUNET_free (normalized_data);
+  }
   return 0;
 }
 
@@ -1690,8 +1778,9 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData
       tok_keywords = get_keywords_from_tokens (filename, NULL, 0);
       paren_keywords = get_keywords_from_parens (filename, NULL, 0);
     }
+    /* x2 because there might be a normalized variant of every keyword */
     ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * (ent
-        + tok_keywords + paren_keywords));
+        + tok_keywords + paren_keywords) * 2);
     GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret);
   }
   if (tok_keywords > 0)
-- 
1.7.4

Activities

Christian Grothoff

2011-12-30 23:31

manager   ~0005221

Fixed in SVN 18854.

Issue History

Date Modified Username Field Change
2011-12-30 19:47 LRN New Issue
2011-12-30 19:47 LRN File Added: 0010-New-normalization-with-libunistring.patch
2011-12-30 19:48 LRN Summary libunistring-based normalization => [patch] libunistring-based normalization
2011-12-30 19:48 LRN Description Updated
2011-12-30 23:31 Christian Grothoff Note Added: 0005221
2011-12-30 23:31 Christian Grothoff Status new => resolved
2011-12-30 23:31 Christian Grothoff Fixed in Version => 0.9.2
2011-12-30 23:31 Christian Grothoff Resolution open => fixed
2011-12-30 23:31 Christian Grothoff Assigned To => Christian Grothoff
2011-12-31 00:15 Christian Grothoff Target Version => 0.9.2
2012-02-28 11:04 Christian Grothoff Status resolved => closed