[Xfce4-commits] <thunar:master> Use g_utf8_collate_key_for_filename for sorting (bug #7110).

Thu Oct 4 18:32:01 CEST 2012

Updating branch refs/heads/master
         to 1fcb0e71632b9ed21e5f51c022687605fa4b4537 (commit)
       from a877a2a50e64750f3118246f272bf17fb34de2cf (commit)

commit 1fcb0e71632b9ed21e5f51c022687605fa4b4537
Author: Andrzej <ndrwrdck at gmail.com>
Date:   Thu Oct 4 18:27:23 2012 +0200

    Use g_utf8_collate_key_for_filename for sorting (bug #7110).
    
    This should resolve the issue with non-ASCII locales, and it
    respects the user's LC_COLLATE setting.

 thunar/thunar-file.c |  281 ++++++++------------------------------------------
 thunar/thunar-file.h |    2 +-
 2 files changed, 44 insertions(+), 239 deletions(-)

diff --git a/thunar/thunar-file.c b/thunar/thunar-file.c
index dfd2977..68df146 100644
--- a/thunar/thunar-file.c
+++ b/thunar/thunar-file.c
@@ -155,6 +155,8 @@ struct _ThunarFile
   GFile         *gfile;
   gchar         *custom_icon_name;
   gchar         *display_name;
+  gchar         *collate_key;
+  gchar         *collate_key_nocase;
   gchar         *basename;
   gchar         *thumbnail_path;
   guint          flags;
@@ -331,6 +333,11 @@ thunar_file_finalize (GObject *object)
   g_free (file->display_name);
   g_free (file->basename);
 
+  /* free collate keys */
+  if (file->collate_key_nocase != file->collate_key)
+    g_free (file->collate_key_nocase);
+  g_free (file->collate_key);
+
   /* free the thumbnail path */
   g_free (file->thumbnail_path);
 
@@ -652,10 +659,6 @@ thunar_file_get (GFile   *gfile,
       /* allocate a new object */
       file = g_object_new (THUNAR_TYPE_FILE, NULL);
       file->gfile = g_object_ref (gfile);
-      file->info = NULL;
-      file->custom_icon_name = NULL;
-      file->display_name = NULL;
-      file->basename = NULL;
 
       if (thunar_file_load (file, NULL, error))
         {
@@ -740,6 +743,7 @@ thunar_file_load (ThunarFile   *file,
   gchar       *thumbnail_dir_path;
   const gchar *display_name;
   gboolean     is_secure = FALSE;
+  gchar       *casefold;
 
   _thunar_return_val_if_fail (THUNAR_IS_FILE (file), FALSE);
   _thunar_return_val_if_fail (error == NULL || *error == NULL, FALSE);
@@ -763,6 +767,14 @@ thunar_file_load (ThunarFile   *file,
   g_free (file->basename);
   file->basename = NULL;
 
+  /* free collate keys */
+  if (file->collate_key_nocase != file->collate_key)
+    g_free (file->collate_key_nocase);
+  file->collate_key_nocase = NULL;
+
+  g_free (file->collate_key);
+  file->collate_key = NULL;
+
   /* free thumbnail path */
   g_free (file->thumbnail_path);
   file->thumbnail_path = NULL;
@@ -921,6 +933,21 @@ thunar_file_load (ThunarFile   *file,
         }
     }
 
+  /* create case sensitive collation key */
+  file->collate_key = g_utf8_collate_key_for_filename (file->display_name, -1);
+
+  /* lowercase the display name */
+  casefold = g_utf8_casefold (file->display_name, -1);
+
+  /* if casefolding leaves the name unchanged, reuse the case sensitive key */
+  if (casefold != NULL && strcmp (casefold, file->display_name) != 0)
+    file->collate_key_nocase = g_utf8_collate_key_for_filename (casefold, -1);
+  else
+    file->collate_key_nocase = file->collate_key;
+
+  /* cleanup */
+  g_free (casefold);
+
   /* set thumb state to unknown */
   file->flags = 
     (file->flags & ~THUNAR_FILE_THUMB_STATE_MASK) | THUNAR_FILE_THUMB_STATE_UNKNOWN;
@@ -3250,100 +3277,6 @@ thunar_file_destroy (ThunarFile *file)
 
 
 
-static guint
-skip_leading_zeros (const gchar **ap,
-                    const gchar  *name)
-{
-  const gchar *bp;
-  guint        skipped_zeros = 0;
-
-  /* do a backward search to check if the number starts with a '0' */
-  for (bp = *ap; bp >= name; --bp)
-    {
-      if (*bp != '0')
-        break;
-    }
-
-  /* if the number starts with a '0' skip all following '0' */
-  if (!g_ascii_isdigit (*bp) || *bp == '0')
-   {
-     for (bp = *ap; *bp != '\0'; ++bp)
-       {
-         if (*bp != '0')
-           break;
-       }
-
-     skipped_zeros = bp - *ap;
-     *ap = bp;
-     return skipped_zeros;
-   }
-
-  return 0;
-}
-
-
-
-static gint
-compare_by_name_using_number (const gchar *ap,
-                              const gchar *bp,
-                              const gchar *start_a,
-                              const gchar *start_b)
-{
-  const gchar *ai;
-  const gchar *bi;
-  gchar        ac;
-  gchar        bc;
-  guint        skipped_zeros_a;
-  guint        skipped_zeros_b;
-
-  /* up until now the numbers match. Now compare the numbers by digit
-   * count, the longest number is the largest. If the lengths are equal
-   * compare the digits. */
-
-  /* skip leading zeros of both numbers */
-  skipped_zeros_a = skip_leading_zeros (&ap, start_a);
-  skipped_zeros_b = skip_leading_zeros (&bp, start_b);
-
-  /* determine the largest number */
-  for (ai = ap, bi = bp;; ++ai, ++bi)
-    {
-      ac = *ai;
-      bc = *bi;
-      if (!g_ascii_isdigit (ac) || !g_ascii_isdigit (bc))
-        break;
-    }
-
-  /* if one of the numbers still has a digit, that number is the largest. */
-  if (g_ascii_isdigit (ac))
-    return 1;
-  else if (g_ascii_isdigit (bc))
-    return -1;
-
-  /* both numbers have the same length. look for the first digit that
-   * is different */
-  for (;; ++ap, ++bp)
-    {
-      ac = *ap;
-      bc = *bp;
-
-      /* check if the characters differ or we have a non-digit char */
-      if (ac != bc || !g_ascii_isdigit (ac))
-        break;
-    }
-
-  /* if we have reached the end of the numbers and they are still equal,
-   * then they differ only in the number of leading zeros. let us always
-   * sort the one with more leading zeros first. */
-  if (G_UNLIKELY (!g_ascii_isdigit (ac) || !g_ascii_isdigit (bc)))
-    return skipped_zeros_b - skipped_zeros_a;
-      
-  /* for all regular numbers that have the same length, the one with the
-   * lowest different digit should be sorted first */
-  return (ac - bc);
-}
-
-
-
 /**
  * thunar_file_compare_by_name:
  * @file_a         : the first #ThunarFile.
@@ -3361,12 +3294,7 @@ thunar_file_compare_by_name (const ThunarFile *file_a,
                              const ThunarFile *file_b,
                              gboolean          case_sensitive)
 {
-  const gchar *ap;
-  const gchar *bp;
-  const gchar *filename_a;
-  const gchar *filename_b;
-  guchar       ac;
-  guchar       bc;
+  gint result = 0;
 
 #ifdef G_ENABLE_DEBUG
   /* probably too expensive to do the instance check every time
@@ -3376,143 +3304,20 @@ thunar_file_compare_by_name (const ThunarFile *file_a,
   _thunar_return_val_if_fail (THUNAR_IS_FILE (file_b), 0);
 #endif
 
-  /* we compare only the display names (UTF-8!) */
-  filename_a = thunar_file_get_display_name (file_a);
-  filename_b = thunar_file_get_display_name (file_b);
-
-  /* start at the beginning of both strings */
-  ap = filename_a;
-  bp = filename_b;
-
-  /* check if we should ignore case */
-  if (G_LIKELY (case_sensitive))
-    {
-      /* try simple (fast) ASCII comparison first */
-      for (;; ++ap, ++bp)
-        {
-          /* check if the characters differ or we have a non-ASCII char */
-          ac = *((const guchar *)ap);
-          bc = *((const guchar *)bp);
-          if (ac != bc || ac == 0 || ac > 127)
-            break;
-        }
-
-      /* fallback to Unicode comparison */
-      if (G_UNLIKELY (ac > 127 || bc > 127))
-        {
-          for (;; ap = g_utf8_next_char (ap), bp = g_utf8_next_char (bp))
-            {
-              /* check if characters differ or end of string */
-              ac = g_utf8_get_char (ap);
-              bc = g_utf8_get_char (bp);
-              if (ac != bc || ac == 0)
-                break;
-            }
-        }
-    }
-  else
-    {
-      /* try simple (fast) ASCII comparison first (case-insensitive!) */
-      for (;; ++ap, ++bp)
-        {
-          /* check if the characters differ or we have a non-ASCII char */
-          ac = *((const guchar *)ap);
-          bc = *((const guchar *)bp);
-          if (g_ascii_tolower (ac) != g_ascii_tolower (bc) || ac == 0 || ac > 127)
-            break;
-        }
+  /* case insensitive checking */
+  if (G_LIKELY (!case_sensitive))
+    result = strcmp (file_a->collate_key_nocase, file_b->collate_key_nocase);
 
-      /* fallback to Unicode comparison (case-insensitive!) */
-      if (G_UNLIKELY (ac > 127 || bc > 127))
-        {
-          for (;; ap = g_utf8_next_char (ap), bp = g_utf8_next_char (bp))
-            {
-              /* check if characters differ or end of string */
-              ac = g_utf8_get_char (ap);
-              bc = g_utf8_get_char (bp);
-              if (g_unichar_tolower (ac) != g_unichar_tolower (bc) || ac == 0)
-                break;
-            }
-        }
-    }
+  /* fall-back to case sensitive */
+  if (result == 0)
+    result = strcmp (file_a->collate_key, file_b->collate_key);
 
-  /* if both strings are equal, we're done */
-  if (G_UNLIKELY (ac == bc
-                  || (!case_sensitive
-                      && g_unichar_tolower (ac) == g_unichar_tolower (bc))))
-    {
-      return 0;
-    }
-
-  /* check if one of the characters that differ is a digit */
-  if (G_UNLIKELY (g_ascii_isdigit (ac) || g_ascii_isdigit (bc)))
-    {
-      /* if both strings differ in a digit, we use a smarter comparison
-       * to get sorting 'file1', 'file5', 'file10' done the right way.
-       */
-      if (g_ascii_isdigit (ac) && g_ascii_isdigit (bc))
-        {
-          return compare_by_name_using_number (ap, bp, filename_a, filename_b);
-        }
-
-      /* a second case is '20 file' and '2file', where comparison by number
-       * makes sense if the previous char for both strings is a digit.
-       */
-      if (ap > filename_a
-          && bp > filename_b
-          && g_ascii_isdigit (*(ap - 1))
-          && g_ascii_isdigit (*(bp - 1)))
-        {
-          /* go back one character to have both variables point to the numbers again */
-          ap -= 1;
-          bp -= 1;
-
-          return compare_by_name_using_number (ap, bp, filename_a, filename_b);
-        }
-    }
-
-  /* otherwise, if they differ in a unicode char, use the
-   * appropriate collate function for the current locale (only
-   * if charset is UTF-8, else the required transformations
-   * would be too expensive)
-   */
-#ifdef HAVE_STRCOLL
-  if ((ac > 127 || bc > 127) && g_get_charset (NULL))
-    {
-      /* case-sensitive is easy, case-insensitive is expensive,
-       * but we use a simple optimization to make it fast.
-       */
-      if (G_LIKELY (case_sensitive))
-        {
-          return strcoll (ap, bp);
-        }
-      else
-        {
-          /* we use a trick here, so we don't need to allocate
-           * and transform the two strings completely first (8
-           * byte for each buffer, so all compilers should align
-           * them properly)
-           */
-          gchar abuf[8];
-          gchar bbuf[8];
-
-          /* transform the unicode chars to strings and
-           * make sure the strings are nul-terminated.
-           */
-          abuf[g_unichar_to_utf8 (g_unichar_tolower(ac), abuf)] = '\0';
-          bbuf[g_unichar_to_utf8 (g_unichar_tolower(bc), bbuf)] = '\0';
-
-          /* compare the unicode chars (as strings) */
-          return strcoll (abuf, bbuf);
-        }
-    }
-#endif
+#ifdef G_ENABLE_DEBUG
+  /* check final output */
+  _thunar_return_val_if_fail (result != 0, 0);
+ #endif
 
-  /* else, they differ in an ASCII character */
-  if (G_UNLIKELY (!case_sensitive))
-    return (g_unichar_tolower (ac) > g_unichar_tolower (bc)) ? 1 : -1;
-  else
-    return (ac > bc) ? 1 : -1;
+  return result;
 }
 
 
diff --git a/thunar/thunar-file.h b/thunar/thunar-file.h
index 9bfce8f..52c7407 100644
--- a/thunar/thunar-file.h
+++ b/thunar/thunar-file.h
@@ -218,7 +218,7 @@ void              thunar_file_destroy              (ThunarFile             *file
 
 gint              thunar_file_compare_by_name      (const ThunarFile       *file_a,
                                                     const ThunarFile       *file_b,
-                                                    gboolean                case_sensitive);
+                                                    gboolean                case_sensitive) G_GNUC_PURE;
 
 ThunarFile       *thunar_file_cache_lookup         (const GFile            *file);
 gchar            *thunar_file_cached_display_name  (const GFile            *file);