This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.
Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.
| Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
|---|---|---|
| Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
| Other format: | [Raw text] | |
Hi!
The following patch speeds up UTF-8 handling in regex
(and perhaps other multibyte charsets, provided the start of a multibyte
character can be determined, though I haven't implemented the hooks for them yet).
Without this patch the extended tst-regex test can run for several hours;
with it, the test finishes within seconds.
From real-world tests, e.g.:
time LC_ALL=en_US.UTF-8 LD_LIBRARY_PATH=/usr/src/libc.old/obj /bin/sed 's/./x/g' /etc/termcap > /dev/null
real 0m8.884s
user 0m8.880s
sys 0m0.010s
time LC_ALL=en_US.UTF-8 LD_LIBRARY_PATH=/usr/src/libc/obj /bin/sed 's/./x/g' /etc/termcap > /dev/null
real 0m3.121s
user 0m3.100s
sys 0m0.020s
(where the only difference between those 2 libcs is this patch).
2003-11-11 Jakub Jelinek <jakub@redhat.com>
* iconv/gconv.h (__gconv_prevmb_fct): New typedef.
(struct __gconv_step): New field __prevmb_fct.
* iconv/gconv_int.h (__gconv_prevmb_ascii): New declaration.
* iconv/gconv_simple.c (BUILTIN_TRANSFORMATION): Add PrevMbFct
argument.
(__gconv_prevmb_ascii): New function.
* iconv/gconv_builtin.h: Add PrevMbFct argument to all
BUILTIN_TRANSFORMATION invocations.
* iconv/gconv_conf.c (BUILTIN_TRANSFORMATION): Add PrevMbFct
argument.
* iconv/iconvconfig.c (BUILTIN_TRANSFORMATION): Likewise.
* iconv/gconv_builtin.c (map): New field prevmb_fct.
(BUILTIN_TRANSFORMATION): Add PrevMbFct argument. Use it to
initialize prevmb_fct field.
(__gconv_get_builtin_trans): Initialize __prevmb_fct field.
* iconv/gconv_cache.c (find_module): Initialize __prevmb_fct field.
* iconv/gconv_db.c (gen_steps, increment_counter): Likewise.
* iconv/skeleton.c: Document FROM_PREVMB.
(gconv_init): Initialize __prevmb_fct field.
Undefine FROM_PREVMB at the end.
* iconv/loop.c: Document PREVMB_BODY.
(gconv_prevmb, FROM_PREVMB): Define if PREVMB_BODY is defined.
Undefine PREVMB_BODY at the end.
* posix/regex_internal.c [_LIBC]: Include wcsmbs/wcsmbsload.h
and dlfcn.h.
(re_string_reconstruct) [_LIBC]: Use __prevmb_fct if available.
* posix/tst-regex.c (umemlen): New variable.
(test_expr): Add expectedicase argument. Test case insensitive
searches as well as backwards searches (case sensitive and
insensitive) too.
(run_test): Add icase argument. Use it to compute regcomp flags.
(run_test_backwards): New function.
(main): Cast read to size_t to avoid warning. Set umemlen.
Add expectedicase arguments to test_expr.
--- libc/iconv/gconv_conf.c.jj 2003-09-14 20:13:39.000000000 +0200
+++ libc/iconv/gconv_conf.c 2003-11-11 11:44:23.000000000 +0100
@@ -62,7 +62,7 @@ static const char gconv_module_ext[] = M
static struct gconv_module builtin_modules[] =
{
#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
- MinF, MaxF, MinT, MaxT) \
+ PrevMbFct, MinF, MaxF, MinT, MaxT) \
{ \
.from_string = From, \
.to_string = To, \
@@ -81,7 +81,7 @@ static struct gconv_module builtin_modul
static const char *builtin_aliases[] =
{
#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
- MinF, MaxF, MinT, MaxT)
+ PrevMbFct, MinF, MaxF, MinT, MaxT)
#define BUILTIN_ALIAS(From, To) From " " To,
#include "gconv_builtin.h"
--- libc/iconv/gconv.h.jj 2002-12-02 22:44:26.000000000 +0100
+++ libc/iconv/gconv.h 2003-11-11 12:05:21.000000000 +0100
@@ -74,6 +74,13 @@ typedef int (*__gconv_fct) (struct __gco
/* Type of a specialized conversion function for a single byte to INTERNAL. */
typedef wint_t (*__gconv_btowc_fct) (struct __gconv_step *, unsigned char);
+/* Type of a specialized function to return starting byte of a multi-byte
+ character. Searching starts from ptr-1 backwards. If no starting byte
+ of a multi-byte character is found even at the byte pointed by first,
+ the function returns NULL. */
+typedef __const unsigned char *(*__gconv_prevmb_fct) (__const unsigned char *,
+ __const unsigned char *);
+
/* Constructor and destructor for local data for conversion step. */
typedef int (*__gconv_init_fct) (struct __gconv_step *);
typedef void (*__gconv_end_fct) (struct __gconv_step *);
@@ -124,6 +131,7 @@ struct __gconv_step
__gconv_fct __fct;
__gconv_btowc_fct __btowc_fct;
+ __gconv_prevmb_fct __prevmb_fct;
__gconv_init_fct __init_fct;
__gconv_end_fct __end_fct;
--- libc/iconv/gconv_builtin.c.jj 2002-12-02 22:48:08.000000000 +0100
+++ libc/iconv/gconv_builtin.c 2003-11-11 11:43:00.000000000 +0100
@@ -31,6 +31,7 @@ static struct builtin_map
{
const char *name;
__gconv_fct fct;
+ __gconv_prevmb_fct prevmb_fct;
__gconv_btowc_fct btowc_fct;
int min_needed_from;
@@ -41,11 +42,12 @@ static struct builtin_map
} map[] =
{
#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
- MinF, MaxF, MinT, MaxT) \
+ PrevMbFct, MinF, MaxF, MinT, MaxT) \
{ \
.name = Name, \
.fct = Fct, \
.btowc_fct = BtowcFct, \
+ .prevmb_fct = PrevMbFct, \
\
.min_needed_from = MinF, \
.max_needed_from = MaxF, \
@@ -72,6 +74,7 @@ __gconv_get_builtin_trans (const char *n
step->__fct = map[cnt].fct;
step->__btowc_fct = map[cnt].btowc_fct;
+ step->__prevmb_fct = map[cnt].prevmb_fct;
step->__init_fct = NULL;
step->__end_fct = NULL;
step->__shlib_handle = NULL;
--- libc/iconv/gconv_int.h.jj 2003-06-11 23:33:21.000000000 +0200
+++ libc/iconv/gconv_int.h 2003-11-11 12:14:58.000000000 +0100
@@ -297,6 +297,14 @@ __BUILTIN_TRANSFORM (__gconv_transform_u
only ASCII characters. */
extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
+/* Specialized function to return starting byte of a multi-byte
+ character for encodings where only ASCII characters start multi-byte
+ sequences. Searching starts from ptr-1 backwards. If no starting byte
+ of a multi-byte character is found even at the byte pointed by first,
+ the function returns NULL. */
+extern const unsigned char *__gconv_prevmb_ascii (const unsigned char *ptr,
+ const unsigned char *first);
+
#endif
__END_DECLS
--- libc/iconv/gconv_simple.c.jj 2003-06-11 23:36:37.000000000 +0200
+++ libc/iconv/gconv_simple.c 2003-11-11 12:14:37.000000000 +0100
@@ -32,7 +32,7 @@
#define BUILTIN_ALIAS(s1, s2) /* nothing */
#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
- MinF, MaxF, MinT, MaxT) \
+ PrevMbFct, MinF, MaxF, MinT, MaxT) \
extern int Fct (struct __gconv_step *, struct __gconv_step_data *, \
__const unsigned char **, __const unsigned char *, \
unsigned char **, size_t *, int, int);
@@ -56,6 +56,22 @@ __gconv_btwoc_ascii (struct __gconv_step
}
+/* Specialized function to return starting byte of a multi-byte
+ character for encodings where only ASCII characters start multi-byte
+ sequences. Searching starts from ptr-1 backwards. If no starting byte
+ of a multi-byte character is found even at the byte pointed by first,
+ the function returns NULL. */
+const unsigned char *
+__gconv_prevmb_ascii (const unsigned char *ptr,
+ const unsigned char *first)
+{
+ while (--ptr >= first)
+ if (*ptr < 0x80)
+ return ptr;
+ return NULL;
+}
+
+
/* Transform from the internal, UCS4-like format, to UCS4. The
difference between the internal ucs4 format and the real UCS4
format is, if any, the endianess. The Unicode/ISO 10646 says that
--- libc/iconv/iconvconfig.c.jj 2003-06-11 23:38:47.000000000 +0200
+++ libc/iconv/iconvconfig.c 2003-11-11 11:45:22.000000000 +0100
@@ -202,7 +202,7 @@ static struct
#define BUILTIN_ALIAS(alias, real) \
{ .from = alias, .to = real },
#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
- MinF, MaxF, MinT, MaxT)
+ PrevMbFct, MinF, MaxF, MinT, MaxT)
#include <gconv_builtin.h>
};
#undef BUILTIN_ALIAS
@@ -219,7 +219,7 @@ static struct
{
#define BUILTIN_ALIAS(alias, real)
#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
- MinF, MaxF, MinT, MaxT) \
+ PrevMbFct, MinF, MaxF, MinT, MaxT) \
{ .from = From, .to = To, .module = Name, .cost = Cost },
#include <gconv_builtin.h>
};
--- libc/iconv/gconv_builtin.h.jj 2002-12-02 22:46:00.000000000 +0100
+++ libc/iconv/gconv_builtin.h 2003-11-11 12:15:19.000000000 +0100
@@ -30,14 +30,18 @@ BUILTIN_ALIAS ("OSF00010105//", "ISO-106
BUILTIN_ALIAS ("OSF00010106//", "ISO-10646/UCS4/") /* level 3 */
BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UCS4/", 1, "=INTERNAL->ucs4",
- __gconv_transform_internal_ucs4, NULL, 4, 4, 4, 4)
+ __gconv_transform_internal_ucs4, NULL, NULL,
+ 4, 4, 4, 4)
BUILTIN_TRANSFORMATION ("ISO-10646/UCS4/", "INTERNAL", 1, "=ucs4->INTERNAL",
- __gconv_transform_ucs4_internal, NULL, 4, 4, 4, 4)
+ __gconv_transform_ucs4_internal, NULL, NULL,
+ 4, 4, 4, 4)
BUILTIN_TRANSFORMATION ("INTERNAL", "UCS-4LE//", 1, "=INTERNAL->ucs4le",
- __gconv_transform_internal_ucs4le, NULL, 4, 4, 4, 4)
+ __gconv_transform_internal_ucs4le, NULL, NULL,
+ 4, 4, 4, 4)
BUILTIN_TRANSFORMATION ("UCS-4LE//", "INTERNAL", 1, "=ucs4le->INTERNAL",
- __gconv_transform_ucs4le_internal, NULL, 4, 4, 4, 4)
+ __gconv_transform_ucs4le_internal, NULL, NULL,
+ 4, 4, 4, 4)
BUILTIN_ALIAS ("WCHAR_T//", "INTERNAL")
@@ -48,11 +52,12 @@ BUILTIN_ALIAS ("OSF05010001//", "ISO-106
BUILTIN_ALIAS ("ISO-10646/UTF-8/", "ISO-10646/UTF8/")
BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UTF8/", 1, "=INTERNAL->utf8",
- __gconv_transform_internal_utf8, NULL, 4, 4, 1, 6)
+ __gconv_transform_internal_utf8, NULL, NULL,
+ 4, 4, 1, 6)
BUILTIN_TRANSFORMATION ("ISO-10646/UTF8/", "INTERNAL", 1, "=utf8->INTERNAL",
__gconv_transform_utf8_internal, __gconv_btwoc_ascii,
- 1, 6, 4, 4)
+ __gconv_prevmb_ascii, 1, 6, 4, 4)
BUILTIN_ALIAS ("UCS2//", "ISO-10646/UCS2/")
BUILTIN_ALIAS ("UCS-2//", "ISO-10646/UCS2/")
@@ -61,10 +66,12 @@ BUILTIN_ALIAS ("OSF00010101//", "ISO-106
BUILTIN_ALIAS ("OSF00010102//", "ISO-10646/UCS2/") /* level 3 */
BUILTIN_TRANSFORMATION ("ISO-10646/UCS2/", "INTERNAL", 1, "=ucs2->INTERNAL",
- __gconv_transform_ucs2_internal, NULL, 2, 2, 4, 4)
+ __gconv_transform_ucs2_internal, NULL, NULL,
+ 2, 2, 4, 4)
BUILTIN_TRANSFORMATION ("INTERNAL", "ISO-10646/UCS2/", 1, "=INTERNAL->ucs2",
- __gconv_transform_internal_ucs2, NULL, 4, 4, 2, 2)
+ __gconv_transform_internal_ucs2, NULL, NULL,
+ 4, 4, 2, 2)
BUILTIN_ALIAS ("ANSI_X3.4//", "ANSI_X3.4-1968//")
@@ -82,10 +89,11 @@ BUILTIN_ALIAS ("OSF00010020//", "ANSI_X3
BUILTIN_TRANSFORMATION ("ANSI_X3.4-1968//", "INTERNAL", 1, "=ascii->INTERNAL",
__gconv_transform_ascii_internal, __gconv_btwoc_ascii,
- 4, 4, 1, 1)
+ NULL, 4, 4, 1, 1)
BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
- __gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
+ __gconv_transform_internal_ascii, NULL, NULL,
+ 4, 4, 1, 1)
#if BYTE_ORDER == BIG_ENDIAN
@@ -96,12 +104,12 @@ BUILTIN_ALIAS ("UCS-2LE//", "UNICODELITT
BUILTIN_TRANSFORMATION ("UNICODELITTLE//", "INTERNAL", 1,
"=ucs2reverse->INTERNAL",
- __gconv_transform_ucs2reverse_internal, NULL,
+ __gconv_transform_ucs2reverse_internal, NULL, NULL,
2, 2, 4, 4)
BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODELITTLE//", 1,
"=INTERNAL->ucs2reverse",
- __gconv_transform_internal_ucs2reverse, NULL,
+ __gconv_transform_internal_ucs2reverse, NULL, NULL,
4, 4, 2, 2)
#else
BUILTIN_ALIAS ("UNICODELITTLE//", "ISO-10646/UCS2/")
@@ -111,11 +119,11 @@ BUILTIN_ALIAS ("UCS-2BE//", "UNICODEBIG/
BUILTIN_TRANSFORMATION ("UNICODEBIG//", "INTERNAL", 1,
"=ucs2reverse->INTERNAL",
- __gconv_transform_ucs2reverse_internal, NULL,
+ __gconv_transform_ucs2reverse_internal, NULL, NULL,
2, 2, 4, 4)
BUILTIN_TRANSFORMATION ("INTERNAL", "UNICODEBIG//", 1,
"=INTERNAL->ucs2reverse",
- __gconv_transform_internal_ucs2reverse, NULL,
+ __gconv_transform_internal_ucs2reverse, NULL, NULL,
4, 4, 2, 2)
#endif
--- libc/iconv/skeleton.c.jj 2002-12-02 22:49:35.000000000 +0100
+++ libc/iconv/skeleton.c 2003-11-11 11:56:37.000000000 +0100
@@ -339,6 +339,9 @@ gconv_init (struct __gconv_step *step)
#ifdef FROM_ONEBYTE
step->__btowc_fct = FROM_ONEBYTE;
#endif
+#ifdef FROM_PREVMB
+ step->__prevmb_fct = FROM_PREVMB;
+#endif
}
else if (__builtin_expect (strcmp (step->__to_name, CHARSET_NAME), 0) == 0)
{
--- libc/iconv/loop.c.jj 2003-06-11 23:38:13.000000000 +0200
+++ libc/iconv/loop.c 2003-11-11 12:05:59.000000000 +0100
@@ -46,6 +46,8 @@
ONEBYTE_BODY body of the specialized conversion function for a
single byte from the current character set to INTERNAL.
+ PREVMB_BODY body of the specialized function for searching backwards
+ for start of a multi-byte character.
*/
#include <assert.h>
@@ -471,6 +473,14 @@ gconv_btowc (struct __gconv_step *step,
#endif
+#ifdef PREVMB_BODY
+static const unsigned char *
+gconv_prevmb (const unsigned char *ptr, const unsigned char *first)
+ PREVMB_BODY
+# define FROM_PREVMB gconv_prevmb
+#endif
+
+
/* We remove the macro definitions so that we can include this file again
for the definition of another function. */
#undef MIN_NEEDED_INPUT
@@ -484,6 +494,7 @@ gconv_btowc (struct __gconv_step *step,
#undef INIT_PARAMS
#undef UPDATE_PARAMS
#undef ONEBYTE_BODY
+#undef PREVMB_BODY
#undef UNPACK_BYTES
#undef CLEAR_STATE
#undef LOOP_NEED_STATE
--- libc/iconv/gconv_cache.c.jj 2003-06-11 23:38:47.000000000 +0200
+++ libc/iconv/gconv_cache.c 2003-11-11 19:52:25.000000000 +0100
@@ -205,6 +205,7 @@ find_module (const char *directory, cons
/* These settings can be overridden by the init function. */
result->__btowc_fct = NULL;
+ result->__prevmb_fct = NULL;
result->__data = NULL;
/* Call the init function. */
--- libc/iconv/gconv_db.c.jj 2003-06-11 23:31:59.000000000 +0200
+++ libc/iconv/gconv_db.c 2003-11-11 19:53:39.000000000 +0100
@@ -269,6 +269,7 @@ gen_steps (struct derivation_step *best,
/* These settings can be overridden by the init function. */
result[step_cnt].__btowc_fct = NULL;
+ result[step_cnt].__prevmb_fct = NULL;
/* Call the init function. */
if (result[step_cnt].__init_fct != NULL)
@@ -358,6 +359,7 @@ increment_counter (struct __gconv_step *
/* These settings can be overridden by the init function. */
step->__btowc_fct = NULL;
+ step->__prevmb_fct = NULL;
}
/* Call the init function. */
--- libc/posix/tst-regex.c.jj 2001-07-06 06:55:38.000000000 +0200
+++ libc/posix/tst-regex.c 2003-11-11 18:57:30.000000000 +0100
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001 Free Software Foundation, Inc.
+/* Copyright (C) 2001, 2003 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -44,10 +44,13 @@ static iconv_t cd;
static char *mem;
static char *umem;
static size_t memlen;
+static size_t umemlen;
-static int test_expr (const char *expr, int expected);
+static int test_expr (const char *expr, int expected, int expectedicase);
static int run_test (const char *expr, const char *mem, size_t memlen,
- int expected);
+ int icase, int expected);
+static int run_test_backwards (const char *expr, const char *mem,
+ size_t memlen, int icase, int expected);
int
@@ -78,7 +81,7 @@ main (void)
if (mem == NULL)
error (EXIT_FAILURE, errno, "while allocating buffer");
- if (read (fd, mem, memlen) != memlen)
+ if ((size_t) read (fd, mem, memlen) != memlen)
error (EXIT_FAILURE, 0, "cannot read entire file");
mem[memlen] = '\0';
@@ -102,6 +105,7 @@ main (void)
outmem = umem;
outlen = 2 * memlen - 1;
iconv (cd, &inmem, &inlen, &outmem, &outlen);
+ umemlen = outmem - umem;
if (inlen != 0)
error (EXIT_FAILURE, errno, "cannot convert buffer");
@@ -116,11 +120,11 @@ main (void)
/* Run the actual tests. All tests are run in a single-byte and a
multi-byte locale. */
- result = test_expr ("[äáàâéèêíìîñöóòôüúùû]", 2);
- result |= test_expr ("G.ran", 2);
- result |= test_expr ("G.\\{1\\}ran", 2);
- result |= test_expr ("G.*ran", 3);
- result |= test_expr ("[äáàâ]", 0);
+ result = test_expr ("[äáàâéèêíìîñöóòôüúùû]", 2, 2);
+ result |= test_expr ("G.ran", 2, 3);
+ result |= test_expr ("G.\\{1\\}ran", 2, 3);
+ result |= test_expr ("G.*ran", 3, 44);
+ result |= test_expr ("[äáàâ]", 0, 0);
/* Free the resources. */
free (umem);
@@ -132,7 +136,7 @@ main (void)
static int
-test_expr (const char *expr, int expected)
+test_expr (const char *expr, int expected, int expectedicase)
{
int result;
char *inmem;
@@ -146,7 +150,14 @@ test_expr (const char *expr, int expecte
error (EXIT_FAILURE, 0, "cannot set locale de_DE.ISO-8859-1");
printf ("\nTest \"%s\" with 8-bit locale\n", expr);
- result = run_test (expr, mem, memlen, expected);
+ result = run_test (expr, mem, memlen, 0, expected);
+ printf ("\nTest \"%s\" with 8-bit locale, case insensitive\n", expr);
+ result |= run_test (expr, mem, memlen, 1, expectedicase);
+ printf ("\nTest \"%s\" backwards with 8-bit locale\n", expr);
+ result |= run_test_backwards (expr, mem, memlen, 0, expected);
+ printf ("\nTest \"%s\" backwards with 8-bit locale, case insensitive\n",
+ expr);
+ result |= run_test_backwards (expr, mem, memlen, 1, expectedicase);
/* Second test: search with an UTF-8 locale. */
if (setlocale (LC_ALL, "de_DE.UTF-8") == NULL)
@@ -163,14 +174,22 @@ test_expr (const char *expr, int expecte
/* Run the tests. */
printf ("\nTest \"%s\" with multi-byte locale\n", expr);
- result |= run_test (uexpr, umem, 2 * memlen - outlen, expected);
+ result |= run_test (uexpr, umem, umemlen, 0, expected);
+ printf ("\nTest \"%s\" with multi-byte locale, case insensitive\n", expr);
+ result |= run_test (uexpr, umem, umemlen, 1, expectedicase);
+ printf ("\nTest \"%s\" backwards with multi-byte locale\n", expr);
+ result |= run_test_backwards (uexpr, umem, umemlen, 0, expected);
+ printf ("\nTest \"%s\" backwards with multi-byte locale, case insensitive\n",
+ expr);
+ result |= run_test_backwards (uexpr, umem, umemlen, 1, expectedicase);
return result;
}
static int
-run_test (const char *expr, const char *mem, size_t memlen, int expected)
+run_test (const char *expr, const char *mem, size_t memlen, int icase,
+ int expected)
{
#ifdef _POSIX_CPUTIME
struct timespec start;
@@ -186,7 +205,7 @@ run_test (const char *expr, const char *
use_clock = clock_gettime (cl, &start) == 0;
#endif
- err = regcomp (&re, expr, REG_NEWLINE);
+ err = regcomp (&re, expr, REG_NEWLINE | (icase ? REG_ICASE : 0));
if (err != REG_NOERROR)
{
char buf[200];
@@ -257,3 +276,97 @@ run_test (const char *expr, const char *
expect. */
return cnt != expected;
}
+
+
+static int
+run_test_backwards (const char *expr, const char *mem, size_t memlen,
+ int icase, int expected)
+{
+#ifdef _POSIX_CPUTIME
+ struct timespec start;
+ struct timespec finish;
+#endif
+ struct re_pattern_buffer re;
+ const char *err;
+ size_t offset;
+ int cnt;
+
+#ifdef _POSIX_CPUTIME
+ if (use_clock)
+ use_clock = clock_gettime (cl, &start) == 0;
+#endif
+
+ re_set_syntax ((RE_SYNTAX_POSIX_BASIC & ~RE_DOT_NEWLINE)
+ | RE_HAT_LISTS_NOT_NEWLINE
+ | (icase ? RE_ICASE : 0));
+
+ memset (&re, 0, sizeof (re));
+ re.fastmap = malloc (256);
+ if (re.fastmap == NULL)
+ error (EXIT_FAILURE, errno, "cannot allocate fastmap");
+
+ err = re_compile_pattern (expr, strlen (expr), &re);
+ if (err != NULL)
+ error (EXIT_FAILURE, 0, "cannot compile expression: %s", err);
+
+ if (re_compile_fastmap (&re))
+ error (EXIT_FAILURE, 0, "couldn't compile fastmap");
+
+ cnt = 0;
+ offset = memlen;
+ assert (mem[memlen] == '\0');
+ while (offset <= memlen)
+ {
+ int start;
+ const char *sp;
+ const char *ep;
+
+ start = re_search (&re, mem, memlen, offset, -offset, NULL);
+ if (start == -1)
+ break;
+
+ if (start == -2)
+ error (EXIT_FAILURE, 0, "internal error in re_search");
+
+ sp = mem + start;
+ while (sp > mem && sp[-1] != '\n')
+ --sp;
+
+ ep = mem + start;
+ while (*ep != '\0' && *ep != '\n')
+ ++ep;
+
+ printf ("match %d: \"%.*s\"\n", ++cnt, (int) (ep - sp), sp);
+
+ offset = sp - 1 - mem;
+ }
+
+ regfree (&re);
+
+#ifdef _POSIX_CPUTIME
+ if (use_clock)
+ {
+ use_clock = clock_gettime (cl, &finish) == 0;
+ if (use_clock)
+ {
+ if (finish.tv_nsec < start.tv_nsec)
+ {
+ finish.tv_nsec -= start.tv_nsec - 1000000000;
+ finish.tv_sec -= 1 + start.tv_sec;
+ }
+ else
+ {
+ finish.tv_nsec -= start.tv_nsec;
+ finish.tv_sec -= start.tv_sec;
+ }
+
+ printf ("elapsed time: %ld.%09ld sec\n",
+ finish.tv_sec, finish.tv_nsec);
+ }
+ }
+#endif
+
+ /* Return an error if the number of matches found is not match we
+ expect. */
+ return cnt != expected;
+}
--- libc/posix/regex_internal.c.jj 2003-11-11 17:35:49.000000000 +0100
+++ libc/posix/regex_internal.c 2003-11-11 19:29:19.000000000 +0100
@@ -18,6 +18,11 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
+#ifdef _LIBC
+# include <wcsmbs/wcsmbsload.h>
+# include <dlfcn.h>
+#endif
+
static void re_string_construct_common (const char *str, int len,
re_string_t *pstr,
RE_TRANSLATE_TYPE trans, int icase);
@@ -432,10 +437,42 @@ re_string_reconstruct (pstr, idx, eflags
if (MB_CUR_MAX > 1)
{
int wcs_idx;
- wint_t wc;
- pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
- for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
- pstr->wcs[wcs_idx] = WEOF;
+ wint_t wc = WEOF;
+# ifdef _LIBC
+ const struct gconv_fcts *fcts;
+
+ /* Get the conversion functions. */
+ fcts = get_gconv_fcts (_NL_CURRENT_DATA (LC_CTYPE));
+
+ if (__builtin_expect (fcts->towc_nsteps == 1, 1)
+ && __builtin_expect (fcts->towc->__prevmb_fct != NULL, 1))
+ {
+ /* Use the shortcut function. */
+ const char *prev, *raw;
+ raw = pstr->raw_mbs + pstr->raw_mbs_idx;
+ prev = DL_CALL_FCT (fcts->towc->__prevmb_fct,
+ (raw + offset, raw + pstr->valid_len));
+ if (prev != NULL)
+ {
+ mbstate_t cur_state;
+ wchar_t wc2;
+
+ memset (&cur_state, 0, sizeof (cur_state));
+ if (mbrtowc (&wc2, prev, raw + offset - prev, &cur_state)
+ == raw + offset - prev)
+ {
+ memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
+ wc = wc2;
+ }
+ }
+ }
+# endif
+ if (wc == WEOF)
+ {
+ pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
+ for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
+ pstr->wcs[wcs_idx] = WEOF;
+ }
if (pstr->trans && wc <= 0xff)
wc = pstr->trans[wc];
pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
Jakub
| Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
|---|---|---|
| Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |