Support multi-byte --transform='...\L...' etc

Support upcasing and downcasing in multi-byte locales.
* gnulib.modules: Add c32rtomb, c32tolower, c32toupper,
mbrtoc32-regular.
* src/transform.c: Do not include ctype.h.  Include mcel.h.
(stk, stk_init): Move up.
(run_case_conv): Return void, not char *.  Append result to
stk directly; this avoids the need for a separate allocation.
All callers changed.  Do not assume a single-byte locale.
* tests/xform04.at: New test.
* tests/Makefile.am (TESTSUITE_AT):
* tests/testsuite.at: Add it.
This commit is contained in:
Paul Eggert
2023-09-12 23:21:18 -05:00
parent 783321ff1b
commit c1e277476c
6 changed files with 101 additions and 63 deletions

5
NEWS
View File

@@ -1,4 +1,4 @@
GNU tar NEWS - User visible changes. 2023-09-10
GNU tar NEWS - User visible changes. 2023-09-12
Please send GNU tar bug reports to <bug-tar@gnu.org>
version TBD
@@ -33,6 +33,9 @@ used, command output will be parsed using strptime(3).
** When diagnosing invalid extended headers tar now quotes control characters.
** Transformations that change case (e.g., --transform='s/.*/\L&/')
now work correctly with multi-byte characters.
version 1.35 - Sergey Poznyakoff, 2023-07-18

View File

@@ -25,6 +25,9 @@ argp-version-etc
attribute
backupfile
c-ctype
c32rtomb
c32tolower
c32toupper
closeout
configmake
dirname
@@ -64,6 +67,7 @@ lchown
linkat
localcharset
manywarnings
mbrtoc32-regular
mcel-prefer
mkdirat
mkdtemp

View File

@@ -15,8 +15,8 @@
with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <system.h>
#include <ctype.h>
#include <regex.h>
#include <mcel.h>
#include "common.h"
enum transform_type
@@ -417,51 +417,44 @@ set_transform_expr (const char *expr)
expr = parse_transform_expr (expr);
}
/* Run case conversion specified by CASE_CTL on array PTR of SIZE
   bytes.  Each byte is converted on its own with toupper/tolower.

   ctl_upcase and ctl_locase convert every byte; ctl_upcase_next and
   ctl_locase_next convert only the first byte; ctl_stop converts
   nothing.

   Returns a pointer to statically allocated storage holding the
   converted copy; that storage is reused by the next call.  If SIZE
   is zero there is nothing to convert and PTR itself is returned.  */
static char *
run_case_conv (enum case_ctl_type case_ctl, char *ptr, size_t size)
{
  static char *case_ctl_buffer;
  static size_t case_ctl_bufsize;
  char *p;

  /* With empty input the buffer may never have been allocated, and the
     "next character" cases below would store through element 0 of it.
     Bail out before touching the buffer at all.  */
  if (size == 0)
    return ptr;

  /* Grow the shared buffer only when the current input does not fit.  */
  if (case_ctl_bufsize < size)
    {
      case_ctl_bufsize = size;
      case_ctl_buffer = xrealloc (case_ctl_buffer, case_ctl_bufsize);
    }
  memcpy (case_ctl_buffer, ptr, size);
  switch (case_ctl)
    {
    case ctl_upcase_next:
      case_ctl_buffer[0] = toupper ((unsigned char) case_ctl_buffer[0]);
      break;

    case ctl_locase_next:
      case_ctl_buffer[0] = tolower ((unsigned char) case_ctl_buffer[0]);
      break;

    case ctl_upcase:
      for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++)
        *p = toupper ((unsigned char) *p);
      break;

    case ctl_locase:
      for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++)
        *p = tolower ((unsigned char) *p);
      break;

    case ctl_stop:
      break;
    }
  return case_ctl_buffer;
}
static struct obstack stk;
static bool stk_init;
/* Run case conversion specified by CASE_CTL on array PTR of SIZE
   bytes, scanning it one character at a time with mcel_scan.
   Append the result to STK.

   ctl_upcase and ctl_locase convert every character; ctl_upcase_next
   and ctl_locase_next convert only the first character and copy the
   remainder unchanged; any other CASE_CTL copies the input unchanged.
   A character is also copied unchanged when conversion does not alter
   it, or when c32rtomb cannot encode the converted character.  */
static void
run_case_conv (enum case_ctl_type case_ctl, char *ptr, size_t size)
{
  char const *p = ptr;
  char const *plim = ptr + size;

  while (p < plim)
    {
      mcel_t g = mcel_scan (p, plim);
      char32_t ch;
      switch (case_ctl)
        {
        case ctl_upcase: case ctl_upcase_next: ch = c32toupper (g.ch); break;
        case ctl_locase: case ctl_locase_next: ch = c32tolower (g.ch); break;
        default: ch = g.ch; break;
        }

      /* (size_t) -1 means "nothing was encoded": either no conversion
         was needed or c32rtomb reported failure.  In both cases the
         original bytes are copied instead.  */
      size_t outbytes = (size_t) -1;
      if (ch != g.ch)
        {
          /* Encode straight into the obstack's free space; the room
             reserved here is only claimed below on success.  */
          obstack_make_room (&stk, MB_LEN_MAX);
          mbstate_t ombs; mbszero (&ombs);
          outbytes = c32rtomb (obstack_next_free (&stk), ch, &ombs);
        }

      if (outbytes == (size_t) -1)
        obstack_grow (&stk, p, g.len);
      else
        obstack_blank_fast (&stk, outbytes);

      p += g.len;

      /* Only the whole-string modes keep converting past the first
         character.  */
      if (case_ctl != ctl_upcase && case_ctl != ctl_locase)
        break;
    }

  /* Copy whatever was left unconverted (empty in whole-string modes).  */
  obstack_grow (&stk, p, plim - p);
}
static void
_single_transform_name_to_obstack (struct transform *tf, char *input)
{
@@ -484,7 +477,6 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
while (*input)
{
size_t disp;
char *ptr;
rc = regexec (&tf->regex, input, tf->regex.re_nsub + 1, rmp, 0);
@@ -510,16 +502,10 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
switch (segm->type)
{
case segm_literal: /* Literal segment */
if (case_ctl == ctl_stop)
ptr = segm->v.literal.ptr;
else
{
ptr = run_case_conv (case_ctl,
segm->v.literal.ptr,
segm->v.literal.size);
CASE_CTL_RESET();
}
obstack_grow (&stk, ptr, segm->v.literal.size);
run_case_conv (case_ctl,
segm->v.literal.ptr,
segm->v.literal.size);
CASE_CTL_RESET ();
break;
case segm_backref: /* Back-reference segment */
@@ -528,14 +514,9 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
{
size_t size = rmp[segm->v.ref].rm_eo
- rmp[segm->v.ref].rm_so;
ptr = input + rmp[segm->v.ref].rm_so;
if (case_ctl != ctl_stop)
{
ptr = run_case_conv (case_ctl, ptr, size);
CASE_CTL_RESET();
}
obstack_grow (&stk, ptr, size);
run_case_conv (case_ctl,
input + rmp[segm->v.ref].rm_so, size);
CASE_CTL_RESET ();
}
break;

View File

@@ -287,7 +287,8 @@ TESTSUITE_AT = \
xform-h.at\
xform01.at\
xform02.at\
xform03.at
xform03.at\
xform04.at
distclean-local:
-rm -rf download

View File

@@ -293,6 +293,7 @@ m4_include([xform-h.at])
m4_include([xform01.at])
m4_include([xform02.at])
m4_include([xform03.at])
m4_include([xform04.at])
AT_BANNER([Exclude])
m4_include([exclude.at])

48
tests/xform04.at Normal file
View File

@@ -0,0 +1,48 @@
# Process this file with autom4te to create testsuite. -*- Autotest -*-
# Test suite for GNU tar.
# Copyright 2023 Free Software Foundation, Inc.
# This file is part of GNU tar.
# GNU tar is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# GNU tar is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Transformations can change the number of bytes when downcasing.
AT_SETUP([transformations and multi-byte downcasing])
AT_KEYWORDS([transform xform xform04])
AT_TAR_CHECK([
# Make sure the test runs with a UTF-8 charmap.  If the current locale
# does not report UTF-8, try en_US.UTF-8 and then every installed locale
# whose name looks like UTF-8.  Skip the test when none of them works.
if test "`(locale charmap) 2>/dev/null`" != UTF-8; then
for locale in en_US.UTF-8 `(locale -a) 2>/dev/null` not-found; do
case $locale in
*.[[Uu][Tt][Ff]]*8)
if test "`(LC_ALL=$locale locale charmap) 2>/dev/null`" = UTF-8; then
LC_ALL=$locale
export LC_ALL
break
fi;;
not-found)
AT_SKIP_TEST;;
esac
done
fi
# Create a file whose name mixes ASCII and non-ASCII letters, then ask
# tar to print the transformed name.  The expected output is the whole
# name downcased, a hyphen, then the whole name upcased.
genfile --file Aa.Ⱥⱥ
tar -cvf /dev/null --transform='s/.*/\L&-\U&/' --show-transformed-name Aa.Ⱥⱥ],
[0],
[aa.ⱥⱥ-AA.ȺȺ
])
AT_CLEANUP