Support multi-byte --transform='...\L...' etc
Support upcasing and downcasing in multi-byte locales.
* gnulib.modules: Add c32rtomb, c32tolower, c32toupper,
mbrtoc32-regular.
* src/transform.c: Do not include ctype.h.  Include mcel.h.
(stk, stk_init): Move up.
(run_case_conv): Return void, not char *.  Append result to stk
directly; this avoids the need for a separate allocation.
All callers changed.  Do not assume a single-byte locale.
* tests/xform04.at: New test.
* tests/Makefile.am (TESTSUITE_AT):
* tests/testsuite.at: Add it.
This commit is contained in:
5
NEWS
5
NEWS
@@ -1,4 +1,4 @@
|
||||
GNU tar NEWS - User visible changes. 2023-09-10
|
||||
GNU tar NEWS - User visible changes. 2023-09-12
|
||||
Please send GNU tar bug reports to <bug-tar@gnu.org>
|
||||
|
||||
version TBD
|
||||
@@ -33,6 +33,9 @@ used, command output will be parsed using strptime(3).
|
||||
|
||||
** When diagnosing invalid extended headers tar now quotes control characters.
|
||||
|
||||
** Transformations that change case (e.g., --transform='s/.*/\L&/')
|
||||
now work correctly with multi-byte characters.
|
||||
|
||||
|
||||
version 1.35 - Sergey Poznyakoff, 2023-07-18
|
||||
|
||||
|
||||
@@ -25,6 +25,9 @@ argp-version-etc
|
||||
attribute
|
||||
backupfile
|
||||
c-ctype
|
||||
c32rtomb
|
||||
c32tolower
|
||||
c32toupper
|
||||
closeout
|
||||
configmake
|
||||
dirname
|
||||
@@ -64,6 +67,7 @@ lchown
|
||||
linkat
|
||||
localcharset
|
||||
manywarnings
|
||||
mbrtoc32-regular
|
||||
mcel-prefer
|
||||
mkdirat
|
||||
mkdtemp
|
||||
|
||||
@@ -15,8 +15,8 @@
|
||||
with this program. If not, see <http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <system.h>
|
||||
#include <ctype.h>
|
||||
#include <regex.h>
|
||||
#include <mcel.h>
|
||||
#include "common.h"
|
||||
|
||||
enum transform_type
|
||||
@@ -417,51 +417,44 @@ set_transform_expr (const char *expr)
|
||||
expr = parse_transform_expr (expr);
|
||||
}
|
||||
|
||||
/* Run case conversion specified by CASE_CTL on array PTR of SIZE
|
||||
characters. Returns pointer to statically allocated storage. */
|
||||
static char *
|
||||
run_case_conv (enum case_ctl_type case_ctl, char *ptr, size_t size)
|
||||
{
|
||||
static char *case_ctl_buffer;
|
||||
static size_t case_ctl_bufsize;
|
||||
char *p;
|
||||
|
||||
if (case_ctl_bufsize < size)
|
||||
{
|
||||
case_ctl_bufsize = size;
|
||||
case_ctl_buffer = xrealloc (case_ctl_buffer, case_ctl_bufsize);
|
||||
}
|
||||
memcpy (case_ctl_buffer, ptr, size);
|
||||
switch (case_ctl)
|
||||
{
|
||||
case ctl_upcase_next:
|
||||
case_ctl_buffer[0] = toupper ((unsigned char) case_ctl_buffer[0]);
|
||||
break;
|
||||
|
||||
case ctl_locase_next:
|
||||
case_ctl_buffer[0] = tolower ((unsigned char) case_ctl_buffer[0]);
|
||||
break;
|
||||
|
||||
case ctl_upcase:
|
||||
for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++)
|
||||
*p = toupper ((unsigned char) *p);
|
||||
break;
|
||||
|
||||
case ctl_locase:
|
||||
for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++)
|
||||
*p = tolower ((unsigned char) *p);
|
||||
break;
|
||||
|
||||
case ctl_stop:
|
||||
break;
|
||||
}
|
||||
return case_ctl_buffer;
|
||||
}
|
||||
|
||||
|
||||
static struct obstack stk;
|
||||
static bool stk_init;
|
||||
|
||||
/* Run case conversion specified by CASE_CTL on array PTR of SIZE
|
||||
characters. Append the result to STK. */
|
||||
static void
|
||||
run_case_conv (enum case_ctl_type case_ctl, char *ptr, size_t size)
|
||||
{
|
||||
char const *p = ptr, *plim = ptr + size;
|
||||
mbstate_t mbs; mbszero (&mbs);
|
||||
while (p < plim)
|
||||
{
|
||||
mcel_t g = mcel_scan (p, plim);
|
||||
char32_t ch;
|
||||
switch (case_ctl)
|
||||
{
|
||||
case ctl_upcase: case ctl_upcase_next: ch = c32toupper (g.ch); break;
|
||||
case ctl_locase: case ctl_locase_next: ch = c32tolower (g.ch); break;
|
||||
default: ch = g.ch; break;
|
||||
}
|
||||
if (ch == g.ch)
|
||||
obstack_grow (&stk, p, g.len);
|
||||
else
|
||||
{
|
||||
obstack_make_room (&stk, MB_LEN_MAX);
|
||||
mbstate_t ombs; mbszero (&ombs);
|
||||
size_t outbytes = c32rtomb (obstack_next_free (&stk), ch, &ombs);
|
||||
obstack_blank_fast (&stk, outbytes);
|
||||
}
|
||||
p += g.len;
|
||||
if (case_ctl != ctl_upcase && case_ctl != ctl_locase)
|
||||
break;
|
||||
}
|
||||
|
||||
obstack_grow (&stk, p, plim - p);
|
||||
}
|
||||
|
||||
static void
|
||||
_single_transform_name_to_obstack (struct transform *tf, char *input)
|
||||
{
|
||||
@@ -484,7 +477,6 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
|
||||
while (*input)
|
||||
{
|
||||
size_t disp;
|
||||
char *ptr;
|
||||
|
||||
rc = regexec (&tf->regex, input, tf->regex.re_nsub + 1, rmp, 0);
|
||||
|
||||
@@ -510,16 +502,10 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
|
||||
switch (segm->type)
|
||||
{
|
||||
case segm_literal: /* Literal segment */
|
||||
if (case_ctl == ctl_stop)
|
||||
ptr = segm->v.literal.ptr;
|
||||
else
|
||||
{
|
||||
ptr = run_case_conv (case_ctl,
|
||||
run_case_conv (case_ctl,
|
||||
segm->v.literal.ptr,
|
||||
segm->v.literal.size);
|
||||
CASE_CTL_RESET();
|
||||
}
|
||||
obstack_grow (&stk, ptr, segm->v.literal.size);
|
||||
CASE_CTL_RESET ();
|
||||
break;
|
||||
|
||||
case segm_backref: /* Back-reference segment */
|
||||
@@ -528,14 +514,9 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
|
||||
{
|
||||
size_t size = rmp[segm->v.ref].rm_eo
|
||||
- rmp[segm->v.ref].rm_so;
|
||||
ptr = input + rmp[segm->v.ref].rm_so;
|
||||
if (case_ctl != ctl_stop)
|
||||
{
|
||||
ptr = run_case_conv (case_ctl, ptr, size);
|
||||
CASE_CTL_RESET();
|
||||
}
|
||||
|
||||
obstack_grow (&stk, ptr, size);
|
||||
run_case_conv (case_ctl,
|
||||
input + rmp[segm->v.ref].rm_so, size);
|
||||
CASE_CTL_RESET ();
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
@@ -287,7 +287,8 @@ TESTSUITE_AT = \
|
||||
xform-h.at\
|
||||
xform01.at\
|
||||
xform02.at\
|
||||
xform03.at
|
||||
xform03.at\
|
||||
xform04.at
|
||||
|
||||
distclean-local:
|
||||
-rm -rf download
|
||||
|
||||
@@ -293,6 +293,7 @@ m4_include([xform-h.at])
|
||||
m4_include([xform01.at])
|
||||
m4_include([xform02.at])
|
||||
m4_include([xform03.at])
|
||||
m4_include([xform04.at])
|
||||
|
||||
AT_BANNER([Exclude])
|
||||
m4_include([exclude.at])
|
||||
|
||||
48
tests/xform04.at
Normal file
48
tests/xform04.at
Normal file
@@ -0,0 +1,48 @@
|
||||
# Process this file with autom4te to create testsuite. -*- Autotest -*-
|
||||
|
||||
# Test suite for GNU tar.
|
||||
# Copyright 2023 Free Software Foundation, Inc.
|
||||
|
||||
# This file is part of GNU tar.
|
||||
|
||||
# GNU tar is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# GNU tar is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# Case-changing transformations can change the number of bytes in a name:
# in UTF-8, Ⱥ (U+023A) takes two bytes but its lowercase ⱥ (U+2C65) takes three.
|
||||
|
||||
AT_SETUP([transformations and multi-byte downcasing])
|
||||
AT_KEYWORDS([transform xform xform04])
|
||||
|
||||
AT_TAR_CHECK([
|
||||
if test "`(locale charmap) 2>/dev/null`" != UTF-8; then
|
||||
for locale in en_US.UTF-8 `(locale -a) 2>/dev/null` not-found; do
|
||||
case $locale in
|
||||
*.[[Uu][Tt][Ff]]*8)
|
||||
if test "`(LC_ALL=$locale locale charmap) 2>/dev/null`" = UTF-8; then
|
||||
LC_ALL=$locale
|
||||
export LC_ALL
|
||||
break
|
||||
fi;;
|
||||
not-found)
|
||||
AT_SKIP_TEST;;
|
||||
esac
|
||||
done
|
||||
fi
|
||||
|
||||
genfile --file Aa.Ⱥⱥ
|
||||
tar -cvf /dev/null --transform='s/.*/\L&-\U&/' --show-transformed-name Aa.Ⱥⱥ],
|
||||
[0],
|
||||
[aa.ⱥⱥ-AA.ȺȺ
|
||||
])
|
||||
|
||||
AT_CLEANUP
|
||||
Reference in New Issue
Block a user