[RFC PATCH:app/xprop] Print UTF8_STRING type as UTF-8 when locale supports it
Yang Zhao
yang at yangman.ca
Sat Oct 17 20:11:49 PDT 2009
Introduces a 'u' format character, which behaves like 's' but leaves
UTF-8-encoded bytes intact instead of escaping them as octal.
The property value is checked for UTF-8 validity before being printed. The
handling of values that are not valid UTF-8 strings still needs improvement.
---
Currently, when an invalid UTF-8 string is detected, an error message is printed
instead of the string value. I don't think this is ideal. What would be better?
xprop.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 80 insertions(+), 9 deletions(-)
diff --git a/xprop.c b/xprop.c
index 8261b15..bb8c71f 100644
--- a/xprop.c
+++ b/xprop.c
@@ -409,6 +409,7 @@ static propertyRec windowPropTable[] = {
{"RECTANGLE", XA_RECTANGLE, "16iicc", RECTANGLE_DFORMAT },
{"RGB_COLOR_MAP", XA_RGB_COLOR_MAP,"32xcccccccxx",RGB_COLOR_MAP_DFORMAT},
{"STRING", XA_STRING, "8s", 0 },
+ {"UTF8_STRING", 0, "8u", 0 },
{"WINDOW", XA_WINDOW, "32x", ": window id # $0+\n" },
{"VISUALID", XA_VISUALID, "32x", ": visual id # $0\n" },
{"WM_COLORMAP_WINDOWS", 0, "32x", ": window id # $0+\n"},
@@ -683,7 +684,7 @@ _put_char (char c)
}
static void
-_format_char (char c)
+_format_char (char c, int unicode)
{
switch (c) {
case '\\':
@@ -701,17 +702,21 @@ _format_char (char c)
break;
default:
if (!c_isprint(c)) {
- _put_char('\\');
- snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
- _buf_ptr += 3;
- _buf_len -= 3;
+ if (unicode && (c & 0x80)) {
+ _put_char(c);
+ } else {
+ _put_char('\\');
+ snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
+ _buf_ptr += 3;
+ _buf_len -= 3;
+ }
} else
_put_char(c);
}
}
static const char *
-Format_String (const char *string)
+Format_String (const char *string, int unicode)
{
char c;
@@ -720,7 +725,7 @@ Format_String (const char *string)
_put_char('\"');
while ((c = string++[0]))
- _format_char(c);
+ _format_char(c, unicode);
*_buf_ptr++ = '"';
*_buf_ptr++ = '\0';
@@ -738,7 +743,7 @@ Format_Len_String (const char *string, int len)
memcpy(data, string, len);
data[len] = '\0';
- result = Format_String(data);
+ result = Format_String(data, 0);
free(data);
return result;
@@ -904,6 +909,70 @@ Format_Len_Text (const char *string, int len, Atom encoding)
return Format_Len_String(string, len);
}
+static int
+is_valid_utf8 (const char *string, int len)
+{
+ unsigned short codepoint;
+ int rem, i;
+ char c;
+
+ rem = 0;
+ for (i = 0; i < len; i++) {
+ c = string[i];
+ codepoint = 0;
+
+ if ((c & 0x8F) ^ 0x80) {
+ if (rem > 0) return 0;
+ rem = 0;
+ codepoint |= c;
+ } else if ((c & 0xC0) == 0x80) {
+ if (rem == 0) return 0;
+ rem--;
+ codepoint |= (c & 0x3F) << (rem * 2);
+ if (codepoint == 0x00) return 0;
+ } else if ((c & 0xE0) == 0xC0) {
+ if (rem > 0) return 0;
+ rem = 1;
+ codepoint = (c & 0x1F) << 6;
+ if ((codepoint & 0xF0) == 0x00) return 0;
+ } else if ((c & 0xF0) == 0xE0) {
+ if (rem > 0) return 0;
+ rem = 2;
+ codepoint = (c & 0x0F) << 12;
+ } else if ((c & 0xF8) == 0xF0) {
+ if (rem > 0) return 0;
+ rem = 3;
+ codepoint = (c & 0x07) << 20;
+ } else
+ return 0;
+ }
+
+ return 1;
+}
+
+static const char *
+Format_Len_Unicode (const char *string, int len)
+{
+ char *data;
+ const char *result;
+
+ if (!is_valid_utf8(string, len))
+ return "<Not a valid UTF-8 string>";
+
+ if (!is_utf8_locale())
+ return Format_Len_String(string, len);
+
+ data = (char *) Malloc(len+1);
+
+ memcpy(data, string, len);
+ data[len] = '\0';
+
+ result = Format_String(data, 1);
+ free(data);
+
+ return result;
+}
+
/*
*
* The Format Manager: a group of routines to manage "formats"
@@ -956,6 +1025,8 @@ Format_Thunk (thunk t, char format_char)
switch (format_char) {
case 's':
return Format_Len_String(t.extra_value, (int)t.value);
+ case 'u':
+ return Format_Len_Unicode(t.extra_value, (int)t.value);
case 't':
return Format_Len_Text(t.extra_value, (int)t.value, t.extra_encoding);
case 'x':
@@ -1252,7 +1323,7 @@ Break_Down_Property (const char *pointer, int length, Atom type, const char *for
while (length >= size/8) {
format_char = Get_Format_Char(format, i);
- if (format_char == 's')
+ if (format_char == 's' || format_char == 'u')
t.value = Extract_Len_String(&pointer,&length,size,&t.extra_value);
else if (format_char == 't') {
t.extra_encoding = type;
--
1.6.4.4
More information about the xorg-devel mailing list