[RFC PATCH:app/xprop] Print UTF8_STRING type as UTF-8 when locale supports it

Yang Zhao yang at yangman.ca
Sat Oct 17 20:11:49 PDT 2009


Introduces a 'u' format character, which behaves like 's' but, in a UTF-8
locale, prints UTF-8 encoded bytes as-is instead of as octal escapes.

The property value is checked for UTF-8 validity before being printed.  The
handling of values that are not valid UTF-8 strings still needs improvement.
---

Currently, when an invalid UTF-8 string is detected, an error message is printed
instead of the string value.  I don't think this is ideal.  What would be better?

 xprop.c |   89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 80 insertions(+), 9 deletions(-)

diff --git a/xprop.c b/xprop.c
index 8261b15..bb8c71f 100644
--- a/xprop.c
+++ b/xprop.c
@@ -409,6 +409,7 @@ static propertyRec windowPropTable[] = {
     {"RECTANGLE",	XA_RECTANGLE,	 "16iicc",    RECTANGLE_DFORMAT },
     {"RGB_COLOR_MAP",	XA_RGB_COLOR_MAP,"32xcccccccxx",RGB_COLOR_MAP_DFORMAT},
     {"STRING",		XA_STRING,	 "8s",	      0 },
+    {"UTF8_STRING",		0,	 "8u",	      0 },
     {"WINDOW",		XA_WINDOW,	 "32x",	      ": window id # $0+\n" },
     {"VISUALID",	XA_VISUALID,	 "32x",	      ": visual id # $0\n" },
     {"WM_COLORMAP_WINDOWS",	0,	 "32x",       ": window id # $0+\n"},
@@ -683,7 +684,7 @@ _put_char (char c)
 }
 
 static void
-_format_char (char c)
+_format_char (char c, int unicode)
 {
     switch (c) {
       case '\\':
@@ -701,17 +702,21 @@ _format_char (char c)
 	break;
       default:
 	if (!c_isprint(c)) {
-	    _put_char('\\');
-	    snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
-	    _buf_ptr += 3;
-	    _buf_len -= 3;
+	    if (unicode && (c & 0x80)) {
+		_put_char(c);
+	    } else {
+		_put_char('\\');
+		snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
+		_buf_ptr += 3;
+		_buf_len -= 3;
+	    }
 	} else
 	  _put_char(c);
     }
 }
 
 static const char *
-Format_String (const char *string)
+Format_String (const char *string, int unicode)
 {
     char c;
 
@@ -720,7 +725,7 @@ Format_String (const char *string)
     _put_char('\"');
 
     while ((c = string++[0]))
-	_format_char(c);
+	_format_char(c, unicode);
 
     *_buf_ptr++ = '"';
     *_buf_ptr++ = '\0';
@@ -738,7 +743,7 @@ Format_Len_String (const char *string, int len)
     memcpy(data, string, len);
     data[len] = '\0';
 
-    result = Format_String(data);
+    result = Format_String(data, 0);
     free(data);
 
     return result;
@@ -904,6 +909,70 @@ Format_Len_Text (const char *string, int len, Atom encoding)
 	return Format_Len_String(string, len);
 }
 
+static int
+is_valid_utf8 (const char *string, int len)
+{
+    unsigned short codepoint;
+    int rem, i;
+    char c;
+
+    rem = 0;
+    for (i = 0; i < len; i++) {
+	c = string[i];
+	codepoint = 0;
+
+	if ((c & 0x8F) ^ 0x80) {
+	    if (rem > 0) return 0;
+	    rem = 0;
+	    codepoint |= c;
+	} else if ((c & 0xC0) == 0x80) {
+	    if (rem == 0) return 0;
+	    rem--;
+	    codepoint |= (c & 0x3F) << (rem * 2);
+	    if (codepoint == 0x00) return 0;
+	} else if ((c & 0xE0) == 0xC0) {
+	    if (rem > 0) return 0;
+	    rem = 1;
+	    codepoint = (c & 0x1F) << 6;
+	    if ((codepoint & 0xF0) == 0x00) return 0;
+	} else if ((c & 0xF0) == 0xE0) {
+	    if (rem > 0) return 0;
+	    rem = 2;
+	    codepoint = (c & 0x0F) << 12;
+	} else if ((c & 0xF8) == 0xF0) {
+	    if (rem > 0) return 0;
+	    rem = 3;
+	    codepoint = (c & 0x07) << 20;
+	} else
+	    return 0;
+    }
+
+    return 1;
+}
+
+static const char *
+Format_Len_Unicode (const char *string, int len)
+{
+    char *data;
+    const char *result;
+
+    if (!is_valid_utf8(string, len))
+	return "<Not a valid UTF-8 string>";
+
+    if (!is_utf8_locale())
+	return Format_Len_String(string, len);
+
+    data = (char *) Malloc(len+1);
+
+    memcpy(data, string, len);
+    data[len] = '\0';
+
+    result = Format_String(data, 1);
+    free(data);
+
+    return result;
+}
+
 /*
  *
  * The Format Manager: a group of routines to manage "formats"
@@ -956,6 +1025,8 @@ Format_Thunk (thunk t, char format_char)
     switch (format_char) {
       case 's':
 	return Format_Len_String(t.extra_value, (int)t.value);
+      case 'u':
+	return Format_Len_Unicode(t.extra_value, (int)t.value);
       case 't':
 	return Format_Len_Text(t.extra_value, (int)t.value, t.extra_encoding);
       case 'x':
@@ -1252,7 +1323,7 @@ Break_Down_Property (const char *pointer, int length, Atom type, const char *for
 
     while (length >= size/8) {
 	format_char = Get_Format_Char(format, i);
-	if (format_char == 's')
+	if (format_char == 's' || format_char == 'u')
 	    t.value = Extract_Len_String(&pointer,&length,size,&t.extra_value);
 	else if (format_char == 't') {
 	    t.extra_encoding = type;
-- 
1.6.4.4



More information about the xorg-devel mailing list