Character classification
Tim Bray
tbray at textuality.com
Wed Sep 3 21:54:40 BST 1997
I've been working on making Lark really do Unicode. JDK 1.1 is supposed
to have, unlike 1.0, a usable input method; thus the problem is to check,
when you're reading a GI or Attribute name, whether the characters are
legal namestart/name characters.
It turns out to be quite a lot of work, so this is an offer to share.
I wrote a program (based on Lark) that pulls the relevant character
classes out of the XML spec, picks apart the markup, and writes another
Java class that has some static arrays and offers two methods:
package textuality.lark;
public class CharClasses
{
public static boolean isNameC(char c)
public static boolean isNameStart(char c)
}
It needs about 4k of tables (which it binary-searches); it might be faster
with 128k of byte-addressable tables or 16K of bitmaps, neither of which
would be hard to implement.
(a) is this a waste of time, i.e. are there Unicode library calls that
do it?
(b) if not, has everyone else already done this?
(c) if not, if I'm going to publish this, is the API above OK?
I've attached the current Java source file for those who find the
explanation above insufficiently clear.
-------------- next part --------------
// Synthetically generated; do not edit!
//
package textuality.lark;
import java.util.*;
public class CharClasses
{
static final char[] sNameStart =
{
170,170, 181,181, 186,186, 192,214, 216,246,
248,501, 506,535, 592,680, 688,696, 699,705,
736,740, 890,890, 902,902, 904,906, 908,908,
910,929, 931,974, 976,982, 986,986, 988,988,
990,990, 992,992, 994,1011, 1025,1036, 1038,1103,
1105,1116, 1118,1153, 1168,1220, 1223,1224, 1227,1228,
1232,1259, 1262,1269, 1272,1273, 1329,1366, 1369,1369,
1377,1415, 1488,1514, 1520,1522, 1569,1594, 1601,1610,
1649,1719, 1722,1726, 1728,1742, 1744,1747, 1749,1749,
1765,1766, 2309,2361, 2365,2365, 2392,2401, 2437,2444,
2447,2448, 2451,2472, 2474,2480, 2482,2482, 2486,2489,
2524,2525, 2527,2529, 2544,2545, 2565,2570, 2575,2576,
2579,2600, 2602,2608, 2610,2611, 2613,2614, 2616,2617,
2649,2652, 2654,2654, 2674,2676, 2693,2699, 2701,2701,
2703,2705, 2707,2728, 2730,2736, 2738,2739, 2741,2745,
2749,2749, 2784,2784, 2821,2828, 2831,2832, 2835,2856,
2858,2864, 2866,2867, 2870,2873, 2877,2877, 2908,2909,
2911,2913, 2949,2954, 2958,2960, 2962,2965, 2969,2970,
2972,2972, 2974,2975, 2979,2980, 2984,2986, 2990,2997,
2999,3001, 3077,3084, 3086,3088, 3090,3112, 3114,3123,
3125,3129, 3168,3169, 3205,3212, 3214,3216, 3218,3240,
3242,3251, 3253,3257, 3294,3294, 3296,3297, 3333,3340,
3342,3344, 3346,3368, 3370,3385, 3424,3425, 3585,3630,
3632,3632, 3634,3635, 3648,3653, 3713,3714, 3716,3716,
3719,3720, 3722,3722, 3725,3725, 3732,3735, 3737,3743,
3745,3747, 3749,3749, 3751,3751, 3754,3755, 3757,3758,
3760,3760, 3762,3763, 3773,3773, 3776,3780, 3804,3805,
3904,3911, 3913,3945, 4256,4293, 4304,4342, 4352,4441,
4447,4514, 4520,4601, 7680,7835, 7840,7929, 7936,7957,
7960,7965, 7968,8005, 8008,8013, 8016,8023, 8025,8025,
8027,8027, 8029,8029, 8031,8061, 8064,8116, 8118,8124,
8126,8126, 8130,8132, 8134,8140, 8144,8147, 8150,8155,
8160,8172, 8178,8180, 8182,8188, 8319,8319, 8450,8450,
8455,8455, 8458,8467, 8469,8469, 8472,8477, 8484,8484,
8486,8486, 8488,8488, 8490,8497, 8499,8504, 8544,8578,
12295,12295, 12321,12329, 12353,12436, 12449,12538, 12549,12588,
12593,12686, 19968,40869, 44032,55203, 63744,64045, 64256,64262,
64275,64279, 64287,64296, 64298,64310, 64312,64316, 64318,64318,
64320,64321, 64323,64324, 64326,64433, 64467,64829, 64848,64911,
64914,64967, 65008,65019, 65136,65437, 65440,65470, 65474,65479,
65482,65487, 65490,65495, 65498,65500
};
static final char[] sNameC =
{
170,170, 181,181, 183,183, 186,186, 192,214,
216,246, 248,501, 506,535, 592,680, 688,696,
699,705, 720,721, 736,740, 768,837, 864,865,
890,890, 902,906, 908,908, 910,929, 931,974,
976,982, 986,986, 988,988, 990,990, 992,992,
994,1011, 1025,1036, 1038,1103, 1105,1116, 1118,1153,
1155,1158, 1168,1220, 1223,1224, 1227,1228, 1232,1259,
1262,1269, 1272,1273, 1329,1366, 1369,1369, 1377,1415,
1425,1441, 1443,1465, 1467,1469, 1471,1471, 1473,1474,
1476,1476, 1488,1514, 1520,1522, 1569,1594, 1600,1618,
1632,1641, 1648,1719, 1722,1726, 1728,1742, 1744,1747,
1749,1768, 1770,1773, 1776,1785, 2305,2307, 2309,2361,
2364,2381, 2385,2388, 2392,2403, 2406,2415, 2433,2435,
2437,2444, 2447,2448, 2451,2472, 2474,2480, 2482,2482,
2486,2489, 2492,2492, 2494,2500, 2503,2504, 2507,2509,
2519,2519, 2524,2525, 2527,2531, 2534,2545, 2562,2562,
2565,2570, 2575,2576, 2579,2600, 2602,2608, 2610,2611,
2613,2614, 2616,2617, 2620,2620, 2622,2626, 2631,2632,
2635,2637, 2649,2652, 2654,2654, 2662,2676, 2689,2691,
2693,2699, 2701,2701, 2703,2705, 2707,2728, 2730,2736,
2738,2739, 2741,2745, 2748,2757, 2759,2761, 2763,2765,
2784,2784, 2790,2799, 2817,2819, 2821,2828, 2831,2832,
2835,2856, 2858,2864, 2866,2867, 2870,2873, 2876,2883,
2887,2888, 2891,2893, 2902,2903, 2908,2909, 2911,2913,
2918,2927, 2946,2947, 2949,2954, 2958,2960, 2962,2965,
2969,2970, 2972,2972, 2974,2975, 2979,2980, 2984,2986,
2990,2997, 2999,3001, 3006,3010, 3014,3016, 3018,3021,
3031,3031, 3047,3055, 3073,3075, 3077,3084, 3086,3088,
3090,3112, 3114,3123, 3125,3129, 3134,3140, 3142,3144,
3146,3149, 3157,3158, 3168,3169, 3174,3183, 3202,3203,
3205,3212, 3214,3216, 3218,3240, 3242,3251, 3253,3257,
3262,3268, 3270,3272, 3274,3277, 3285,3286, 3294,3294,
3296,3297, 3302,3311, 3330,3331, 3333,3340, 3342,3344,
3346,3368, 3370,3385, 3390,3395, 3398,3400, 3402,3405,
3415,3415, 3424,3425, 3430,3439, 3585,3630, 3632,3642,
3648,3662, 3664,3673, 3713,3714, 3716,3716, 3719,3720,
3722,3722, 3725,3725, 3732,3735, 3737,3743, 3745,3747,
3749,3749, 3751,3751, 3754,3755, 3757,3758, 3760,3769,
3771,3773, 3776,3780, 3782,3782, 3784,3789, 3792,3801,
3804,3805, 3864,3865, 3872,3881, 3893,3893, 3895,3895,
3897,3897, 3902,3911, 3913,3945, 3953,3972, 3974,3979,
3984,3989, 3991,3991, 3993,4013, 4017,4023, 4025,4025,
4256,4293, 4304,4342, 4352,4441, 4447,4514, 4520,4601,
7680,7835, 7840,7929, 7936,7957, 7960,7965, 7968,8005,
8008,8013, 8016,8023, 8025,8025, 8027,8027, 8029,8029,
8031,8061, 8064,8116, 8118,8124, 8126,8126, 8130,8132,
8134,8140, 8144,8147, 8150,8155, 8160,8172, 8178,8180,
8182,8188, 8204,8207, 8234,8238, 8298,8303, 8319,8319,
8400,8412, 8417,8417, 8450,8450, 8455,8455, 8458,8467,
8469,8469, 8472,8477, 8484,8484, 8486,8486, 8488,8488,
8490,8497, 8499,8504, 8544,8578, 12293,12293, 12295,12295,
12321,12335, 12337,12341, 12353,12436, 12441,12446, 12449,12538,
12540,12542, 12549,12588, 12593,12686, 19968,40869, 44032,55203,
63744,64045, 64256,64262, 64275,64279, 64286,64296, 64298,64310,
64312,64316, 64318,64318, 64320,64321, 64323,64324, 64326,64433,
64467,64829, 64848,64911, 64914,64967, 65008,65019, 65056,65059,
65136,65470, 65474,65479, 65482,65487, 65490,65495, 65498,65500
};
public static boolean isNameC(char c)
{
return find(c, sNameC);
}
public static boolean isNameStart(char c)
{
return find(c, sNameStart);
}
// binary-search to find out if C is in one of the ranges in the
// map. Remember that the map consists of pairs, not individuals.
// If this turns into a horrible performance bottleneck, we could
// put the maps into a 64k byte array or as a compromise 2 * 8k bitmaps; the
// pair-array trick uses about 4k for both, at the cost of all this
// binary searching
private static boolean find(char c, char[] map)
{
int high, low, probe;
low = -1; high = map.length/2;
while ((high - low) > 1)
{
// invariant (modulo division by 2):
// map[high] is strictly greater than c
probe = (high + low) / 2;
if (c < map[probe * 2])
high = probe;
else
low = probe;
}
return (low != -1 && c >= map[low*2] && c <= map[(low*2) + 1]);
}
}
-------------- next part --------------
Cheers, Tim Bray
tbray at textuality.com http://www.textuality.com/ +1-604-708-9592
More information about the Xml-dev
mailing list