#!/usr/bin/env python2
#
# Extract rules for Unicode case conversion, specifically the behavior
# required by ECMAScript E5 in Sections 15.5.4.16 to 15.5.4.19. The
# bitstream-encoded rules are used for the slow path at run time, so
# compactness is favored over speed.
#
# There is no support for context- or locale-sensitive rules, as they
# are handled directly in C code before consulting the tables generated
# here.  ECMAScript requires case conversion both with and without
# locale/language-specific rules (e.g. String.prototype.toLowerCase()
# and String.prototype.toLocaleLowerCase()), so they are best handled
# in C anyway.
#
# Case conversion rules for ASCII are also excluded as they are handled
# by the C fast path. Rules for non-BMP characters (codepoints above
# U+FFFF) are omitted as they're not required for standard ECMAScript.
#
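# Example invocation (the input/output file names here are illustrative
# only; the option names and defaults come from main() below):
#
#   $ python2 extract_caseconv.py \
#         --command=caseconv_bitpacked \
#         --unicode-data=UnicodeData.txt \
#         --special-casing=SpecialCasing.txt \
#         --out-source=duk_caseconv.c \
#         --out-header=duk_caseconv.h
#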
import os
import sys
import re
import math
import optparse
import dukutil
class UnicodeData:
"""Read UnicodeData.txt into an internal representation."""
def __init__(self, filename):
self.data = self.read_unicode_data(filename)
print('read %d unicode data entries' % len(self.data))
def read_unicode_data(self, filename):
res = []
f = open(filename, 'rb')
for line in f:
if line.startswith('#'):
continue
line = line.strip()
if line == '':
continue
parts = line.split(';')
if len(parts) != 15:
raise Exception('invalid unicode data line')
res.append(parts)
f.close()
# Sort based on Unicode codepoint.
def mycmp(a,b):
return cmp(long(a[0], 16), long(b[0], 16))
res.sort(cmp=mycmp)
return res
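# A UnicodeData.txt line consists of 15 semicolon-separated fields.
# For example, the entry for U+0041 (LATIN CAPITAL LETTER A) is:
#
#   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
#
# Fields 12, 13, and 14 hold the simple uppercase, lowercase, and
# titlecase mappings; here only field 13 (lowercase, 0061 = 'a') is
# non-empty.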
class SpecialCasing:
"""Read SpecialCasing.txt into an internal representation."""
def __init__(self, filename):
self.data = self.read_special_casing_data(filename)
print('read %d special casing entries' % len(self.data))
def read_special_casing_data(self, filename):
res = []
f = open(filename, 'rb')
for line in f:
try:
idx = line.index('#')
line = line[:idx]
except ValueError:
pass
line = line.strip()
if line == '':
continue
parts = line.split(';')
parts = [i.strip() for i in parts]
while len(parts) < 6:
parts.append('')
res.append(parts)
f.close()
return res
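# SpecialCasing.txt lines have the form:
#
#   <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
#
# For example, German sharp s uppercases to "SS":
#
#   00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
#
# Entries with a non-empty condition list (field 4) are skipped by
# update_special_casings() below: conditional and locale sensitive
# rules are handled directly in C.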
def parse_unicode_sequence(x):
"""Parse a Unicode sequence like ABCD 1234 into a unicode string."""
res = ''
for i in x.split(' '):
i = i.strip()
if i == '':
continue
res += unichr(long(i, 16))
return res
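# For example:
#
#   parse_unicode_sequence('0053 0073') == u'\u0053\u0073'   # u'Ss'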
def get_base_conversion_maps(unicode_data):
"""Create case conversion tables without handling special casing yet."""
uc = {} # uppercase, codepoint (number) -> string
lc = {} # lowercase
tc = {} # titlecase
for x in unicode_data.data:
c1 = long(x[0], 16)
# just 16-bit support needed
if c1 >= 0x10000:
continue
if x[12] != '':
# field 12: simple uppercase mapping
c2 = parse_unicode_sequence(x[12])
uc[c1] = c2
tc[c1] = c2 # titlecase default == uppercase, overridden below if necessary
if x[13] != '':
# field 13: simple lowercase mapping
c2 = parse_unicode_sequence(x[13])
lc[c1] = c2
if x[14] != '':
# field 14: simple titlecase mapping
c2 = parse_unicode_sequence(x[14])
tc[c1] = c2
return uc, lc, tc
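# For example, after processing UnicodeData.txt the base maps contain
# entries like:
#
#   uc[0x0061] == u'A'   # 'a' uppercases to 'A'
#   lc[0x0041] == u'a'   # 'A' lowercases to 'a'
#   tc[0x0061] == u'A'   # titlecase defaults to uppercase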
def update_special_casings(uc, lc, tc, special_casing):
"""Update case conversion tables with special case conversion rules."""
for x in special_casing.data:
c1 = long(x[0], 16)
if x[4] != '':
# conditions
continue
lower = parse_unicode_sequence(x[1])
title = parse_unicode_sequence(x[2])
upper = parse_unicode_sequence(x[3])
if len(lower) > 1:
lc[c1] = lower
if len(upper) > 1:
uc[c1] = upper
if len(title) > 1:
tc[c1] = title
print('- special case: %d %d %d' % (len(lower), len(upper), len(title)))
def remove_ascii_part(convmap):
"""Remove ASCII case conversion parts (handled by C fast path)."""
for i in xrange(128):
if convmap.has_key(i):
del convmap[i]
def scan_range_with_skip(convmap, start_idx, skip):
"""Scan for a range of continuous case conversion with a certain 'skip'."""
conv_i = start_idx
if not convmap.has_key(conv_i):
return None, None, None
elif len(convmap[conv_i]) > 1:
return None, None, None
else:
conv_o = ord(convmap[conv_i])
start_i = conv_i
start_o = conv_o
while True:
new_i = conv_i + skip
new_o = conv_o + skip
if not convmap.has_key(new_i):
break
if len(convmap[new_i]) > 1:
break
if ord(convmap[new_i]) != new_o:
break
conv_i = new_i
conv_o = new_o
# [start_i,conv_i] maps to [start_o,conv_o], ignore ranges of 1 char.
count = (conv_i - start_i) / skip + 1
if count <= 1:
return None, None, None
# We have an acceptable range, remove them from the convmap here.
for i in xrange(start_i, conv_i + skip, skip):
del convmap[i]
return start_i, start_o, count
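# For example, in the lowercase conversion map the Latin Extended-A
# uppercase letters sit at even codepoints with their lowercase
# counterparts at the next odd codepoint (U+0100 -> U+0101,
# U+0102 -> U+0103, ...), so scan_range_with_skip(convmap, 0x0100, 2)
# detects a skip=2 range with start_i=0x0100 and start_o=0x0101.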
def find_first_range_with_skip(convmap, skip):
"""Find first range with a certain 'skip' value."""
for i in xrange(65536):
start_i, start_o, count = scan_range_with_skip(convmap, i, skip)
if start_i is None:
continue
return start_i, start_o, count
return None, None, None
def generate_caseconv_tables(convmap):
"""Generate bit-packed case conversion table for a given conversion map."""
    # The bitstream encoding is based on manual inspection of the
    # regularities in the Unicode case conversion rules.
    #
    # Start with the full case conversion map; codepoints not present
    # in the map convert to themselves.  Scan for range-to-range
    # mappings with increasing skip values starting from 1.  Whenever a
    # valid range is found, remove it from the map.  Finally, output
    # the remaining case conversions (1:1 and 1:n) on a per-codepoint
    # basis.
    #
    # This is very slow because each scan restarts from scratch, but
    # it's the simplest and most reliable way to scan.
print('generate caseconv tables')
ranges = [] # range mappings (2 or more consecutive mappings with a certain skip)
singles = [] # 1:1 character mappings
multis = [] # 1:n character mappings
# Ranges with skips
for skip in xrange(1,6+1): # skips 1...6 are useful
while True:
start_i, start_o, count = find_first_range_with_skip(convmap, skip)
if start_i is None:
break
print('- skip %d: %d %d %d' % (skip, start_i, start_o, count))
ranges.append([start_i, start_o, count, skip])
# 1:1 conversions
k = convmap.keys()
k.sort()
for i in k:
if len(convmap[i]) > 1:
continue
singles.append([i, ord(convmap[i])]) # codepoint, codepoint
del convmap[i]
    # There are many mappings to 2-char sequences whose latter char is
    # U+0399.  These could be handled as a special case, but we don't
    # do that right now.
#
# [8064L, u'\u1f08\u0399']
# [8065L, u'\u1f09\u0399']
# [8066L, u'\u1f0a\u0399']
# [8067L, u'\u1f0b\u0399']
# [8068L, u'\u1f0c\u0399']
# [8069L, u'\u1f0d\u0399']
# [8070L, u'\u1f0e\u0399']
# [8071L, u'\u1f0f\u0399']
# ...
#
# tmp = {}
# k = convmap.keys()
# k.sort()
# for i in k:
# if len(convmap[i]) == 2 and convmap[i][1] == u'\u0399':
# tmp[i] = convmap[i][0]
# del convmap[i]
# print(repr(tmp))
#
# skip = 1
# while True:
# start_i, start_o, count = find_first_range_with_skip(tmp, skip)
# if start_i is None:
# break
# print('- special399, skip %d: %d %d %d' % (skip, start_i, start_o, count))
# print(len(tmp.keys()))
# print(repr(tmp))
# XXX: need to put 12 remaining mappings back to convmap
# 1:n conversions
k = convmap.keys()
k.sort()
for i in k:
multis.append([i, convmap[i]]) # codepoint, string
del convmap[i]
    for t in singles:
        print('- singles: ' + repr(t))
    for t in multis:
        print('- multis: ' + repr(t))
    print('- range mappings: %d' % len(ranges))
    print('- single character mappings: %d' % len(singles))
    print('- complex mappings (1:n): %d' % len(multis))
    print('- remaining (should be zero): %d' % len(convmap.keys()))
# XXX: opportunities for diff encoding skip=3 ranges?
prev = None
for t in ranges:
# range: [start_i, start_o, count, skip]
if t[3] != 3:
continue
        if prev is not None:
            print('- %d %d' % (t[0] - prev[0], t[1] - prev[1]))
        else:
            print('- start: %d %d' % (t[0], t[1]))
prev = t
# Bit packed encoding.
be = dukutil.BitEncoder()
for curr_skip in xrange(1, 7): # 1...6
count = 0
for r in ranges:
start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
if skip != curr_skip:
continue
count += 1
be.bits(count, 6)
print('- encode: skip=%d, count=%d' % (curr_skip, count))
for r in ranges:
start_i, start_o, r_count, skip = r[0], r[1], r[2], r[3]
if skip != curr_skip:
continue
be.bits(start_i, 16)
be.bits(start_o, 16)
be.bits(r_count, 7)
be.bits(0x3f, 6) # maximum count value = end of skips
count = len(singles)
be.bits(count, 7)
for t in singles:
cp_i, cp_o = t[0], t[1]
be.bits(cp_i, 16)
be.bits(cp_o, 16)
count = len(multis)
be.bits(count, 7)
for t in multis:
cp_i, str_o = t[0], t[1]
be.bits(cp_i, 16)
be.bits(len(str_o), 2)
for i in xrange(len(str_o)):
be.bits(ord(str_o[i]), 16)
return be.getBytes(), be.getNumBits()
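# The bitstream emitted by generate_caseconv_tables() has the following
# layout (summarized directly from the be.bits() calls above; the C
# decoder is expected to read the same fields in the same order):
#
#   for each skip value 1...6:
#       6 bits    number of ranges with this skip
#       for each such range:
#           16 bits   start_i (first input codepoint)
#           16 bits   start_o (first output codepoint)
#            7 bits   number of mappings in the range
#   6 bits    0x3f end-of-skips marker (0x3f is the maximum count value)
#   7 bits    number of 1:1 mappings, then 16+16 bits per mapping
#   7 bits    number of 1:n mappings, then per mapping: 16 bits input
#             codepoint, 2 bits output length n, and n*16 bits of output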
def generate_regexp_canonicalize_tables(convmap):
"""Generate tables for case insensitive RegExp normalization."""
# Generate a direct codepoint lookup for canonicalizing BMP range.
def generate_canontab():
res = []
highest_nonid = -1
for cp in xrange(65536):
res_cp = cp # default to as is
if convmap.has_key(cp):
tmp = convmap[cp]
                if len(tmp) == 1:
                    # Only 1:1 mappings are canonicalized here; 1:n
                    # mappings from the input are ignored.
                    res_cp = ord(tmp[0])
if cp >= 0x80 and res_cp < 0x80:
res_cp = cp # If non-ASCII mapped to ASCII, ignore.
if cp != res_cp:
highest_nonid = cp
res.append(res_cp)
        # At the moment this is 65370, so there's very little gain in
        # assuming a 1:1 mapping above a certain BMP codepoint (though
        # we do assume a 1:1 mapping for codepoints above the BMP).
print('- highest non-identity mapping: %d' % highest_nonid)
return res
print('generate canontab')
canontab = generate_canontab()
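    # A few illustrative canontab entries (with the simple uppercase map
    # as the input convmap, as main() uses):
    #
    #   canontab[0x0061] == 0x0041   # 'a' canonicalizes to 'A'
    #   canontab[0x00E4] == 0x00C4   # a-with-diaeresis uppercases normally
    #   canontab[0x0131] == 0x0131   # dotless i uppercases to ASCII 'I',
    #                                # so the mapping is ignored above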
# Figure out which BMP values are never the result of canonicalization.
# Such codepoints are "don't care" in the sense that they are never
# matched against at runtime: ranges are canonicalized at compile time,
    # and the codepoint being matched is also canonicalized at run time.
# (Currently unused.)
def generate_dontcare():
res = [ True ] * 65536
for cp in canontab:
res[cp] = False
res_count = 0
for x in res:
if x:
res_count += 1
print('- %d dontcare codepoints' % res_count)
return res
print('generate canon dontcare')
dontcare = generate_dontcare()
# Generate maximal continuous ranges for canonicalization. A continuous
# range is a sequence with N codepoints where IN+i canonicalizes to OUT+i
# for fixed IN, OUT, and i in 0...N-1. There are unfortunately >1000
# of these ranges, mostly because there are a lot of individual exceptions.
# (Currently unused.)
canon_ranges = []
for cp in xrange(65536):
canon_ranges.append([ cp, canontab[cp], 1 ]) # 1 codepoint ranges at first
def merge_compatible_nogap(rng1, rng2):
# Merge adjacent ranges if continuity allows.
if rng1[0] + rng1[2] == rng2[0] and \
rng1[1] + rng1[2] == rng2[1]:
return [ rng1[0], rng1[1], rng1[2] + rng2[2] ]
return None
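    # For example, [ 0x0041, 0x0061, 2 ] and [ 0x0043, 0x0063, 1 ] merge
    # into [ 0x0041, 0x0061, 3 ] because 0x41 + 2 == 0x43 and
    # 0x61 + 2 == 0x63.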
def merge_check_nogap():
len_start = len(canon_ranges)
for i in xrange(len(canon_ranges) - 1):
j = i + 1
rng1 = canon_ranges[i]
rng2 = canon_ranges[j]
if rng1 is None or rng2 is None: continue
merged = merge_compatible_nogap(rng1, rng2)
if merged is not None:
canon_ranges[j] = None
canon_ranges[i] = merged
filtered = []
for x in canon_ranges:
if x is not None:
filtered.append(x)
len_end = len(filtered)
if len_end < len_start:
return filtered
return None
print('generate canon_ranges')
while True:
# Starting from individual ranges of 1 codepoint, merge adjacent
# ranges until no more ranges can be merged.
t = merge_check_nogap()
if t is None:
break
canon_ranges = t
print('- %d ranges' % len(canon_ranges))
#for rng in canon_ranges:
# print('canon_ranges:')
# print(repr(rng))
# Generate true/false ranges for BMP codepoints where:
# - A codepoint is flagged true if continuity is broken at that point, so
# an explicit codepoint canonicalization is needed at runtime.
# - A codepoint is flagged false if case conversion is continuous from the
# previous codepoint, i.e. out_curr = out_prev + 1.
#
# The result is a lot of small ranges due to a lot of small 'false' ranges.
# Reduce the range set by checking if adjacent 'true' ranges have at most
# false_limit 'false' entries between them. If so, force the 'false'
# entries to 'true' (safe but results in an unnecessary runtime codepoint
# lookup) and merge the three ranges into a larger 'true' range.
#
# (Currently unused.)
def generate_needcheck_straight():
res = [ True ] * 65536
assert(canontab[0] == 0) # can start from in == out == 0
prev_in = -1
prev_out = -1
for i in xrange(65536):
# First create a straight true/false bitmap for BMP.
curr_in = i
curr_out = canontab[i]
if prev_in + 1 == curr_in and prev_out + 1 == curr_out:
res[i] = False
prev_in = curr_in
prev_out = curr_out
return res
def generate_needcheck_ranges(data):
# Generate maximal accurate ranges.
prev = None
count = 0
ranges = []
for i in data:
if prev is None or prev != i:
if prev is not None:
ranges.append([ prev, count ])
prev = i
count = 1
else:
count += 1
if prev is not None:
ranges.append([ prev, count ])
return ranges
def fillin_needcheck_ranges(data, false_limit):
# Fill in TRUE-FALSE*N-TRUE gaps into TRUE-TRUE*N-TRUE which is
# safe (leads to an unnecessary runtime check) but reduces
# range data size considerably.
res = []
for r in data:
res.append([ r[0], r[1] ])
while True:
found = False
for i in xrange(len(res) - 2):
r1 = res[i]
r2 = res[i + 1]
r3 = res[i + 2]
if r1[0] == True and r2[0] == False and r3[0] == True and \
r2[1] <= false_limit:
#print('fillin %d falses' % r2[1])
res.pop(i + 2)
res.pop(i + 1)
res[i] = [ True, r1[1] + r2[1] + r3[1] ]
found = True
break
if not found:
break
return res
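    # For example, with false_limit >= 2:
    #
    #   [ [ True, 5 ], [ False, 2 ], [ True, 7 ] ] -> [ [ True, 14 ] ]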
print('generate needcheck straight')
needcheck = generate_needcheck_straight()
print('generate needcheck without false fillins')
needcheck_ranges1 = generate_needcheck_ranges(needcheck)
print('- %d ranges' % len(needcheck_ranges1))
#print(needcheck_ranges1)
print('generate needcheck with false fillins')
needcheck_ranges2 = fillin_needcheck_ranges(needcheck_ranges1, 11)
print('- %d ranges' % len(needcheck_ranges2))
#print(needcheck_ranges2)
# Generate a bitmap for BMP, divided into N-codepoint blocks, with each
# bit indicating: "entire codepoint block canonicalizes continuously, and
# the block is continuous with the previous and next block". A 'true'
# entry allows runtime code to just skip the block, advancing 'in' and
# 'out' by the block size, with no codepoint conversion. The block size
# should be large enough to produce a relatively small lookup table, but
# small enough to reduce codepoint conversions to a manageable number
# because the conversions are (currently) quite slow. This matters
# especially for case-insensitive RegExps; without any optimization,
# /[\u0000-\uffff]/i requires 65536 case conversions for runtime
# normalization.
block_shift = 5
block_size = 1 << block_shift
block_mask = block_size - 1
num_blocks = 65536 / block_size
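    # With block_shift = 5 there are 65536 / 32 = 2048 blocks; at 8 blocks
    # per bitmap byte the final bitmap is 2048 / 8 = 256 bytes.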
def generate_block_bits(check_continuity):
res = [ True ] * num_blocks
for i in xrange(num_blocks):
base_in = i * block_size
base_out = canontab[base_in]
if check_continuity:
lower = -1 # [-1,block_size]
upper = block_size + 1
else:
lower = 0 # [0,block_size-1]
upper = block_size
for j in xrange(lower, upper):
cp = base_in + j
if cp >= 0x0000 and cp <= 0xffff and canontab[cp] != base_out + j:
res[i] = False
break
return res
def dump_block_bitmap(bits):
tmp = ''.join([ ({ True: 'x', False: '.' })[b] for b in bits])
tmp = re.sub(r'.{64}', lambda x: x.group(0) + '\n', tmp)
blocks_true = tmp.count('x')
blocks_false = tmp.count('.')
print('%d codepoint blocks are continuous, %d blocks are not' % (blocks_true, blocks_false))
sys.stdout.write(tmp)
#print(bits)
def dump_test_lookup(bits):
        sys.stdout.write('duk_uint8_t test = {')
for b in bits:
if b:
sys.stdout.write('1,')
else:
sys.stdout.write('0,')
sys.stdout.write('};\n')
def convert_to_bitmap(bits):
# C code looks up bits as:
# index = codepoint >> N
# bitnum = codepoint & mask
# bitmask = 1 << bitnum
# So block 0 is mask 0x01 of first byte, block 1 is mask 0x02 of
# first byte, etc.
res = []
curr = 0
mask = 0x01
for b in bits:
if b:
curr += mask
mask = mask * 2
if mask == 0x100:
res.append(curr)
curr = 0
mask = 0x01
assert(mask == 0x01) # no leftover
return res
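    # Sketch of the corresponding C-side lookup for codepoint 'cp'
    # (illustrative only, not the actual Duktape source):
    #
    #   block   = cp >> DUK_CANON_BITMAP_BLKSHIFT;
    #   byte    = block >> 3;
    #   bitmask = 1 << (block & 0x07);
    #   in_continuous_block = (bitmap[byte] & bitmask) != 0;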
print('generate canon block bitmap without continuity')
block_bits1 = generate_block_bits(False)
dump_block_bitmap(block_bits1)
dump_test_lookup(block_bits1)
print('generate canon block bitmap with continuity')
block_bits2 = generate_block_bits(True)
dump_block_bitmap(block_bits2)
dump_test_lookup(block_bits2)
print('generate final canon bitmap')
block_bitmap = convert_to_bitmap(block_bits2)
print('- %d bytes' % len(block_bitmap))
print('- ' + repr(block_bitmap))
canon_bitmap = {
'data': block_bitmap,
'block_size': block_size,
'block_shift': block_shift,
'block_mask': block_mask
}
    # This is useful for figuring out corner case test inputs.
print('canon blocks which are different with and without continuity check')
for i in xrange(num_blocks):
if block_bits1[i] != block_bits2[i]:
print('- block %d ([%d,%d]) differs' % (i, i * block_size, i * block_size + block_size - 1))
return canontab, canon_bitmap
def clonedict(x):
"Shallow clone of input dict."
res = {}
for k in x.keys():
res[k] = x[k]
return res
def main():
parser = optparse.OptionParser()
parser.add_option('--command', dest='command', default='caseconv_bitpacked')
parser.add_option('--unicode-data', dest='unicode_data')
parser.add_option('--special-casing', dest='special_casing')
parser.add_option('--out-source', dest='out_source')
parser.add_option('--out-header', dest='out_header')
parser.add_option('--table-name-lc', dest='table_name_lc', default='caseconv_lc')
parser.add_option('--table-name-uc', dest='table_name_uc', default='caseconv_uc')
parser.add_option('--table-name-re-canon-lookup', dest='table_name_re_canon_lookup', default='caseconv_re_canon_lookup')
parser.add_option('--table-name-re-canon-bitmap', dest='table_name_re_canon_bitmap', default='caseconv_re_canon_bitmap')
(opts, args) = parser.parse_args()
unicode_data = UnicodeData(opts.unicode_data)
special_casing = SpecialCasing(opts.special_casing)
uc, lc, tc = get_base_conversion_maps(unicode_data)
update_special_casings(uc, lc, tc, special_casing)
if opts.command == 'caseconv_bitpacked':
        # XXX: ASCII and non-BMP filtering could be made options but are
        # currently hardcoded.  ASCII is handled by the C fast path so
        # it is not needed here.
t = clonedict(uc)
remove_ascii_part(t)
uc_bytes, uc_nbits = generate_caseconv_tables(t)
t = clonedict(lc)
remove_ascii_part(t)
lc_bytes, lc_nbits = generate_caseconv_tables(t)
# Generate C source and header files.
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitArray(uc_bytes, opts.table_name_uc, size=len(uc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
genc.emitArray(lc_bytes, opts.table_name_lc, size=len(lc_bytes), typename='duk_uint8_t', intvalues=True, const=True)
f = open(opts.out_source, 'wb')
f.write(genc.getString())
f.close()
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_uc, len(uc_bytes)))
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_lc, len(lc_bytes)))
f = open(opts.out_header, 'wb')
f.write(genc.getString())
f.close()
elif opts.command == 're_canon_lookup':
        # Direct canonicalization lookup for case insensitive RegExps;
        # includes the ASCII part.
t = clonedict(uc)
re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitArray(re_canon_lookup, opts.table_name_re_canon_lookup, size=len(re_canon_lookup), typename='duk_uint16_t', intvalues=True, const=True)
f = open(opts.out_source, 'wb')
f.write(genc.getString())
f.close()
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitLine('extern const duk_uint16_t %s[%d];' % (opts.table_name_re_canon_lookup, len(re_canon_lookup)))
f = open(opts.out_header, 'wb')
f.write(genc.getString())
f.close()
elif opts.command == 're_canon_bitmap':
# N-codepoint block bitmap for skipping continuous codepoint blocks
# quickly.
t = clonedict(uc)
re_canon_lookup, re_canon_bitmap = generate_regexp_canonicalize_tables(t)
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitArray(re_canon_bitmap['data'], opts.table_name_re_canon_bitmap, size=len(re_canon_bitmap['data']), typename='duk_uint8_t', intvalues=True, const=True)
f = open(opts.out_source, 'wb')
f.write(genc.getString())
f.close()
genc = dukutil.GenerateC()
genc.emitHeader('extract_caseconv.py')
genc.emitDefine('DUK_CANON_BITMAP_BLKSIZE', re_canon_bitmap['block_size'])
genc.emitDefine('DUK_CANON_BITMAP_BLKSHIFT', re_canon_bitmap['block_shift'])
genc.emitDefine('DUK_CANON_BITMAP_BLKMASK', re_canon_bitmap['block_mask'])
genc.emitLine('extern const duk_uint8_t %s[%d];' % (opts.table_name_re_canon_bitmap, len(re_canon_bitmap['data'])))
f = open(opts.out_header, 'wb')
f.write(genc.getString())
f.close()
else:
raise Exception('invalid command: %r' % opts.command)
if __name__ == '__main__':
main()