import sys
# UTF-16 surrogate code points span 0xD800..0xDFFF.  The range is stored as a
# half-open (start, stop) pair, so membership is ``start <= n < stop``.
_SURROGATE_CODEPOINTS_RANGE = (0xD800, 0xE000)
# First trailing surrogate; surrogates below this value are leading ones.
_TRAILING_SURROGATE_START = 0xDC00
def is_surrogate(n):
    """Return True when *n* falls inside the surrogate code point range.

    The bounds come from ``_SURROGATE_CODEPOINTS_RANGE``, which is treated as
    a half-open interval: the first element is included, the second is not.
    """
    first, past_last = _SURROGATE_CODEPOINTS_RANGE
    return first <= n < past_last
def cp_to_char(codepoint):
    """Return the text string that represents *codepoint*.

    Args:
        codepoint: Integer Unicode code point.

    Returns:
        A unicode string for the code point.  On a Python 2 build whose
        ``sys.maxunicode`` is not 0x10FFFF the string is produced by decoding
        a ``\\UXXXXXXXX`` escape instead of calling ``unichr``.
    """
    if sys.version_info[0] >= 3:
        # ``unichr`` does not exist on Python 3; ``chr`` is its replacement
        # and accepts the whole Unicode range.
        return chr(codepoint)
    if sys.maxunicode == 0x10FFFF:
        return unichr(codepoint)
    # Narrow Python 2 build: go through the unicode-escape codec rather than
    # unichr, which is only used above when the full range is available.
    return ("\\U%08x" % codepoint).decode('unicode-escape')
def codepoints_from_string(unistr):
    """Return the code points of *unistr* as a list of integers.

    When ``sys.maxunicode`` is 0x10FFFF every element of the string is
    converted with ``ord``.  Otherwise the string is passed through
    ``_codepoints_from_utf16`` and its output is collected into a list.
    """
    full_range_build = sys.maxunicode == 0x10FFFF
    if not full_range_build:
        return list(_codepoints_from_utf16(unistr))
    return list(map(ord, unistr))
def _is_leading_surrogate(code_unit):
    """Return True when *code_unit* is a leading surrogate.

    Leading surrogates run from the start of the surrogate range up to, but
    not including, ``_TRAILING_SURROGATE_START``.
    """
    range_start = _SURROGATE_CODEPOINTS_RANGE[0]
    return range_start <= code_unit < _TRAILING_SURROGATE_START
def _is_trailing_surrogate(code_unit):
    """Return True when *code_unit* is a trailing surrogate.

    Trailing surrogates run from ``_TRAILING_SURROGATE_START`` up to, but not
    including, the exclusive end of the surrogate range.
    """
    range_stop = _SURROGATE_CODEPOINTS_RANGE[1]
    return _TRAILING_SURROGATE_START <= code_unit < range_stop
def _decode_surrogate_pair(leading, trailing):
    """Combine a surrogate pair into the single code point it encodes.

    Args:
        leading: Leading surrogate code unit, as an int.
        trailing: Trailing surrogate code unit, as an int.

    Returns:
        The decoded code point as an int.  The arguments are not validated;
        the arithmetic is applied to whatever values are passed in.
    """
    # Offsets of each unit from the start of its half of the surrogate range.
    # The trailing base uses the shared constant rather than a repeated
    # 0xDC00 literal, matching the surrogate predicates in this module.
    leading_bits = leading - _SURROGATE_CODEPOINTS_RANGE[0]
    trailing_bits = trailing - _TRAILING_SURROGATE_START
    # The leading offset forms the high 10 bits; the result is rebased above
    # the 16-bit range by adding 0x10000.
    return (leading_bits << 10) + trailing_bits + 0x10000
def _codepoints_from_utf16(unistr):
    """Yield code points from a string of UTF-16 code units.

    A leading surrogate immediately followed by a trailing surrogate is
    yielded as one decoded code point.  Any other code unit, including a
    surrogate without a matching partner, is yielded unchanged.

    Asserts that ``sys.maxunicode`` is 0xFFFF when iteration starts.
    """
    assert sys.maxunicode == 0xFFFF
    pending = None  # leading surrogate still waiting for its partner
    for char in unistr:
        unit = ord(char)
        if pending is not None:
            held, pending = pending, None
            if _is_trailing_surrogate(unit):
                yield _decode_surrogate_pair(held, unit)
                continue
            # No partner arrived: emit the lone leading surrogate as-is and
            # carry on processing the current unit below.
            yield held
        if _is_leading_surrogate(unit):
            pending = unit
            continue
        yield unit
    # Input ended right after a leading surrogate.
    if pending is not None:
        yield pending