# https://rushter.com/blog/python-strings-and-memory/
# https://github.com/abarnert/superhackyinternals/blob/master/internals.py
import ctypes
class LegacyUnion(ctypes.Union):
    """Overlay for the ``data`` pointer of a non-compact ``str`` object.

    Every member aliases the same pointer-sized storage; ``any`` is the
    untyped view and the others are typed views selected by the string's
    ``kind``.
    """

    _fields_ = [("any", ctypes.c_void_p)] + [
        (member, ctypes.POINTER(unit))
        for member, unit in (
            ("latin1", ctypes.c_uint8),  # Py_UCS1 *
            ("ucs2", ctypes.c_uint16),  # Py_UCS2 *
            ("ucs4", ctypes.c_uint32),  # Py_UCS4 *
        )
    ]
class PyUnicodeObject(ctypes.Structure):
    """ctypes overlay for the in-memory header of a CPython ``str`` object.

    The nested C structs ``PyASCIIObject`` -> ``PyCompactUnicodeObject`` ->
    ``PyUnicodeObject`` are flattened into one structure so that field
    offsets can be used to locate the inline character data.

    NOTE(review): this layout contains ``wstr`` and ``wstr_length``, which
    PEP 623 removed in CPython 3.12; re-check the offsets before relying on
    this structure with newer interpreters.
    """

    # Values of the ``interned`` state bits.
    SSTATE_NOT_INTERNED = 0
    SSTATE_INTERNED_MORTAL = 1
    SSTATE_INTERNED_IMMORTAL = 2

    # Values of the ``kind`` state bits.
    PyUnicode_WCHAR_KIND = 0
    PyUnicode_1BYTE_KIND = 1
    PyUnicode_2BYTE_KIND = 2
    PyUnicode_4BYTE_KIND = 4

    _fields_ = [
        # PyASCIIObject
        # ob_refcnt is a Py_ssize_t in C.  c_long is only 32 bits wide on
        # 64-bit Windows and would truncate the count there.
        ("ob_refcnt", ctypes.c_ssize_t),
        ("ob_type", ctypes.c_void_p),
        ("length", ctypes.c_ssize_t),
        ("hash", ctypes.c_ssize_t),
        # State bit field, packed into a single unsigned int.
        ("interned", ctypes.c_uint, 2),
        ("kind", ctypes.c_uint, 3),
        ("compact", ctypes.c_uint, 1),
        ("ascii", ctypes.c_uint, 1),
        ("ready", ctypes.c_uint, 1),
        ("padding", ctypes.c_uint, 24),
        ("wstr", ctypes.POINTER(ctypes.c_wchar)),
        # PyCompactUnicodeObject
        ("utf8_length", ctypes.c_ssize_t),
        ("utf8", ctypes.c_char_p),
        ("wstr_length", ctypes.c_ssize_t),
        # PyUnicodeObject
        ("data", LegacyUnion),
    ]
# Maps each ``kind`` value of a string header to the ctypes element type
# used to read one character of its buffer.
_KINDS = dict(
    (
        (PyUnicodeObject.PyUnicode_WCHAR_KIND, ctypes.c_wchar),
        (PyUnicodeObject.PyUnicode_1BYTE_KIND, ctypes.c_uint8),
        (PyUnicodeObject.PyUnicode_2BYTE_KIND, ctypes.c_uint16),
        (PyUnicodeObject.PyUnicode_4BYTE_KIND, ctypes.c_uint32),
    )
)
def get_data(s):
    """Return the raw character buffer of the ``str`` object *s* as bytes.

    The object header is read through ``PyUnicodeObject`` at ``id(s)`` and
    the buffer is located following the rules of the C header file:

    * compact ASCII strings store their characters at the offset of
      ``utf8_length``;
    * other compact strings store them at the offset of ``data``;
    * ``PyUnicode_WCHAR_KIND`` strings store them behind ``wstr``, with the
      element count in ``wstr_length``;
    * remaining non-compact strings store them behind the ``data`` pointer.

    Every branch yields ``bytes``.  Non-compact strings used to come back as
    raw ctypes pointers, which callers expecting ``bytes`` (for example
    ``print_data`` calling ``.hex``) could not handle.

    Args:
        s: The string object to inspect.

    Returns:
        A ``bytes`` copy of the buffer, or ``b""`` when the buffer pointer
        of a non-compact string is NULL.

    Raises:
        KeyError: If the header's ``kind`` bits hold a value not in ``_KINDS``.
    """
    p = PyUnicodeObject.from_address(id(s))
    t = _KINDS[p.kind]
    length = p.length
    if p.compact:
        # Inline buffer: it starts where the first unused header field of
        # the corresponding C struct would be.
        if p.ascii:
            # ASCII buffer comes right after wstr
            anchor = PyUnicodeObject.utf8_length
        else:
            # UCS1/2/4 buffer comes right after wstr_length
            anchor = PyUnicodeObject.data
        addr = id(s) + anchor.offset
    elif p.kind == p.PyUnicode_WCHAR_KIND:
        # Note that this goes with wstr_length, not length!
        addr = ctypes.cast(p.wstr, ctypes.c_void_p).value
        length = p.wstr_length
    else:
        # All typed members of the union alias the same pointer, so the
        # untyped view is enough; ``t`` already carries the element width.
        addr = p.data.any
    if not addr:
        # NULL pointer: there is no buffer to copy.
        return b""
    return bytes((t * length).from_address(addr))
def print_data(s):
    """Print *s*, its raw buffer and the buffer as spaced hex, then a blank line.

    The buffer is whatever ``get_data(s)`` returns.
    """
    raw = get_data(s)
    print(s)
    print(raw)
    # Two newlines: the first ends the hex line, the second is the blank
    # separator line.
    print(raw.hex(" "), end="\n\n")
# Demo: one sample per storage width.  The output recorded under each sample
# shows multi-byte characters in little-endian order.
for _sample in (
    "Hello",
    # Hello
    # b'Hello'
    # 48 65 6c 6c 6f
    "Hellö",
    # Hellö
    # b'Hell\xf6'
    # 48 65 6c 6c f6
    "Hell\u2764",
    # Hell❤
    # b"H\x00e\x00l\x00l\x00d'"
    # 48 00 65 00 6c 00 6c 00 64 27
    "Hell\U00010000",
    # Hell𐀀  (U+10000; may show as "?" where the glyph is unavailable)
    # b'H\x00\x00\x00e\x00\x00\x00l\x00\x00\x00l\x00\x00\x00\x00\x00\x01\x00'
    # 48 00 00 00 65 00 00 00 6c 00 00 00 6c 00 00 00 00 00 01 00
):
    print_data(_sample)