From anonymous, 1 Month ago, written in Python.
Embed
  1. # https://rushter.com/blog/python-strings-and-memory/
  2. # https://github.com/abarnert/superhackyinternals/blob/master/internals.py
  3.  
  4. import ctypes
  5.  
  6.  
  7. class LegacyUnion(ctypes.Union):
  8.     _fields_ = [
  9.         ("any", ctypes.c_void_p),
  10.         ("latin1", ctypes.POINTER(ctypes.c_uint8)),  # Py_UCS1 *
  11.         ("ucs2", ctypes.POINTER(ctypes.c_uint16)),  # Py_UCS2 *
  12.         ("ucs4", ctypes.POINTER(ctypes.c_uint32)),  # Py_UCS4 *
  13.     ]
  14.  
  15.  
  16. class PyUnicodeObject(ctypes.Structure):
  17.     SSTATE_NOT_INTERNED = 0
  18.     SSTATE_INTERNED_MORTAL = 1
  19.     SSTATE_INTERNED_IMMORTAL = 2
  20.     PyUnicode_WCHAR_KIND = 0
  21.     PyUnicode_1BYTE_KIND = 1
  22.     PyUnicode_2BYTE_KIND = 2
  23.     PyUnicode_4BYTE_KIND = 4
  24.  
  25.     _fields_ = [
  26.         # PyASCIIObject
  27.         ("ob_refcnt", ctypes.c_long),
  28.         ("ob_type", ctypes.c_void_p),
  29.         ("length", ctypes.c_ssize_t),
  30.         ("hash", ctypes.c_ssize_t),
  31.         ("interned", ctypes.c_uint, 2),
  32.         ("kind", ctypes.c_uint, 3),
  33.         ("compact", ctypes.c_uint, 1),
  34.         ("ascii", ctypes.c_uint, 1),
  35.         ("ready", ctypes.c_uint, 1),
  36.         ("padding", ctypes.c_uint, 24),
  37.         ("wstr", ctypes.POINTER(ctypes.c_wchar)),
  38.         # PyCompactUnicodeObject
  39.         ("utf8_length", ctypes.c_ssize_t),
  40.         ("utf8", ctypes.c_char_p),
  41.         ("wstr_length", ctypes.c_ssize_t),
  42.         # PyUnicodeObject
  43.         ("data", LegacyUnion),
  44.     ]
  45.  
  46.  
  47. _KINDS = {
  48.     PyUnicodeObject.PyUnicode_WCHAR_KIND: ctypes.c_wchar,
  49.     PyUnicodeObject.PyUnicode_1BYTE_KIND: ctypes.c_uint8,
  50.     PyUnicodeObject.PyUnicode_2BYTE_KIND: ctypes.c_uint16,
  51.     PyUnicodeObject.PyUnicode_4BYTE_KIND: ctypes.c_uint32,
  52. }
  53.  
  54.  
  55. def get_data(s):
  56.     # Using the official rules from the header file, which could
  57.     # of course be simplified.
  58.     p = PyUnicodeObject.from_address(id(s))
  59.     length = p.length
  60.     t = _KINDS[p.kind]
  61.     if p.compact and p.ascii:
  62.         # ASCII buffer comes right after wstr
  63.         t = ctypes.c_char
  64.         addr = id(s) + PyUnicodeObject.utf8_length.offset
  65.     elif p.compact and not p.ascii:
  66.         # UCS1/2/4 buffer comes right after wstr
  67.         addr = id(s) + PyUnicodeObject.data.offset
  68.     elif p.kind == p.PyUnicode_WCHAR_KIND:
  69.         # Note that this goes with wstr_length, not length!
  70.         return p.wstr
  71.     elif not p.compact and p.kind != p.PyUnicode_WCHAR_KIND:
  72.         if p.kind == p.PyUnicode_1BYTE_KIND:
  73.             return p.data.latin1
  74.         elif p.kind == p.PyUnicode_2BYTE_KIND:
  75.             return p.data.ucs2
  76.         elif p.kind == p.PyUnicode_4BYTE_KIND:
  77.             return p.data.ucs4
  78.     return bytes((t * length).from_address(addr))
  79.  
  80.  
  81. def print_data(s):
  82.     data = get_data(s)
  83.     print(s)
  84.     print(data)
  85.     print(data.hex(" "))
  86.     print()
  87.  
  88.  
  89. print_data("Hello")
  90. # Hello
  91. # b'Hello'
  92. # 48 65 6c 6c 6f
  93.  
  94. print_data("Hellö")
  95. # Hellö
  96. # b'Hell\xf6'
  97. # 48 65 6c 6c f6
  98.  
  99. print_data("Hell\u2764")
  100. # Hell❤
  101. # b"H\x00e\x00l\x00l\x00d'"
  102. # 48 00 65 00 6c 00 6c 00 64 27
  103.  
  104. print_data("Hell\U00010000")
  105. # Hell?
  106. # b'H\x00\x00\x00e\x00\x00\x00l\x00\x00\x00l\x00\x00\x00\x00\x00\x01\x00'
  107. # 48 00 00 00 65 00 00 00 6c 00 00 00 6c 00 00 00 00 00 01 00
  108.