Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-90997: Improve inline cache performance for MSVC #96781

Merged
merged 3 commits into from
Sep 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 19 additions & 74 deletions Include/internal/pycore_code.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,110 +285,55 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void);
#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0)
#endif // !Py_STATS

// Cache values are only valid in memory, so use native endianness.
#ifdef WORDS_BIGENDIAN
// Utility functions for reading/writing 32/64-bit values in the inline caches.
// Great care should be taken to ensure that these functions remain correct and
// performant! They should compile to just "move" instructions on all supported
// compilers and platforms.

// We use memcpy to let the C compiler handle unaligned accesses and endianness
// issues for us. It also seems to produce better code than manual copying for
// most compilers (see https://blog.regehr.org/archives/959 for more info).

// Store the 32-bit value `val` into the two 16-bit slots starting at `p`.
// NOTE(review): this text looks like a diff rendering whose +/- markers were
// lost. The two half-word stores and the memcpy below write the same four
// bytes, so only the memcpy result is observable afterwards. Confirm which
// lines belong to the intended revision before relying on this span.
static inline void
write_u32(uint16_t *p, uint32_t val)
{
// High 16 bits go to p[0], low 16 bits to p[1] (this definition sits under
// #ifdef WORDS_BIGENDIAN).
p[0] = (uint16_t)(val >> 16);
p[1] = (uint16_t)(val >> 0);
// Copies sizeof(val) bytes of val's in-memory representation over p[0..1].
memcpy(p, &val, sizeof(val));
}

// Store the 64-bit value `val` into four consecutive 16-bit slots at `p`,
// from the most-significant 16 bits (p[0]) down to the least-significant
// 16 bits (p[3]). This definition sits under #ifdef WORDS_BIGENDIAN.
// NOTE(review): the surrounding text appears to be a diff with its +/-
// markers lost; verify whether these shifted stores are the current body.
static inline void
write_u64(uint16_t *p, uint64_t val)
{
p[0] = (uint16_t)(val >> 48);
p[1] = (uint16_t)(val >> 32);
p[2] = (uint16_t)(val >> 16);
p[3] = (uint16_t)(val >> 0);
}

// Reassemble a 32-bit value from the two 16-bit slots at `p`: p[0] supplies
// the high 16 bits and p[1] the low 16 bits. `p` is only read, never written.
// This definition sits under #ifdef WORDS_BIGENDIAN.
static inline uint32_t
read_u32(uint16_t *p)
{
uint32_t val = 0;
// Each slot is widened to 32 bits before shifting so no bits are lost.
val |= (uint32_t)p[0] << 16;
val |= (uint32_t)p[1] << 0;
return val;
}

// Reassemble a 64-bit value from the four 16-bit slots at `p`: p[0] supplies
// the most-significant 16 bits and p[3] the least-significant 16 bits.
// `p` is only read, never written. This definition sits under
// #ifdef WORDS_BIGENDIAN.
static inline uint64_t
read_u64(uint16_t *p)
{
uint64_t val = 0;
// Each slot is widened to 64 bits before shifting so no bits are lost.
val |= (uint64_t)p[0] << 48;
val |= (uint64_t)p[1] << 32;
val |= (uint64_t)p[2] << 16;
val |= (uint64_t)p[3] << 0;
return val;
}

#else

// Store the 32-bit value `val` into the two 16-bit slots starting at `p`.
// This definition sits in the #else branch of #ifdef WORDS_BIGENDIAN.
// NOTE(review): this text looks like a diff rendering whose +/- markers were
// lost. The two half-word stores and the memcpy below write the same four
// bytes, so only the memcpy result is observable afterwards. Confirm which
// lines belong to the intended revision before relying on this span.
static inline void
write_u32(uint16_t *p, uint32_t val)
{
// Low 16 bits go to p[0], high 16 bits to p[1].
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
// Copies sizeof(val) bytes of val's in-memory representation over p[0..1].
memcpy(p, &val, sizeof(val));
}

// NOTE(review): two declarator lines (write_u64 taking a uint64_t and
// write_obj taking a PyObject *) precede a single body here, which is not
// valid C as written. This looks like a removed/added line pair from a diff
// whose +/- markers were lost; confirm which signature and which body lines
// are intended before using this span.
static inline void
write_u64(uint16_t *p, uint64_t val)
write_obj(uint16_t *p, PyObject *val)
{
// Shifted stores: low 16 bits to p[0] up to the high 16 bits in p[3]. These
// only type-check when `val` is an integer (the write_u64 signature).
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
p[2] = (uint16_t)(val >> 32);
p[3] = (uint16_t)(val >> 48);
// Copies sizeof(val) bytes of val's in-memory representation to p.
memcpy(p, &val, sizeof(val));
}

// Read a 32-bit value from the 16-bit slots starting at `p`.
// NOTE(review): `val` is declared twice in this body, which is not valid C
// as written. The span appears to interleave two versions of the function
// (shift-and-or assembly versus a memcpy) from a diff whose +/- markers were
// lost; confirm which lines are intended.
static inline uint32_t
read_u32(uint16_t *p)
{
// Version 1: p[0] supplies the low 16 bits, p[1] the high 16 bits.
uint32_t val = 0;
val |= (uint32_t)p[0] << 0;
val |= (uint32_t)p[1] << 16;
// Version 2: copy sizeof(val) bytes from p into val unchanged.
uint32_t val;
memcpy(&val, p, sizeof(val));
return val;
}

// Read a 64-bit value from the 16-bit slots starting at `p`.
// NOTE(review): `val` is declared twice in this body, which is not valid C
// as written. The span appears to interleave two versions of the function
// (shift-and-or assembly versus a memcpy) from a diff whose +/- markers were
// lost; confirm which lines are intended.
static inline uint64_t
read_u64(uint16_t *p)
{
// Version 1: p[0] supplies the least-significant 16 bits, p[3] the
// most-significant 16 bits.
uint64_t val = 0;
val |= (uint64_t)p[0] << 0;
val |= (uint64_t)p[1] << 16;
val |= (uint64_t)p[2] << 32;
val |= (uint64_t)p[3] << 48;
// Version 2: copy sizeof(val) bytes from p into val unchanged.
uint64_t val;
memcpy(&val, p, sizeof(val));
return val;
}

#endif

// Store the pointer `obj` into the 16-bit slots starting at `p`.
// The pointer is first converted to uintptr_t, then written with the
// integer helper that matches the pointer width: write_u64 when
// SIZEOF_VOID_P is 8, write_u32 when it is 4. Any other SIZEOF_VOID_P value
// stops compilation via #error.
// NOTE(review): another write_obj declarator appears earlier in this text;
// the content looks like a diff with its +/- markers lost, so confirm which
// definition is current.
static inline void
write_obj(uint16_t *p, PyObject *obj)
{
uintptr_t val = (uintptr_t)obj;
#if SIZEOF_VOID_P == 8
write_u64(p, val);
#elif SIZEOF_VOID_P == 4
write_u32(p, val);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
}

// Read a PyObject pointer back from the 16-bit slots starting at `p`.
// NOTE(review): this body contains statements after a `return` and declares
// `val` twice with different types, which is not valid C as written. It
// appears to interleave two versions of the function from a diff whose +/-
// markers were lost; confirm which lines are intended.
static inline PyObject *
read_obj(uint16_t *p)
{
// Version 1: read an integer of pointer width (read_u64 when SIZEOF_VOID_P
// is 8, read_u32 when it is 4; anything else is a compile-time #error) and
// cast it to a pointer.
uintptr_t val;
#if SIZEOF_VOID_P == 8
val = read_u64(p);
#elif SIZEOF_VOID_P == 4
val = read_u32(p);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
return (PyObject *)val;
// Version 2: copy sizeof(val) bytes from p directly into a pointer variable.
PyObject *val;
memcpy(&val, p, sizeof(val));
return val;
}

/* See Objects/exception_handling_notes.txt for details.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Improve the performance of reading and writing inline bytecode caches on
some platforms.