zts-improvement
no way to compare when less than two revisions
Differences
This shows you the differences between two versions of the page.
— | zts-improvement [2019/02/13 08:17] (current) – created dmitry | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== PHP ZTS Improvement ====== | ||
+ | |||
+ | This is a description of a ZTS improvement idea that should reduce the cost of module globals access. | ||
+ | For example EG(current_execute_data). | ||
+ | The idea came up while analysing whether JIT is expedient for ZTS, but it should improve the ZTS interpreter and the whole PHP ZTS build as well. | ||
+ | |||
+ | ===== Non ZTS Way ===== | ||
+ | |||
+ | Without ZTS this takes just 1 CPU instruction and 1 load. | ||
+ | |||
+ | <code asm> | ||
+ | movl executor_globals+field_offset(%rip), | ||
+ | </code> | ||
+ | |||
+ | < | ||
+ | executor_globals | ||
+ | +----------------+ | ||
+ | | field 0 | | ||
+ | field_offset ------>| ... | | ||
+ | | field N | | ||
+ | +----------------+ | ||
+ | </ | ||
+ | |||
+ | ===== Current ZTS Way (PHP-7.3) ===== | ||
+ | |||
+ | However, in a ZTS build the same access requires 6 CPU instructions and 6 loads. | ||
+ | |||
+ | <code asm> | ||
+ | movq _tsrm_ls_cache@gottpoff(%rip), | ||
+ | movslq executor_globals_id(%rip), | ||
+ | movq %fs: | ||
+ | movq (%rax), | ||
+ | movq -8(%rax, | ||
+ | movl field_offset(%rdx), | ||
+ | </code> | ||
+ | |||
+ | < | ||
+ | %fs | ||
+ | | | ||
+ | v | ||
+ | +---------------------+ | ||
+ | | _tsrm_ls_cache | ||
+ | +---------------------+ | ||
+ | | thread_id | ||
+ | | next | ||
+ | +----------------+ | ||
+ | | | ||
+ | +---------------------+ | ||
+ | | executor_globals_id |-->| + |< | ||
+ | +---------------------+ | ||
+ | | | | ||
+ | | | ||
+ | | | ||
+ | | | slot 0 | | ||
+ | +---->| ... |--+ | ||
+ | | slot N | ||
+ | +----------------+ | ||
+ | | | ||
+ | +---+ | | ||
+ | field_offset ------>| + |< | ||
+ | +---+ | | ||
+ | | | | ||
+ | | | ||
+ | | | ||
+ | | | field 0 | | ||
+ | +---->| ... | | ||
+ | | field N | | ||
+ | +----------------+ | ||
+ | </ | ||
+ | |||
+ | ===== New ZTS Way (PHP-7.4+ or PHP-8) ===== | ||
+ | |||
+ | If we flatten all the data structures, we may reduce the access pattern to 4 instructions and 4 loads. | ||
+ | I think it's possible to make these changes at the TSRM level only, without (or with minimal) modification of the TSRM source API. | ||
+ | This would allow targeting the improvement at PHP-7.4. | ||
+ | |||
+ | <code asm> | ||
+ | movq _tsrm_ls_cache@gottpoff(%rip), | ||
+ | movslq executor_globals_offset(%rip), | ||
+ | movq %fs: | ||
+ | movq field_offset(%rax, | ||
+ | </code> | ||
+ | |||
+ | < | ||
+ | %fs | ||
+ | | | ||
+ | v | ||
+ | +---------------------+ | ||
+ | | _tsrm_ls_cache | ||
+ | +---------------------+ | ||
+ | | | count | | ||
+ | v | ||
+ | +---------------------+ | ||
+ | | executor_globals_id |-->| + |--> | ||
+ | +---------------------+ | ||
+ | | | ... | | ||
+ | V | ... | | ||
+ | +---+ | ... | | ||
+ | field_offset ------>| + |-->| ... | | ||
+ | +---+ | slot 0 field N | | ||
+ | +----------------+ | ||
+ | | ... | | ||
+ | +----------------+ | ||
+ | | slot K size | | ||
+ | +----------------+ | ||
+ | | slot K field 0 | | ||
+ | | ... | | ||
+ | | slot K field M | | ||
+ | +----------------+ | ||
+ | </ | ||
+ | |||
+ | ==== Reserved global id-s (EG, CG, etc) ==== | ||
+ | |||
+ | In addition, we may reserve a few slots for frequently used execute_data, | ||
+ | And make " | ||
+ | This will reduce the access pattern to 3 instructions and 3 loads. | ||
+ | This also eliminates the requirement for a temporary CPU register. | ||
+ | |||
+ | <code asm> | ||
+ | movq _tsrm_ls_cache@gottpoff(%rip), | ||
+ | movq %fs: | ||
+ | movq executor_globals_offset + field_offset(%rax), | ||
+ | </code> | ||
+ | |||
+ | ==== JIT ==== | ||
+ | |||
+ | With JIT we may avoid " | ||
+ | |||
+ | <code asm> | ||
+ | movq %fs: | ||
+ | movq executor_globals_offset + field_offset(%rax), | ||
+ | </code> | ||
+ | |||
+ | Finally, we may cache the address of " | ||
zts-improvement.txt · Last modified: 2019/02/13 08:17 by dmitry