_files/f05cc14b-3dfb-4106-b8d2-c3d7045390ee.png)
图1. python 代码执行过程示意图
$ python test.pytypedef struct _is { struct _is *next; struct _ts *tstate_head; PyObject *modules; PyObject *modules_by_index; PyObject *sysdict; PyObject *builtins; PyObject *importlib; PyObject *codec_search_path; PyObject *codec_search_cache; PyObject *codec_error_registry; int codecs_initialized; int fscodec_initialized; PyObject *builtins_copy;} PyInterpreterState;typedef struct _ts { struct _ts *prev; struct _ts *next; PyInterpreterState *interp; struct _frame *frame; int recursion_depth; char overflowed; char recursion_critical; int tracing; int use_tracing; Py_tracefunc c_profilefunc; Py_tracefunc c_tracefunc; PyObject *c_profileobj; PyObject *c_traceobj; PyObject *curexc_type; PyObject *curexc_value; PyObject *curexc_traceback; PyObject *exc_type; PyObject *exc_value; PyObject *exc_traceback; PyObject *dict; /* Stores per-thread state */ int gilstate_counter; ... } PyThreadState;>>> import dis>>> def square(x):... return x*x... >>> dis.dis(square) 2 0 LOAD_FAST 0 (x) 2 LOAD_FAST 0 (x) 4 BINARY_MULTIPLY 6 RETURN_VALUE...stmt: simple_stmt | compound_stmtsimple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINEsmall_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt)expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) | ('=' (yield_expr|testlist_star_expr))*)testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' | '<<=' | '>>=' | '**=' | '//=')del_stmt: 'del' exprlistpass_stmt: 'pass'flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmtbreak_stmt: 'break'continue_stmt: 'continue'return_stmt: 'return' [testlist]yield_stmt: yield_exprraise_stmt: 'raise' [test ['from' test]]import_stmt: import_name | import_fromimport_name: 'import' dotted_as_namesimport_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+) 'import' ('*' | '(' import_as_names ')' | import_as_names))import_as_name: NAME ['as' NAME]...+, * 能够对值进行运算产生结果。(, ), {,}, =, *= 等。"Fred", b"Fred" 、数值类型的字面量 2 、浮点类型字面量 1e100 、虚数字面量 10j 。使用 0 初始化 indent 栈for 每一个 logical line: if (当前行的缩进级别 > 栈顶的级别): push(当前缩进级别) 生成一个 INDENT token if (当前行的缩进级别 < 栈顶的级别): if (栈中不存在一个缩进级别 == 当前缩进级别): 报告一个错误 for 栈顶的每一个 != 当前行缩进级的值: 移除这个值 生成一个 DEDENT token tokenize(当前行)对每一个栈上的值(除了0)生成 DEDENT token>>> from tokenize import tokenize>>> f = open("./test.py", 'rb') # test.py 的内容只有一行 print("hello")>>> for t in tokenize(f.readline):... print(t)... TokenInfo(type=59 (ENCODING), string='utf-8', start=(0, 0), end=(0, 0), line='')TokenInfo(type=1 (NAME), string='print', start=(1, 0), end=(1, 5), line='print("hello")')TokenInfo(type=53 (OP), string='(', start=(1, 5), end=(1, 6), line='print("hello")')TokenInfo(type=3 (STRING), string='"hello"', start=(1, 6), end=(1, 13), line='print("hello")')TokenInfo(type=53 (OP), string=')', start=(1, 13), end=(1, 14), line='print("hello")')TokenInfo(type=0 (ENDMARKER), string='', start=(2, 0), end=(2, 0), line='')$ python -m tokenize test.py0,0-0,0: ENCODING 'utf-8' 1,0-1,5: NAME 'print' 1,5-1,6: OP '(' 1,6-1,13: STRING '"hello"' 1,13-1,14: OP ')' 2,0-2,0: ENDMARKER '' >>> import parser>>> from pprint import pprint>>> code_str = """... def hello_world():... return 'hello world'... """>>> st = parser.suite(code_str)>>> pprint(parser.st2list(st))[257, [269, [295, [263, [1, 'def'], [1, 'hello_world'], [264, [7, '('], [8, ')']], [11, ':'], [304, [4, ''], [5, ''], [269, [270, [271, [278, [281, [1, 'return'], [331, [305, [309, [310, [311, [312, [315, [316, [317, [318, [319, [320, [321, [322, [323, [324, [3, "'hello world'"]]]]]]]]]]]]]]]]]]]], [4, '']]], [6, '']]]]], [4, ''], [0, '']]代码清单 1.3.2
_files/e3dcae82-23ba-41ad-a076-d75a64958bfe.png)
图1.3.2:代码清单 1.3.2 中所展示的 parse tree 的图形化展示
typedef struct _node { short n_type; char *n_str; int n_lineno; int n_col_offset; int n_nchildren; struct _node *n_child;} node;>>> import ast>>> node = ast.parse(code_str, mode='exec')>>> ast.dump(node)"Module(body=[FunctionDef(name='hello_world', args=arguments(args=[], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=[Return(value=Str(s='hello world'))], decorator_list=[], returns=None)])"struct _stmt { enum _stmt_kind kind; union { struct { identifier name; arguments_ty args; asdl_seq *body; asdl_seq *decorator_list; expr_ty returns; } FunctionDef; struct { identifier name; arguments_ty args; asdl_seq *body; asdl_seq *decorator_list; expr_ty returns; } AsyncFunctionDef; struct { identifier name; asdl_seq *bases; asdl_seq *keywords; asdl_seq *body; asdl_seq *decorator_list; } ClassDef; ... } v; int lineno; int col_offset;};>>> x = 5>>> def func1():... locals()['a'] = 1... print(locals()['a'])... print(a)... return locals()... >>> func1()1Traceback (most recent call last): File "<stdin>", line 1, in <module> File "<stdin>", line 4, in func1NameError: name 'a' is not defined>>> a = 1>>> def inc_a(): a += 1... >>> inc_a()Traceback (most recent call last): File "<stdin>", line 1, in <module> File "<stdin>", line 1, in inc_aUnboundLocalError: local variable 'a' referenced before assignment>>> a = 1>>> def inc_a():... global a... a += 1... >>> inc_a()>>> a2>>> def make_counter():... count = 0... def counter():... nonlocal count... count += 1... return count... return counter... >>> counter_1 = make_counter()>>> counter_2 = make_counter()>>> counter_1()1>>> counter_1()2>>> counter_2()1>>> counter_2()2for node in AST: # 遍历 AST if (node 是一个 code block 的起始): ste = 新建符号表项目() st.st_stack.push(ste) # 新的符号表入栈 st.current_ste.children.append(st) # 将新表项加入上一个表项的子表项 st.current_ste = ste # 将符号表的 当前表项 设置为新建的符号表项 for node1 in code_block.nodes: # 遍历 code_block 里的所有节点 # 递归访问 node1 创建符号加入表项中, XXX 是 node1 的类型 symtable_visit_XXX(node1) st.st_stack.pop() # 将当前表项移出栈 st.current_st = st_stack.pop() else: 递归访问 node 与 sub_nodes(node) 创建符号加入符号表The symbol table requires two passes to determine the scope of each name. The first pass collects raw facts from the AST via the symtable_visit_* functions while the second pass analyzes these facts during a pass over the PySTEntryObjects created during pass 1.
When a function is entered during the second pass, the parent passes the set of all name bindings visible to its children. These bindings are used to determine if non-local variables are free or implicit globals. Names which are explicitly declared nonlocal must exist in this set of visible names - if they do not, a syntax error is raised. After doing the local analysis, it analyzes each of its child blocks using an updated set of name bindings.
There are also two kinds of global variables, implicit and explicit. An explicit global is declared with the global statement. An implicit global is a free variable for which the compiler has found no binding in an enclosing function scope. The implicit global is either a global or a builtin.
Python’s module and class blocks use thexxx_NAMEopcodes to handle these names to implement slightly odd semantics. In such a block, the name is treated as global until it is assigned to; then it is treated as a local.
尽管这些注释试图用清晰的语言来解释这个过程,但其中仍然存在一些歧义,比如: passes the set of all name bindings visible to its children ,这里的 parent 与 children 指的到底是什么?为了理解这些术语,我们来看一看符号表的数据结构。The children update the free variable set. If a local variable is added to the free variable set by the child, the variable is marked as a cell. The function object being defined must provide runtime storage for the variable that may outlive the function’s frame. Cell variables are removed from the free set before the analyze function returns to its parent.
struct symtable { PyObject *st_filename; /* name of file being compiled, decoded from the filesystem encoding */ struct _symtable_entry *st_cur; /* current symbol table entry */ struct _symtable_entry *st_top; /* symbol table entry for module */ PyObject *st_blocks; /* dict: map AST node addresses * to symbol table entries */ PyObject *st_stack; /* list: stack of namespace info */ PyObject *st_global; /* borrowed ref to st_top->ste_symbols */ int st_nblocks; /* number of blocks used. kept for consistency with the corresponding compiler structure */ PyObject *st_private; /* name of current class or NULL */ PyFutureFeatures *st_future; /* module's future features that affect the symbol table */ int recursion_depth; /* current recursion depth */ int recursion_limit; /* recursion limit */};_files/0e20557c-2ee7-4127-ad89-83f5c349fb10.png)
图1.3.5: 一个符号表(symbol table)与符号表项(symbol table entry)的图形化展示
typedef struct _symtable_entry { PyObject_HEAD PyObject *ste_id; /* int: key in ste_table->st_blocks */ PyObject *ste_symbols; /* dict: variable names to flags */ PyObject *ste_name; /* string: name of current block */ PyObject *ste_varnames; /* list of function parameters */ PyObject *ste_children; /* list of child blocks */ PyObject *ste_directives;/* locations of global and nonlocal statements */ _Py_block_ty ste_type; /* module, class, or function */ int ste_nested; /* true if block is nested */ unsigned ste_free : 1; /* true if block has free variables */ unsigned ste_child_free : 1; /* true if a child block has free vars, including free refs to globals */ unsigned ste_generator : 1; /* true if namespace is a generator */ unsigned ste_varargs : 1; /* true if block has varargs */ unsigned ste_varkeywords : 1; /* true if block has varkeywords */ unsigned ste_returns_value : 1; /* true if namespace uses return with an argument */ unsigned ste_needs_class_closure : 1; /* for class scopes, true if a closure over __class__ should be created */ int ste_lineno; /* first line of block */ int ste_col_offset; /* offset of first line of block */ int ste_opt_lineno; /* lineno of last exec or import * */ int ste_opt_col_offset; /* offset of last exec or import * */ int ste_tmpname; /* counter for listcomp temp vars */ struct symtable *ste_table;} PySTEntryObject;/* Flags for def-use information *//* global stmt *//* assignment in code block *//* formal parameter *//* nonlocal stmt *//* name is used *//* name used but not defined in nested block */def make_counter(): count = 0 def counter(): nonlocal count count += 1 return count return counterdef fizzbuzz(n): if n % 3 == 0 and n % 5 == 0: return 'FizzBuzz' elif n % 3 == 0: return 'Fizz' elif n % 5 == 0: return 'Buzz' else: return str(n)_files/e54e7b26-9c62-45e3-9574-f93f39d8ba6a.png)
图1.3.6.a fizzbuzz 函数的 AST 图示(译注:这里的 AST 画的有问题,应该只有最后一个分支对应 str(n) 调用前几个分支返回的都是字面量)
_files/70037a0c-2565-4ee1-a1ec-dbe78f650fdd.png)
LOAD_FAST LOAD_CONST BINARY_MODULO LOAD_CONST COMPARE_OP JUMP_IF_FALSE_OR_POP LOAD_FAST LOAD_CONST BINARY_MODULO LOAD_CONST COMPARE_OP POP_JUMP_IF_FALSE LOAD_CONST RETURN_VALUE JUMP_FORWARD LOAD_FAST LOAD_CONST BINARY_MODULO LOAD_CONST COMPARE_OP POP_JUMP_IF_FALSE LOAD_CONST RETURN_VALUE JUMP_FORWARD LOAD_GLOBAL LOAD_FAST CALL_FUNCTION RETURN_VALUE>>> dis.dis(fizzbuzz) 2 0 LOAD_FAST 0 (n) 2 LOAD_CONST 1 (3) 4 BINARY_MODULO 6 LOAD_CONST 2 (0) 8 COMPARE_OP 2 (==) 10 POP_JUMP_IF_FALSE 28 12 LOAD_FAST 0 (n) 14 LOAD_CONST 3 (5) 16 BINARY_MODULO 18 LOAD_CONST 2 (0) 20 COMPARE_OP 2 (==) 22 POP_JUMP_IF_FALSE 28 3 24 LOAD_CONST 4 ('FizzBuzz') 26 RETURN_VALUE 4 >> 28 LOAD_FAST 0 (n) 30 LOAD_CONST 1 (3) 32 BINARY_MODULO 34 LOAD_CONST 2 (0) 36 COMPARE_OP 2 (==) 38 POP_JUMP_IF_FALSE 44 5 40 LOAD_CONST 5 ('Fizz') 42 RETURN_VALUE 6 >> 44 LOAD_FAST 0 (n) 46 LOAD_CONST 3 (5) 48 BINARY_MODULO 50 LOAD_CONST 2 (0) 52 COMPARE_OP 2 (==) 54 POP_JUMP_IF_FALSE 60 7 56 LOAD_CONST 6 ('Buzz') 58 RETURN_VALUE 9 >> 60 LOAD_GLOBAL 0 (str) 62 LOAD_FAST 0 (n) 64 CALL_FUNCTION 1 66 RETURN_VALUE 68 LOAD_CONST 0 (None) 70 RETURN_VALUE_files/8f23e180-6dd4-459f-af83-9ed883579458.png)
图1.3.6.1 生成 code object 过程中 4 个主要的数据结构。
struct compiler { PyObject *c_filename; struct symtable *c_st; PyFutureFeatures *c_future; /* pointer to module's __future__ */ PyCompilerFlags *c_flags; int c_optimize; /* optimization level */ int c_interactive; /* true if in interactive mode */ int c_nestlevel; struct compiler_unit *u; /* compiler state for current block */ PyObject *c_stack; /* Python list holding compiler_unit ptrs */ PyArena *c_arena; /* pointer to memory allocation arena */};struct compiler_unit { PySTEntryObject *u_ste; PyObject *u_name; PyObject *u_qualname; /* dot-separated qualified name (lazy) */ int u_scope_type; /* The following fields are dicts that map objects to the index of them in co_XXX. The index is used as the argument for opcodes that refer to those collections. */ PyObject *u_consts; /* all constants */ PyObject *u_names; /* all names */ PyObject *u_varnames; /* local variables */ PyObject *u_cellvars; /* cell variables */ PyObject *u_freevars; /* free variables */ PyObject *u_private; /* for private name mangling */ Py_ssize_t u_argcount; /* number of arguments for block */ Py_ssize_t u_kwonlyargcount; /* number of keyword only arguments for block */ /* Pointer to the most recently allocated block. By following b_list members, you can reach all early allocated blocks. */ basicblock *u_blocks; basicblock *u_curblock; /* pointer to current block */ int u_nfblocks; struct fblockinfo u_fblock[CO_MAXBLOCKS]; int u_firstlineno; /* the first lineno of the block */ int u_lineno; /* the lineno for the current stmt */ int u_col_offset; /* the offset of the current stmt */ int u_lineno_set; /* boolean to indicate whether instr has been generated with current lineno */};x
typedef struct basicblock_ { /* Each basicblock in a compilation unit is linked via b_list in the reverse order that the block are allocated. b_list points to the next block, not to be confused with b_next, which is next by control flow. */ struct basicblock_ *b_list; /* number of instructions used */ int b_iused; /* length of instruction array (b_instr) */ int b_ialloc; /* pointer to an array of instructions, initially NULL */ struct instr *b_instr; /* If b_next is non-NULL, it is a pointer to the next block reached by normal control flow. */ struct basicblock_ *b_next; /* b_seen is used to perform a DFS of basicblocks. */ unsigned b_seen : 1; /* b_return is true if a RETURN_VALUE opcode is inserted. */ unsigned b_return : 1; /* depth of stack upon entry of block, computed by stackdepth() */ int b_startdepth; /* instruction offset for block, computed by assemble_jump_offsets() */ int b_offset;} basicblock;struct instr { unsigned i_jabs : 1; // 绝对跳转 jump absolute unsigned i_jrel : 1; // 相对跳转 jump relative unsigned i_hasarg : 1; unsigned char i_opcode; int i_oparg; struct basicblock_ *i_target; /* target block (if jump instruction) */ int i_lineno;};
... totsize = 0; for (i = a->a_nblocks - 1; i >= 0; i--) { b = a->a_postorder[i]; bsize = blocksize(b); b->b_offset = totsize; totsize += bsize; } ...x
... for (b = c->u->u_blocks; b != NULL; b = b->b_list) { bsize = b->b_offset; for (i = 0; i < b->b_iused; i++) { struct instr *instr = &b->b_instr[i]; /* Relative jumps are computed relative to the instruction pointer after fetching the jump instruction. */ bsize += instrsize(instr); if (instr->i_jabs) // 绝对跳转 instr->i_oparg = instr->i_target->b_offset; else if (instr->i_jrel) { // 相对跳转 int delta = instr->i_target->b_offset - bsize; instr->i_oparg = delta; } else continue; if (instr->i_oparg > 0xffff) extended_arg_count++; } } ...