图1. python 代码执行过程示意图
$ python test.py
typedef struct _is {
struct _is *next;
struct _ts *tstate_head;
PyObject *modules;
PyObject *modules_by_index;
PyObject *sysdict;
PyObject *builtins;
PyObject *importlib;
PyObject *codec_search_path;
PyObject *codec_search_cache;
PyObject *codec_error_registry;
int codecs_initialized;
int fscodec_initialized;
PyObject *builtins_copy;
} PyInterpreterState;
typedef struct _ts {
struct _ts *prev;
struct _ts *next;
PyInterpreterState *interp;
struct _frame *frame;
int recursion_depth;
char overflowed;
char recursion_critical;
int tracing;
int use_tracing;
Py_tracefunc c_profilefunc;
Py_tracefunc c_tracefunc;
PyObject *c_profileobj;
PyObject *c_traceobj;
PyObject *curexc_type;
PyObject *curexc_value;
PyObject *curexc_traceback;
PyObject *exc_type;
PyObject *exc_value;
PyObject *exc_traceback;
PyObject *dict; /* Stores per-thread state */
int gilstate_counter;
...
} PyThreadState;
>>> import dis
>>> def square(x):
... return x*x
...
>>> dis.dis(square)
2 0 LOAD_FAST 0 (x)
2 LOAD_FAST 0 (x)
4 BINARY_MULTIPLY
6 RETURN_VALUE
...
stmt: simple_stmt | compound_stmt
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt |
import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
expr_stmt: testlist_star_expr (augassign (yield_expr|testlist) |
('=' (yield_expr|testlist_star_expr))*)
testlist_star_expr: (test|star_expr) (',' (test|star_expr))* [',']
augassign: ('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' |
'<<=' | '>>=' | '**=' | '//=')
del_stmt: 'del' exprlist
pass_stmt: 'pass'
flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: 'break'
continue_stmt: 'continue'
return_stmt: 'return' [testlist]
yield_stmt: yield_expr
raise_stmt: 'raise' [test ['from' test]]
import_stmt: import_name | import_from
import_name: 'import' dotted_as_names
import_from: ('from' (('.' | '...')* dotted_name | ('.' | '...')+)
'import' ('*' | '(' import_as_names ')' | import_as_names))
import_as_name: NAME ['as' NAME]
...
+
, *
能够对值进行运算产生结果。(, )
, {,}
, =
, *=
等。"Fred"
, b"Fred"
、数值类型的字面量 2 、浮点类型字面量 1e100 、虚数字面量 10j 。使用 0 初始化 indent 栈
for 每一个 logical line:
if (当前行的缩进级别 > 栈顶的级别):
push(当前缩进级别)
生成一个 INDENT token
if (当前行的缩进级别 < 栈顶的级别):
if (栈中不存在一个缩进级别 == 当前缩进级别):
报告一个错误
for 栈顶的每一个 != 当前行缩进级的值:
移除这个值
生成一个 DEDENT token
tokenize(当前行)
对每一个栈上的值(除了0)生成 DEDENT token
>>> from tokenize import tokenize
>>> f = open("./test.py", 'rb') # test.py 的内容只有一行 print("hello")
>>> for t in tokenize(f.readline):
... print(t)
...
TokenInfo(type=59 (ENCODING), string='utf-8', start=(0, 0), end=(0, 0), line='')
TokenInfo(type=1 (NAME), string='print', start=(1, 0), end=(1, 5), line='print("hello")')
TokenInfo(type=53 (OP), string='(', start=(1, 5), end=(1, 6), line='print("hello")')
TokenInfo(type=3 (STRING), string='"hello"', start=(1, 6), end=(1, 13), line='print("hello")')
TokenInfo(type=53 (OP), string=')', start=(1, 13), end=(1, 14), line='print("hello")')
TokenInfo(type=0 (ENDMARKER), string='', start=(2, 0), end=(2, 0), line='')
$ python -m tokenize test.py
0,0-0,0: ENCODING 'utf-8'
1,0-1,5: NAME 'print'
1,5-1,6: OP '('
1,6-1,13: STRING '"hello"'
1,13-1,14: OP ')'
2,0-2,0: ENDMARKER ''
>>> import parser
>>> from pprint import pprint
>>> code_str = """
... def hello_world():
... return 'hello world'
... """
>>> st = parser.suite(code_str)
>>> pprint(parser.st2list(st))
[257,
[269,
[295,
[263,
[1, 'def'],
[1, 'hello_world'],
[264, [7, '('], [8, ')']],
[11, ':'],
[304,
[4, ''],
[5, ''],
[269,
[270,
[271,
[278,
[281,
[1, 'return'],
[331,
[305,
[309,
[310,
[311,
[312,
[315,
[316,
[317,
[318,
[319,
[320,
[321,
[322,
[323, [324, [3, "'hello world'"]]]]]]]]]]]]]]]]]]]],
[4, '']]],
[6, '']]]]],
[4, ''],
[0, '']]
代码清单 1.3.2
图1.3.2:代码清单 1.3.2 中所展示的 parse tree 的图形化展示
typedef struct _node {
short n_type;
char *n_str;
int n_lineno;
int n_col_offset;
int n_nchildren;
struct _node *n_child;
} node;
>>> import ast
>>> node = ast.parse(code_str, mode='exec')
>>> ast.dump(node)
"Module(body=[FunctionDef(name='hello_world', args=arguments(args=[], vararg=None, kwonlyargs=[], kw_defaults=[], kwarg=None, defaults=[]), body=[Return(value=Str(s='hello world'))], decorator_list=[], returns=None)])"
struct _stmt {
enum _stmt_kind kind;
union {
struct {
identifier name;
arguments_ty args;
asdl_seq *body;
asdl_seq *decorator_list;
expr_ty returns;
} FunctionDef;
struct {
identifier name;
arguments_ty args;
asdl_seq *body;
asdl_seq *decorator_list;
expr_ty returns;
} AsyncFunctionDef;
struct {
identifier name;
asdl_seq *bases;
asdl_seq *keywords;
asdl_seq *body;
asdl_seq *decorator_list;
} ClassDef;
...
} v;
int lineno;
int col_offset;
};
>>> x = 5
>>> def func1():
... locals()['a'] = 1
... print(locals()['a'])
... print(a)
... return locals()
...
>>> func1()
1
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<stdin>", line 4, in func1
NameError: name 'a' is not defined
>>> a = 1
>>> def inc_a(): a += 1
...
>>> inc_a()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<stdin>", line 1, in inc_a
UnboundLocalError: local variable 'a' referenced before assignment
>>> a = 1
>>> def inc_a():
... global a
... a += 1
...
>>> inc_a()
>>> a
2
>>> def make_counter():
... count = 0
... def counter():
... nonlocal count
... count += 1
... return count
... return counter
...
>>> counter_1 = make_counter()
>>> counter_2 = make_counter()
>>> counter_1()
1
>>> counter_1()
2
>>> counter_2()
1
>>> counter_2()
2
for node in AST: # 遍历 AST
if (node 是一个 code block 的起始):
ste = 新建符号表项目()
st.st_stack.push(ste) # 新的符号表入栈
st.current_ste.children.append(st) # 将新表项加入上一个表项的子表项
st.current_ste = ste # 将符号表的 当前表项 设置为新建的符号表项
for node1 in code_block.nodes: # 遍历 code_block 里的所有节点
# 递归访问 node1 创建符号加入表项中, XXX 是 node1 的类型
symtable_visit_XXX(node1)
st.st_stack.pop() # 将当前表项移出栈
st.current_st = st_stack.pop()
else:
递归访问 node 与 sub_nodes(node) 创建符号加入符号表
The symbol table requires two passes to determine the scope of each name. The first pass collects raw facts from the AST via the symtable_visit_* functions while the second pass analyzes these facts during a pass over the PySTEntryObjects created during pass 1.
When a function is entered during the second pass, the parent passes the set of all name bindings visible to its children. These bindings are used to determine if non-local variables are free or implicit globals. Names which are explicitly declared nonlocal must exist in this set of visible names - if they do not, a syntax error is raised. After doing the local analysis, it analyzes each of its child blocks using an updated set of name bindings.
There are also two kinds of global variables, implicit and explicit. An explicit global is declared with the global statement. An implicit global is a free variable for which the compiler has found no binding in an enclosing function scope. The implicit global is either a global or a builtin.
Python’s module and class blocks use thexxx_NAME
opcodes to handle these names to implement slightly odd semantics. In such a block, the name is treated as global until it is assigned to; then it is treated as a local.
尽管这些注释试图用清晰的语言来解释这个过程,但其中仍然存在一些歧义,比如: passes the set of all name bindings visible to its children ,这里的 parent 与 children 指的到底是什么?为了理解这些术语,我们来看一看符号表的数据结构。The children update the free variable set. If a local variable is added to the free variable set by the child, the variable is marked as a cell. The function object being defined must provide runtime storage for the variable that may outlive the function’s frame. Cell variables are removed from the free set before the analyze function returns to its parent.
struct symtable {
PyObject *st_filename; /* name of file being compiled,
decoded from the filesystem encoding */
struct _symtable_entry *st_cur; /* current symbol table entry */
struct _symtable_entry *st_top; /* symbol table entry for module */
PyObject *st_blocks; /* dict: map AST node addresses
* to symbol table entries */
PyObject *st_stack; /* list: stack of namespace info */
PyObject *st_global; /* borrowed ref to st_top->ste_symbols */
int st_nblocks; /* number of blocks used. kept for
consistency with the corresponding
compiler structure */
PyObject *st_private; /* name of current class or NULL */
PyFutureFeatures *st_future; /* module's future features that affect
the symbol table */
int recursion_depth; /* current recursion depth */
int recursion_limit; /* recursion limit */
};
图1.3.5: 一个符号表(symbol table)与符号表项(symbol table entry)的图形化展示
typedef struct _symtable_entry {
PyObject_HEAD
PyObject *ste_id; /* int: key in ste_table->st_blocks */
PyObject *ste_symbols; /* dict: variable names to flags */
PyObject *ste_name; /* string: name of current block */
PyObject *ste_varnames; /* list of function parameters */
PyObject *ste_children; /* list of child blocks */
PyObject *ste_directives;/* locations of global and nonlocal statements */
_Py_block_ty ste_type; /* module, class, or function */
int ste_nested; /* true if block is nested */
unsigned ste_free : 1; /* true if block has free variables */
unsigned ste_child_free : 1; /* true if a child block has free vars,
including free refs to globals */
unsigned ste_generator : 1; /* true if namespace is a generator */
unsigned ste_varargs : 1; /* true if block has varargs */
unsigned ste_varkeywords : 1; /* true if block has varkeywords */
unsigned ste_returns_value : 1; /* true if namespace uses return with
an argument */
unsigned ste_needs_class_closure : 1; /* for class scopes, true if a
closure over __class__
should be created */
int ste_lineno; /* first line of block */
int ste_col_offset; /* offset of first line of block */
int ste_opt_lineno; /* lineno of last exec or import * */
int ste_opt_col_offset; /* offset of last exec or import * */
int ste_tmpname; /* counter for listcomp temp vars */
struct symtable *ste_table;
} PySTEntryObject;
/* Flags for def-use information */
/* global stmt */
/* assignment in code block */
/* formal parameter */
/* nonlocal stmt */
/* name is used */
/* name used but not defined in nested block */
def make_counter():
count = 0
def counter():
nonlocal count
count += 1
return count
return counter
def fizzbuzz(n):
if n % 3 == 0 and n % 5 == 0:
return 'FizzBuzz'
elif n % 3 == 0:
return 'Fizz'
elif n % 5 == 0:
return 'Buzz'
else:
return str(n)
图1.3.6.a fizzbuzz 函数的 AST 图示(译注:这里的 AST 画的有问题,应该只有最后一个分支对应 str(n) 调用前几个分支返回的都是字面量)
LOAD_FAST
LOAD_CONST
BINARY_MODULO
LOAD_CONST
COMPARE_OP
JUMP_IF_FALSE_OR_POP
LOAD_FAST
LOAD_CONST
BINARY_MODULO
LOAD_CONST
COMPARE_OP
POP_JUMP_IF_FALSE
LOAD_CONST
RETURN_VALUE
JUMP_FORWARD
LOAD_FAST
LOAD_CONST
BINARY_MODULO
LOAD_CONST
COMPARE_OP
POP_JUMP_IF_FALSE
LOAD_CONST
RETURN_VALUE
JUMP_FORWARD
LOAD_GLOBAL
LOAD_FAST
CALL_FUNCTION
RETURN_VALUE
>>> dis.dis(fizzbuzz)
2 0 LOAD_FAST 0 (n)
2 LOAD_CONST 1 (3)
4 BINARY_MODULO
6 LOAD_CONST 2 (0)
8 COMPARE_OP 2 (==)
10 POP_JUMP_IF_FALSE 28
12 LOAD_FAST 0 (n)
14 LOAD_CONST 3 (5)
16 BINARY_MODULO
18 LOAD_CONST 2 (0)
20 COMPARE_OP 2 (==)
22 POP_JUMP_IF_FALSE 28
3 24 LOAD_CONST 4 ('FizzBuzz')
26 RETURN_VALUE
4 >> 28 LOAD_FAST 0 (n)
30 LOAD_CONST 1 (3)
32 BINARY_MODULO
34 LOAD_CONST 2 (0)
36 COMPARE_OP 2 (==)
38 POP_JUMP_IF_FALSE 44
5 40 LOAD_CONST 5 ('Fizz')
42 RETURN_VALUE
6 >> 44 LOAD_FAST 0 (n)
46 LOAD_CONST 3 (5)
48 BINARY_MODULO
50 LOAD_CONST 2 (0)
52 COMPARE_OP 2 (==)
54 POP_JUMP_IF_FALSE 60
7 56 LOAD_CONST 6 ('Buzz')
58 RETURN_VALUE
9 >> 60 LOAD_GLOBAL 0 (str)
62 LOAD_FAST 0 (n)
64 CALL_FUNCTION 1
66 RETURN_VALUE
68 LOAD_CONST 0 (None)
70 RETURN_VALUE
图1.3.6.1 生成 code object 过程中 4 个主要的数据结构。
struct compiler {
PyObject *c_filename;
struct symtable *c_st;
PyFutureFeatures *c_future; /* pointer to module's __future__ */
PyCompilerFlags *c_flags;
int c_optimize; /* optimization level */
int c_interactive; /* true if in interactive mode */
int c_nestlevel;
struct compiler_unit *u; /* compiler state for current block */
PyObject *c_stack; /* Python list holding compiler_unit ptrs */
PyArena *c_arena; /* pointer to memory allocation arena */
};
struct compiler_unit {
PySTEntryObject *u_ste;
PyObject *u_name;
PyObject *u_qualname; /* dot-separated qualified name (lazy) */
int u_scope_type;
/* The following fields are dicts that map objects to
the index of them in co_XXX. The index is used as
the argument for opcodes that refer to those collections.
*/
PyObject *u_consts; /* all constants */
PyObject *u_names; /* all names */
PyObject *u_varnames; /* local variables */
PyObject *u_cellvars; /* cell variables */
PyObject *u_freevars; /* free variables */
PyObject *u_private; /* for private name mangling */
Py_ssize_t u_argcount; /* number of arguments for block */
Py_ssize_t u_kwonlyargcount; /* number of keyword only arguments for block */
/* Pointer to the most recently allocated block. By following b_list
members, you can reach all early allocated blocks. */
basicblock *u_blocks;
basicblock *u_curblock; /* pointer to current block */
int u_nfblocks;
struct fblockinfo u_fblock[CO_MAXBLOCKS];
int u_firstlineno; /* the first lineno of the block */
int u_lineno; /* the lineno for the current stmt */
int u_col_offset; /* the offset of the current stmt */
int u_lineno_set; /* boolean to indicate whether instr
has been generated with current lineno */
};
x
typedef struct basicblock_ {
/* Each basicblock in a compilation unit is linked via b_list in the
reverse order that the block are allocated. b_list points to the next
block, not to be confused with b_next, which is next by control flow. */
struct basicblock_ *b_list;
/* number of instructions used */
int b_iused;
/* length of instruction array (b_instr) */
int b_ialloc;
/* pointer to an array of instructions, initially NULL */
struct instr *b_instr;
/* If b_next is non-NULL, it is a pointer to the next
block reached by normal control flow. */
struct basicblock_ *b_next;
/* b_seen is used to perform a DFS of basicblocks. */
unsigned b_seen : 1;
/* b_return is true if a RETURN_VALUE opcode is inserted. */
unsigned b_return : 1;
/* depth of stack upon entry of block, computed by stackdepth() */
int b_startdepth;
/* instruction offset for block, computed by assemble_jump_offsets() */
int b_offset;
} basicblock;
struct instr {
unsigned i_jabs : 1; // 绝对跳转 jump absolute
unsigned i_jrel : 1; // 相对跳转 jump relative
unsigned i_hasarg : 1;
unsigned char i_opcode;
int i_oparg;
struct basicblock_ *i_target; /* target block (if jump instruction) */
int i_lineno;
};
...
totsize = 0;
for (i = a->a_nblocks - 1; i >= 0; i--) {
b = a->a_postorder[i];
bsize = blocksize(b);
b->b_offset = totsize;
totsize += bsize;
}
...
x
...
for (b = c->u->u_blocks; b != NULL; b = b->b_list) {
bsize = b->b_offset;
for (i = 0; i < b->b_iused; i++) {
struct instr *instr = &b->b_instr[i];
/* Relative jumps are computed relative to
the instruction pointer after fetching
the jump instruction.
*/
bsize += instrsize(instr);
if (instr->i_jabs) // 绝对跳转
instr->i_oparg = instr->i_target->b_offset;
else if (instr->i_jrel) { // 相对跳转
int delta = instr->i_target->b_offset - bsize;
instr->i_oparg = delta;
}
else
continue;
if (instr->i_oparg > 0xffff)
extended_arg_count++;
}
}
...