Home >Database >Mysql Tutorial >Mysql源码学习――打造专属语法_MySQL

Mysql源码学习――打造专属语法_MySQL

WBOY
WBOYOriginal
2016-06-01 13:44:111091browse

bitsCN.com

 

语法分析——YACC

 

         接触过SQL语句的人都会看过这家或者那家的SQL手册,其语法标准应该是从SQL92开始吧,在看SQL92标准的时候,你会发现里面定义的都是一些巴科斯范式(BNF),就是一种语法定义的标准。不管是牛X哄哄的ORACLE,还是不幸被其收购的Mysql,都会遵循里面的标准语法,当然一些扩展的语法除外,比如今天我们就会扩展一个简单的语法^-^。

 

         OK,大家知道了SQL语法的来源,那么如何进行语法解析呢?YACC!!(Yet Another Compiler Compiler),它的书写方式便是BNF,语法解析的利器。YACC接收来自词法分析阶段分解出来的token,然后去匹配那些BNF。今天哥就来揭开它的面纱。(关于YACC的基本使用方法,大家可以看我上一篇中提到IBM的链接,一定要看懂那个先)

 

         继续上一节的语句SELECT @@VERSION_COMMENT,为了简单,这里省去后缀limit 1。Mysql的语法文件是sql_yacc.yy,首先给出这条语句涉及到的语法节点(大体浏览下即可):

 

?

 query:

END_OF_INPUT

{...}

|| verb_clause

{...}

| verb_clause END_OF_INPUT

          {

            /* Single query, not terminated. */

            YYLIP->found_semicolon= NULL;

          }

 

/* A verb clause is either a statement or the begin rule; no action is attached. */
verb_clause:

          statement

        | begin

        ;

 

/*
  Dispatch rule: one alternative per statement kind, none with an action
  of its own. Each name is a nonterminal; of these, only `select` is
  defined in this excerpt.
*/
statement:

          alter

        | analyze

        | backup

        | binlog_base64_event

        | call

        | change

        | check

        | checksum

        | commit

        | create

        | deallocate

        | delete

        | describe

        | do

        | drop

        | execute

        | flush

        | grant

        | handler

        | help

        | insert

        | install

        | kill

        | load

        | lock

        | optimize

        | keycache

        | partition_entry

        | preload

        | prepare

        | purge

        | release

        | rename

        | repair

        | replace

        | reset

        | restore

        | revoke

        | rollback

        | savepoint

        /* Alternative taken by the SELECT example traced in this article. */
        | select

        | set

        | show

        | slave

        | start

        | truncate

        | uninstall

        | unlock

        | update

        | use

        | xa

        ;

 

/* A SELECT statement: after select_init has been matched, record the command type on the LEX. */
select:

          select_init

          {

            LEX *lex= Lex;

            /* Mark the statement being parsed as SQLCOM_SELECT. */
            lex->sql_command= SQLCOM_SELECT;

          }

        ;

 

/*
  Two forms: the SELECT_SYM token followed by select_init2, or a
  parenthesised select_paren followed by union_opt.
*/
select_init:

          SELECT_SYM select_init2

        | '(' select_paren ')' union_opt

        ;

 

 

/*
  select_part2 followed by union_clause, with a validation action in
  between. Because the action sits before union_clause, it runs as soon as
  select_part2 has been matched.
*/
select_init2:

          select_part2

          {

            LEX *lex= Lex;

            SELECT_LEX * sel= lex->current_select;

            /* A non-zero result from set_braces(0) is reported as a syntax error. */
            if (lex->current_select->set_braces(0))

            {

              my_parse_error(ER(ER_SYNTAX_ERROR));

              MYSQL_YYABORT;

            }

            /*
              Also a syntax error: this select is linked as UNION_TYPE while
              the first select of its master unit has `braces` set.
            */
            if (sel->linkage == UNION_TYPE &&

                sel->master_unit()->first_select()->braces)

            {

              my_parse_error(ER(ER_SYNTAX_ERROR));

              MYSQL_YYABORT;

            }

          }

          union_clause

        ;

 

/*
  Body of a SELECT after the keyword: options, the item list, then
  select_into and select_lock_type. The leading action runs before any
  symbol of this rule is matched; the second one runs right after
  select_item_list.
*/
select_part2:

          {

            LEX *lex= Lex;

            SELECT_LEX *sel= lex->current_select;

            /* mysql_init_select() is skipped when the current select is linked as UNION_TYPE. */
            if (sel->linkage != UNION_TYPE)

              mysql_init_select(lex);

            /* Flag that the select list is being parsed. */
            lex->current_select->parsing_place= SELECT_LIST;

          }

          select_options select_item_list

          {

            /* Select list finished: reset the parsing place. */
            Select->parsing_place= NO_MATTER;

          }

          select_into select_lock_type

        ;

?

 

/*
  Comma-separated list of select items (left-recursive), a single item,
  or a bare '*'.
*/
select_item_list:

          select_item_list ',' select_item

        | select_item

        | '*'

          {

            THD *thd= YYTHD;

            /* Represent '*' as an Item_field named "*", allocated on thd->mem_root. */
            Item *item= new (thd->mem_root)

                          Item_field(&thd->lex->current_select->context,

                                     NULL, NULL, "*");

            /* Abort parsing if the allocation failed or the item cannot be added. */
            if (item == NULL)

              MYSQL_YYABORT;

            if (add_item_to_list(thd, item))

              MYSQL_YYABORT;

            /* Count the wildcard on the current select. */
            (thd->lex->current_select->with_wild)++;

          }

        ;

 

/*
  One expression of the select list with an optional alias.
  $1 (remember_name) and $3 (remember_end) are used below as the start and
  end pointers of the expression text, $2 is the item and $4 the alias.
*/
select_item:
          remember_name select_item2 remember_end select_alias
          {
            THD *thd= YYTHD;
            /* The expression text must start before it ends. */
            DBUG_ASSERT($1 < $3);

            if (add_item_to_list(thd, $2))
              MYSQL_YYABORT;
            if ($4.str)
            {
              /* An explicit alias is validated only for CREATE VIEW. */
              if (Lex->sql_command == SQLCOM_CREATE_VIEW &&
                  check_column_name($4.str))
              {
                my_error(ER_WRONG_COLUMN_NAME, MYF(0), $4.str);
                MYSQL_YYABORT;
              }
              $2->is_autogenerated_name= FALSE;
              $2->set_name($4.str, $4.length, system_charset_info);
            }
            else if (!$2->name)
            {
              /* No alias and no name yet: use the expression text itself. */
              $2->set_name($1, (uint) ($3 - $1), thd->charset());
            }
          }
        ;

 

/*
  A variable reference introduced by '@'. The check runs as a mid-rule
  action, i.e. right after the '@' and before variable_aux is parsed.
*/
variable:

          '@'

          {

            /* Reject the reference when the current parsing options forbid variables. */
            if (! Lex->parsing_options.allows_variable)

            {

              my_error(ER_VIEW_SELECT_VARIABLE, MYF(0));

              MYSQL_YYABORT;

            }

          }

          variable_aux

          {

            /* The mid-rule action above occupies $2, so variable_aux is $3. */
            $$= $3;

          }

        ;

 

/*
  What may follow the first '@' (consumed by the `variable` rule):
    - ident_or_text SET_VAR expr : builds an Item_func_set_user_var
    - ident_or_text              : builds an Item_func_get_user_var
    - '@' ...                    : a second '@'; resolved via get_system_var()
*/
variable_aux:

          ident_or_text SET_VAR expr

          {

            Item_func_set_user_var *item;

            $$= item= new (YYTHD->mem_root) Item_func_set_user_var($1, $3);

            if ($$ == NULL)

              MYSQL_YYABORT;

            LEX *lex= Lex;

            lex->uncacheable(UNCACHEABLE_RAND);

            /* The item is also appended to the LEX's set_var_list. */
            lex->set_var_list.push_back(item);

          }

        | ident_or_text

          {

            $$= new (YYTHD->mem_root) Item_func_get_user_var($1);

            if ($$ == NULL)

              MYSQL_YYABORT;

            LEX *lex= Lex;

            lex->uncacheable(UNCACHEABLE_RAND);

          }

        | '@' opt_var_ident_type ident_or_text opt_component

          {

            /* disallow "SELECT @@global.global.variable" */

            if ($3.str && $4.str && check_reserved_words(&$3))

            {

              my_parse_error(ER(ER_SYNTAX_ERROR));

              MYSQL_YYABORT;

            }

            /* A NULL result from get_system_var() aborts the parse. */
            if (!($$= get_system_var(YYTHD, $2, $3, $4)))

              MYSQL_YYABORT;

            /* Flag the statement as unsafe when is_written_to_binlog() is false. */
            if (!((Item_func_get_system_var*) $$)->is_written_to_binlog())

              Lex->set_stmt_unsafe();

          }

        ;

下面我们仔细的来看一下整个SELECT语法节点的执行流程:

 

?

query->verb_clause->statement->select->select_init->select_init2->select_part2->select_item_list->select_item…->variable

语法是自上而下的,实际的解析过程是自下而上的匹配过程。词法分析首先给yacc送来SELECT关键字,上一节提到过SELECT是关键字,那么它为什么是关键字呢?

 

我们看下sql_yacc.yy,可以找到如下一个定义:

 

?

%token  SELECT_SYM                    /* SQL-2003-R */

这里其实是定义了一个宏SELECT_SYM,代表一个关键字,宏定义如下:

 

?

#define SELECT_SYM 687

那么字符串"SELECT"和SELECT_SYM是如何联系在一起的呢?我们回头看下MYSQLlex中的find_keyword这个函数:

?

/*
  Check whether the current token is a known keyword (or, when `function`
  is true, a known function name).

  lip       lexer input stream; the token starts at lip->get_tok_start()
  len       length of the token in bytes
  function  forwarded to get_hash_symbol() to select the lookup table

  Returns 0 when nothing matches. On a match, lip->yylval->symbol is filled
  in and the symbol's token code is returned, except that NOT_SYM becomes
  NOT2_SYM under MODE_HIGH_NOT_PRECEDENCE and OR_OR_SYM becomes OR2_SYM
  unless MODE_PIPES_AS_CONCAT is set.
*/
static int find_keyword(Lex_input_stream *lip, uint len, bool function)
{
  const char *token_start= lip->get_tok_start();
  SYMBOL *match= get_hash_symbol(token_start, len, function);

  if (!match)
    return 0;

  /* Expose the matched symbol and its source text to the parser. */
  lip->yylval->symbol.symbol= match;
  lip->yylval->symbol.str= (char*) token_start;
  lip->yylval->symbol.length= len;

  /* Two tokens depend on the session's sql_mode. */
  if (match->tok == NOT_SYM &&
      (lip->m_thd->variables.sql_mode & MODE_HIGH_NOT_PRECEDENCE))
    return NOT2_SYM;

  if (match->tok == OR_OR_SYM &&
      !(lip->m_thd->variables.sql_mode & MODE_PIPES_AS_CONCAT))
    return OR2_SYM;

  return match->tok;
}

 

static SYMBOL *get_hash_symbol(const char *s,

                               unsigned int len,bool function)

{

  register uchar *hash_map;

  register const char *cur_str= s;

 

  if (len == 0) {

    DBUG_PRINT("warning", ("get_hash_symbol() received a request for a zero-length symbol, which is probably a mistake."));

    return(NULL);

  }

  if (function){

    if (len>sql_functions_max_len) return 0;

    hash_map= sql_functions_map;

    register uint32 cur_struct= uint4korr(hash_map+((len-1)*4));

 

    for (;;){

      register uchar first_char= (uchar)cur_struct;

 

      if (first_char == 0)

      {

        register int16 ires= (int16)(cur_struct>>16);

        if (ires==array_elements(symbols)) return 0;

        register SYMBOL *res;

        if (ires>=0)

          res= symbols+ires;

        else

          res= sql_functions-ires-1;

          register uint count= (uint) (cur_str - s);

        return lex_casecmp(cur_str,res->name+count,len-count) ? 0 : res;

      }

 

      register uchar cur_char= (uchar)to_upper_lex[(uchar)*cur_str];

      if (cur_char

      cur_struct>>=8;

      if (cur_char>(uchar)cur_struct) return 0;

 

      cur_struct>>=8;

      cur_struct= uint4korr(hash_map+

                        (((uint16)cur_struct + cur_char - first_char)*4));

      cur_str++;

    }

  }else{

    if (len>symbols_max_len) return 0;

    hash_map= symbols_map;

    register uint32 cur_struct= uint4korr(hash_map+((len-1)*4));

 

    for (;;){

      register uchar first_char= (uchar)cur_struct;

 

      if (first_char==0){

        register int16 ires= (int16)(cur_struct>>16);

        if (ires==array_elements(symbols)) return 0;

        register SYMBOL *res= symbols+ires;

        register uint count= (uint) (cur_str - s);

        return lex_casecmp(cur_str,res->name+count,len-count)!=0 ? 0 : res;

      }

 

      register uchar cur_char= (uchar)to_upper_lex[(uchar)*cur_str];

      if (cur_char

      cur_struct>>=8;

      if (cur_char>(uchar)cur_struct) return 0;

 

      cur_struct>>=8;

      cur_struct= uint4korr(hash_map+

                        (((uint16)cur_struct + cur_char - first_char)*4));

      cur_str++;

    }

  }

}

其中的get_hash_symbol便是去系统中查找关键字,第三个参数function代表是否去查找系统函数,我们这里是系统变量,不是函数,故为FALSE。所有的关键字都挂在了hash_map上,即symbols_map上。symbols_map又是一堆处理过的数据:

?

 

static uchar symbols_map[11828]= {

'<', '>', 29, 0,

'!', '|', 32, 0,

'

'B', 'Y', 11, 1,

'A', 'W', 147, 2,

'A', 'V', 0, 4,

...

看一下这个文件的最上面的注释吧,看看有啥有用的信息,果然被找到了:

?

1

2

/* Do not edit this file!  This is generated by gen_lex_hash.cc

that seeks for a perfect hash function */

看到了这个注释,心中豁然开朗,原来lex_hash.h是由gen_lex_hash.cc进行生成的,大家千万不要自己进行编辑此文件啊!!

 

来gen_lex_hash.cc看下吧,看到了个main函数,里面是一些生成文件的操作,在generate_find_structs函数中找到了insert_symbols,

 

这应该是初始化我们的symbols_map数组了吧。

 

?

 

void insert_symbols()

{

  size_t i= 0;

  SYMBOL *cur;

  for (cur= symbols; i

    hash_lex_struct *root=

      get_hash_struct_by_len(&root_by_len,cur->length,&max_len);

    insert_into_hash(root,cur->name,0,(uint) i,0);

  }

}

看到函数的实现是循环取数组symbols,找到symbols定义,在文件lex.h中,看到这个数组,我想大家就会了然了:

?

1

{ "SELECT",     SYM(SELECT_SYM)},

这就是将SELECT字符串与SELECT_SYM关联的地方了,bingo!

 

我们再来捋一下SELECT解析的思路,词法分析解析到SELECT后,执行find_keyword去找是否是关键字,发现SELECT是关键字,

 

于是给yacc返回SELECT_SYM用于语法分析。note:如果我们想要加关键字,只需在sql_yacc.yy上面添加一个%token xxx,

 

然后在lex.h里面加入相应的字符串和SYM的对应即可。

 

下面看下@@version_comment这个系统变量如何解析的,首先给出其语法节点:

 

?

 

variable_aux:

...

  | '@' opt_var_ident_type ident_or_text opt_component

          {

            /* disallow "SELECT @@global.global.variable" */

            if ($3.str && $4.str && check_reserved_words(&$3))

            {

              my_parse_error(ER(ER_SYNTAX_ERROR));

              MYSQL_YYABORT;

            }

            if (!($$= get_system_var(YYTHD, $2, $3, $4)))

              MYSQL_YYABORT;

            if (!((Item_func_get_system_var*) $$)->is_written_to_binlog())

              Lex->set_stmt_unsafe();

          }

        ;

这里便是查找系统变量的地方了:get_system_var,我们跟进去看下:

 

?

 

Item *get_system_var(THD *thd, enum_var_type var_type, LEX_STRING name,

             LEX_STRING component)

{

  sys_var *var;

  LEX_STRING *base_name, *component_name;

 

  if (component.str)

  {

    base_name= &component;

    component_name= &name;

  }

  else

  {

    base_name= &name;

    component_name= &component;         // Empty string

  }

 

  if (!(var= find_sys_var(thd, base_name->str, base_name->length)))

    return 0;

  if (component.str)

  {

    if (!var->is_struct())

    {

      my_error(ER_VARIABLE_IS_NOT_STRUCT, MYF(0), base_name->str);

      return 0;

    }

  }

  thd->lex->uncacheable(UNCACHEABLE_SIDEEFFECT);

 

  set_if_smaller(component_name->length, MAX_SYS_VAR_LENGTH);

 

  return new Item_func_get_system_var(var, var_type, component_name,

                                      NULL, 0);

}

    由find_sys_var函数不断跟进去,我们跟到了set_var.cc,找到了如下定义:

 

?

1

static sys_var_chain vars = { NULL, NULL };

    系统变量都会挂载在此链上。在文件中,搜索到了version_comment:

 

?

 

static sys_var_const_str    sys_version_comment(&vars, "version_comment",

                                            MYSQL_COMPILATION_COMMENT);

?

1

#define MYSQL_COMPILATION_COMMENT   "Source distribution"

这便是将version_comment加载到vars的链表上。

 

OK,我们也来加一个自己的系统变量:

 

?

 

static sys_var_const_str    sys_version_comment(&vars, "version_comment",

                                            MYSQL_COMPILATION_COMMENT);

 

/**add by nocode */

static sys_var_const_str    sys_version_comment_test(&vars, "nocode_test_sysvar",

                                            MYSQL_COMPILATION_NOCODE_TEST_SYSVAR);

#define MYSQL_COMPILATION_COMMENT    "Source distribution"

#define MYSQL_COMPILATION_NOCODE_TEST_SYSVAR  "No code in heart"    /*add by nocode*/

 

?

1

 

注释add by nocode的地方,即是新添加的系统变量和宏定义,我们的系统变量叫@@nocode_test_sysvar,其值为"No code in heart"。OK,重新编译代码,执行SELECT语句,OK了。

?

 

mysql> select @@nocode_test_sysvar;

+----------------------+

| @@nocode_test_sysvar |

+----------------------+

| No code in heart     |

+----------------------+

1 row in set (0.01 sec)

上面添加了一个系统变量,并没有修改语法文件sql_yacc.yy,为了加深理解,我们添加一个属于自己的语法:nocode语法,为了简单化实现,我们的目标很简单,在客户端输入no_code后显示字符串"MAKE BY NOCODE"。

定义关键字

首先在sql_yacc.yy文件中添加相应的SYMBOL

?

 

%token  NO_SYM                        /* SQL-2003-R */

%token  NO_CODE_SYM                   /* add by nocode*/

%token  NO_WAIT_SYM

然后在lex.h中的symbols数组中添加nocode的字符串和符号的对应关系:

?

 

{ "NO",       SYM(NO_SYM)},

{ "NO_CODE",      SYM(NO_CODE_SYM)}, /*add by nocode*/

{ "NO_WAIT",      SYM(NO_WAIT_SYM)},

ok,至此我们关键字已经添加进去了

 

添加语法节点

我们给语法分支节点起名叫nocode,定义如下:

 

?

 

/**add by nocode*/

/*
  Custom statement: the single token NO_CODE_SYM. The action sets up the
  current select, adds one string item with the text "MAKE BY NOCODE" to
  its item list and marks the command as SQLCOM_SELECT.
*/
nocode:

        NO_CODE_SYM

        {

            THD *thd= YYTHD;

            LEX *lex= Lex;

            SELECT_LEX *sel= lex->current_select;

            Item_string* field;

            LEX_STRING tmp;

            CHARSET_INFO *cs_con= thd->variables.collation_connection;

            CHARSET_INFO *cs_cli= thd->variables.character_set_client;

 

            /* Same preparation as the start of select_part2. */
            if (sel->linkage != UNION_TYPE)

                mysql_init_select(lex);

            lex->current_select->parsing_place= SELECT_LIST;

 

            /*
              ?: binds looser than &&, so this reads
              (text_string_is_7bit && ascii_based) ? ASCII : UNICODE30.
            */
            uint repertoire= thd->lex->text_string_is_7bit &&

                my_charset_is_ascii_based(cs_cli) ? MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;

 

            tmp.str = "MAKE BY NOCODE";

            tmp.length = strlen(tmp.str);

 

            /* The string item is allocated on thd->mem_root with the connection collation. */
            field= new (thd->mem_root) Item_string(tmp.str, tmp.length, cs_con,

                DERIVATION_COERCIBLE,

                repertoire);

            /* Abort parsing if the allocation failed or the item cannot be added. */
            if (field== NULL)

                MYSQL_YYABORT;

 

            if (add_item_to_list(thd, field))

                MYSQL_YYABORT;

 

            /* Item list complete: reset the parsing place and tag the statement as a SELECT. */
            Select->parsing_place= NO_MATTER;

            lex->sql_command= SQLCOM_SELECT;

        }

        ;

    最后要在statement的语法节点上加入nocode分支,我就不贴出来了。只要读到"no_code"便会进入这个语法分支。在这个分支里,做了一些操作,首先构造了一个SELECT类型的语句,然后对其添加了一列,这列的名称就是"MAKE BY NOCODE"…具体的细节大家自己研究吧,这都不是本文的重点。

 

    语法添加完之后,我们重新编译项目,值得说明的是,Mysql的项目组织还是非常好的,修改了语法文件之后,不需要我们自己去用bison编译,项目自动就帮我们编译好了,真是不错。重启服务器,在客户端输入no_code,结果如下:

 

?

 

mysql> no_code;

+----------------+

| MAKE BY NOCODE |

+----------------+

| MAKE BY NOCODE |

+----------------+

1 row in set (3.02 sec)

语法分析到此结束。这里只添加了一个很简单的语法分支,没啥用处,主要是介绍下添加分支的步骤,大家添加分支的时候要尽量使用已有的分支,既减少劳动量,同时也会减少语法冲突。 唠叨两句,最近项目太紧张,压力山大,每晚都被噩梦惊醒,噩梦中总会想到算法的各种BUG,写个代码都提心吊胆的,哎,搞IT的真是悲催啊。PS 终于又更新了一篇,oh yeah,-_-ps again: 第一次用windows live writer写博客,感觉比网页方便多了~~,赞一个


摘自 心中无码 bitsCN.com

Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn