search
HomeDatabaseMysql TutorialMysql源码学习――打造专属语法_MySQL

bitsCN.com

 

语法分析——YACC

 

         接触过SQL语句的人都会看过这家或者那家的SQL手册,其语法标准应该是从SQL92开始吧,在看SQL92标准的时候,你会发现里面定义的都是一些巴科斯范式(BNF),就是一种语法定义的标准。不管是牛X哄哄的ORACLE,还是不幸被其收购的Mysql,都会遵循里面的标准语法,当然一些扩展的语法除外,比如今天我们就会扩展一个简单的语法^-^。

 

         OK,大家知道了SQL语法的来源,那么如何进行语法解析呢?YACC!!(Yet Another Compiler Compiler),它的书写方式便是BNF,语法解析的利器。YACC接收来自词法分析阶段分解出来的token,然后去匹配那些BNF。今天哥就来揭开它的面纱。(关于YACC的基本使用方法,大家可以看我上一篇中提到IBM的链接,一定要看懂那个先)

 

         继续上一节的语句SELECT @@VERSION_COMMENT,为了简单,这里省去后缀limit 1。Mysql的语法文件是sql_yacc.yy,首先给出这条语句涉及到的语法节点(大体浏览下即可):

 

?

 query:

END_OF_INPUT

{...}

|| verb_clause

{...}

| verb_clause END_OF_INPUT

          {

            /* Single query, not terminated. */

            YYLIP->found_semicolon= NULL;

          }

 

/* A verb_clause is either one statement or a begin clause. */
verb_clause:

          statement

        | begin

        ;

 

/*
  Dispatch rule: one alternative per kind of statement. Each alternative is
  a nonterminal defined elsewhere in the grammar; no action is attached here.
*/
statement:

          alter

        | analyze

        | backup

        | binlog_base64_event

        | call

        | change

        | check

        | checksum

        | commit

        | create

        | deallocate

        | delete

        | describe

        | do

        | drop

        | execute

        | flush

        | grant

        | handler

        | help

        | insert

        | install

        | kill

        | load

        | lock

        | optimize

        | keycache

        | partition_entry

        | preload

        | prepare

        | purge

        | release

        | rename

        | repair

        | replace

        | reset

        | restore

        | revoke

        | rollback

        | savepoint

        | select

        | set

        | show

        | slave

        | start

        | truncate

        | uninstall

        | unlock

        | update

        | use

        | xa

        ;

 

/* A SELECT statement: once select_init has matched, tag the LEX as SQLCOM_SELECT. */
select:

          select_init

          {

            LEX *lex= Lex;

            lex->sql_command= SQLCOM_SELECT;

          }

        ;

 

/*
  Entry of a SELECT: either the SELECT_SYM token followed by select_init2,
  or a parenthesized select followed by union_opt.
*/
select_init:

          SELECT_SYM select_init2

        | '(' select_paren ')' union_opt

        ;

 

 

/*
  Body of a SELECT that was not opened with '(' : select_part2, a mid-rule
  validation action, then union_clause.
*/
select_init2:

          select_part2

          {

            LEX *lex= Lex;

            SELECT_LEX * sel= lex->current_select;

            /* A non-zero result from set_braces(0) is reported as a syntax error. */
            if (lex->current_select->set_braces(0))

            {

              my_parse_error(ER(ER_SYNTAX_ERROR));

              MYSQL_YYABORT;

            }

            /*
              Reject this select when it is linked as UNION_TYPE while the
              first select of its master unit has the braces flag set.
            */
            if (sel->linkage == UNION_TYPE &&

                sel->master_unit()->first_select()->braces)

            {

              my_parse_error(ER(ER_SYNTAX_ERROR));

              MYSQL_YYABORT;

            }

          }

          union_clause

        ;

 

/*
  Core of a SELECT: options and the select list, then select_into and
  select_lock_type. Two mid-rule actions bracket the select list.
*/
select_part2:

          {

            LEX *lex= Lex;

            SELECT_LEX *sel= lex->current_select;

            /* mysql_init_select() is skipped when this select is linked as UNION_TYPE. */
            if (sel->linkage != UNION_TYPE)

              mysql_init_select(lex);

            /* Mark that the select list is being parsed. */
            lex->current_select->parsing_place= SELECT_LIST;

          }

          select_options select_item_list

          {

            /* Select list finished: reset the parsing place. */
            Select->parsing_place= NO_MATTER;

          }

          select_into select_lock_type

        ;

?

 

/*
  Comma-separated list of select items (left-recursive), or a bare '*'.
*/
select_item_list:

          select_item_list ',' select_item

        | select_item

        | '*'

          {

            THD *thd= YYTHD;

            /* Represent '*' as an Item_field named "*" in the current select's context. */
            Item *item= new (thd->mem_root)

                          Item_field(&thd->lex->current_select->context,

                                     NULL, NULL, "*");

            if (item == NULL)

              MYSQL_YYABORT;

            if (add_item_to_list(thd, item))

              MYSQL_YYABORT;

            /* Count the wildcard on the current select. */
            (thd->lex->current_select->with_wild)++;

          }

        ;

 

/*
  One entry of the select list: the item itself ($2), bracketed by
  remember_name ($1) and remember_end ($3), plus an optional alias ($4).
  The item is appended to the item list and then named, either from the
  alias or, if it has no name yet, from the text between $1 and $3.
*/
select_item:
          remember_name select_item2 remember_end select_alias
          {
            THD *thd= YYTHD;
            /*
              $1 and $3 are used below as start and end of the item's text
              ($3 - $1 is its length), so the start must precede the end.
            */
            DBUG_ASSERT($1 < $3);

            if (add_item_to_list(thd, $2))
              MYSQL_YYABORT;
            if ($4.str)
            {
              /* For CREATE VIEW the alias is validated with check_column_name(). */
              if (Lex->sql_command == SQLCOM_CREATE_VIEW &&
                  check_column_name($4.str))
              {
                my_error(ER_WRONG_COLUMN_NAME, MYF(0), $4.str);
                MYSQL_YYABORT;
              }
              /* An explicit alias was given, so the name is not auto-generated. */
              $2->is_autogenerated_name= FALSE;
              $2->set_name($4.str, $4.length, system_charset_info);
            }
            else if (!$2->name)
            {
              /* No alias and no name yet: use the item's own text as its name. */
              $2->set_name($1, (uint) ($3 - $1), thd->charset());
            }
          }
        ;

 

/*
  A variable reference: '@' followed by variable_aux. The mid-rule action
  rejects the reference when the current parsing options do not allow
  variables.
*/
variable:

          '@'

          {

            if (! Lex->parsing_options.allows_variable)

            {

              my_error(ER_VIEW_SELECT_VARIABLE, MYF(0));

              MYSQL_YYABORT;

            }

          }

          variable_aux

          {

            /* The mid-rule action above occupies $2, so variable_aux is $3. */
            $$= $3;

          }

        ;

 

/*
  What may follow the first '@' of a variable reference:
    - ident_or_text SET_VAR expr : builds an Item_func_set_user_var
    - ident_or_text              : builds an Item_func_get_user_var
    - '@' [type] name [component]: resolved through get_system_var()
*/
variable_aux:

          ident_or_text SET_VAR expr

          {

            Item_func_set_user_var *item;

            $$= item= new (YYTHD->mem_root) Item_func_set_user_var($1, $3);

            if ($$ == NULL)

              MYSQL_YYABORT;

            LEX *lex= Lex;

            lex->uncacheable(UNCACHEABLE_RAND);

            /* Register the new item in the statement's set_var_list. */
            lex->set_var_list.push_back(item);

          }

        | ident_or_text

          {

            $$= new (YYTHD->mem_root) Item_func_get_user_var($1);

            if ($$ == NULL)

              MYSQL_YYABORT;

            LEX *lex= Lex;

            lex->uncacheable(UNCACHEABLE_RAND);

          }

        | '@' opt_var_ident_type ident_or_text opt_component

          {

            /* disallow "SELECT @@global.global.variable" */

            if ($3.str && $4.str && check_reserved_words(&$3))

            {

              my_parse_error(ER(ER_SYNTAX_ERROR));

              MYSQL_YYABORT;

            }

            /* A NULL result from get_system_var() aborts the parse. */
            if (!($$= get_system_var(YYTHD, $2, $3, $4)))

              MYSQL_YYABORT;

            /* Mark the statement unsafe when the item reports it is not written to the binlog. */
            if (!((Item_func_get_system_var*) $$)->is_written_to_binlog())

              Lex->set_stmt_unsafe();

          }

        ;

下面我们仔细的来看一下整个SELECT语法节点的执行流程:

 

?

query->verb_clause->statement->select->select_init->select_init2->select_part2->select_item_list->select_item…->variable

语法是自上而下的,实际的解析过程是自下而上的匹配过程。词法分析首先向yacc送来SELECT关键字。上一节说过SELECT是关键字,那么为什么它是关键字呢?

 

我们看下sql_yacc.yy,可以找到如下一个定义:

 

?

%token  SELECT_SYM                    /* SQL-2003-R */

这里其实是定义了一个宏SELECT_SYM,代表一个关键字,宏定义如下:

 

?

#define SELECT_SYM 687

那么字符串"SELECT"和SELECT_SYM是如何联系在一起的呢?我们回头看下MYSQLlex中的find_keyword这个函数:

?

/*
  Check whether the current token of the input stream is a known symbol.

  lip       lexer input stream; the token starts at lip->get_tok_start()
  len       length of the token
  function  passed through to get_hash_symbol() to select the lookup table

  Returns the symbol's token code when the token is found (with the two
  sql_mode-dependent substitutions below), 0 otherwise. On a hit the symbol,
  the token start and the token length are stored in lip->yylval->symbol.
*/
static int find_keyword(Lex_input_stream *lip, uint len, bool function)

{

  const char *tok= lip->get_tok_start();

 

  SYMBOL *symbol= get_hash_symbol(tok, len, function);

  if (symbol)

  {

    lip->yylval->symbol.symbol=symbol;

    lip->yylval->symbol.str= (char*) tok;

    lip->yylval->symbol.length=len;

 

    /* NOT_SYM is reported as NOT2_SYM when MODE_HIGH_NOT_PRECEDENCE is set. */
    if ((symbol->tok == NOT_SYM) &&

        (lip->m_thd->variables.sql_mode & MODE_HIGH_NOT_PRECEDENCE))

      return NOT2_SYM;

    /* OR_OR_SYM is reported as OR2_SYM unless MODE_PIPES_AS_CONCAT is set. */
    if ((symbol->tok == OR_OR_SYM) &&

    !(lip->m_thd->variables.sql_mode & MODE_PIPES_AS_CONCAT))

      return OR2_SYM;

 

    return symbol->tok;

  }

  return 0;

}

 

static SYMBOL *get_hash_symbol(const char *s,

                               unsigned int len,bool function)

{

  register uchar *hash_map;

  register const char *cur_str= s;

 

  if (len == 0) {

    DBUG_PRINT("warning", ("get_hash_symbol() received a request for a zero-length symbol, which is probably a mistake."));

    return(NULL);

  }

  if (function){

    if (len>sql_functions_max_len) return 0;

    hash_map= sql_functions_map;

    register uint32 cur_struct= uint4korr(hash_map+((len-1)*4));

 

    for (;;){

      register uchar first_char= (uchar)cur_struct;

 

      if (first_char == 0)

      {

        register int16 ires= (int16)(cur_struct>>16);

        if (ires==array_elements(symbols)) return 0;

        register SYMBOL *res;

        if (ires>=0)

          res= symbols+ires;

        else

          res= sql_functions-ires-1;

          register uint count= (uint) (cur_str - s);

        return lex_casecmp(cur_str,res->name+count,len-count) ? 0 : res;

      }

 

      register uchar cur_char= (uchar)to_upper_lex[(uchar)*cur_str];

      if (cur_char

      cur_struct>>=8;

      if (cur_char>(uchar)cur_struct) return 0;

 

      cur_struct>>=8;

      cur_struct= uint4korr(hash_map+

                        (((uint16)cur_struct + cur_char - first_char)*4));

      cur_str++;

    }

  }else{

    if (len>symbols_max_len) return 0;

    hash_map= symbols_map;

    register uint32 cur_struct= uint4korr(hash_map+((len-1)*4));

 

    for (;;){

      register uchar first_char= (uchar)cur_struct;

 

      if (first_char==0){

        register int16 ires= (int16)(cur_struct>>16);

        if (ires==array_elements(symbols)) return 0;

        register SYMBOL *res= symbols+ires;

        register uint count= (uint) (cur_str - s);

        return lex_casecmp(cur_str,res->name+count,len-count)!=0 ? 0 : res;

      }

 

      register uchar cur_char= (uchar)to_upper_lex[(uchar)*cur_str];

      if (cur_char

      cur_struct>>=8;

      if (cur_char>(uchar)cur_struct) return 0;

 

      cur_struct>>=8;

      cur_struct= uint4korr(hash_map+

                        (((uint16)cur_struct + cur_char - first_char)*4));

      cur_str++;

    }

  }

}

其中的get_hash_symbol便是去系统中查找关键字,第三个参数function代表是否去查找系统函数,我们这里是系统变量,不是函数,故为FALSE。所有的关键字都挂在了hash_map上,即symbols_map上。symbols_map又是一堆处理过的数据:

?

 

static uchar symbols_map[11828]= {

'', 29, 0,

'!', '|', 32, 0,

'

'B', 'Y', 11, 1,

'A', 'W', 147, 2,

'A', 'V', 0, 4,

...

看一下这个文件的最上面的注释吧,看看有啥有用的信息,果然被找到了:

?

1

2

/* Do not edit this file!  This is generated by gen_lex_hash.cc

that seeks for a perfect hash function */

看到了这个注释,心中豁然开朗,原来lex_hash.h是由gen_lex_hash.cc进行生成的,大家千万不要自己进行编辑此文件啊!!

 

来gen_lex_hash.cc看下吧,看到了个main函数,里面是一些生成文件的操作,在generate_find_structs函数中找到了insert_symbols,

 

这应该是初始化我们的symbols_map数组了吧。

 

?

 

void insert_symbols()

{

  size_t i= 0;

  SYMBOL *cur;

  for (cur= symbols; i

    hash_lex_struct *root=

      get_hash_struct_by_len(&root_by_len,cur->length,&max_len);

    insert_into_hash(root,cur->name,0,(uint) i,0);

  }

}

看到函数的实现是循环取数组symbols,找到symbols定义,在文件lex.h中,看到这个数组,我想大家就会了然了:

?

1

{ "SELECT",     SYM(SELECT_SYM)},

这就是将SELECT字符串与SELECT_SYM关联的地方了,bingo!

 

我们再来捋一下SELECT解析的思路,词法分析解析到SELECT后,执行find_keyword去找是否是关键字,发现SELECT是关键字,

 

于是给yacc返回SELECT_SYM用于语法分析。note:如果我们想要加关键字,只需在sql_yacc.yy上面添加一个%token xxx,

 

然后在lex.h里面加入相应的字符串和SYM的对应即可。

 

下面看下@@version_comment这个系统变量如何解析的,首先给出其语法节点:

 

?

 

variable_aux:

...

  | '@' opt_var_ident_type ident_or_text opt_component

          {

            /* disallow "SELECT @@global.global.variable" */

            if ($3.str && $4.str && check_reserved_words(&$3))

            {

              my_parse_error(ER(ER_SYNTAX_ERROR));

              MYSQL_YYABORT;

            }

            if (!($$= get_system_var(YYTHD, $2, $3, $4)))

              MYSQL_YYABORT;

            if (!((Item_func_get_system_var*) $$)->is_written_to_binlog())

              Lex->set_stmt_unsafe();

          }

        ;

这里便是查找系统变量的地方了:get_system_var,我们跟进去看下:

 

?

 

/*
  Build the item for a system variable reference.

  thd        current thread
  var_type   variable type, passed through to the created item
  name       first identifier of the reference
  component  second identifier; component.str is NULL when absent

  Returns a new Item_func_get_system_var, or 0 when the variable is not
  found or when a component is given for a variable that is not a struct.
*/
Item *get_system_var(THD *thd, enum_var_type var_type, LEX_STRING name,

             LEX_STRING component)

{

  sys_var *var;

  LEX_STRING *base_name, *component_name;

 

  /*
    With two identifiers the second one is the variable to look up and the
    first one becomes the component name; with one identifier it is the
    variable itself.
  */
  if (component.str)

  {

    base_name= &component;

    component_name= &name;

  }

  else

  {

    base_name= &name;

    component_name= &component;         // Empty string

  }

 

  if (!(var= find_sys_var(thd, base_name->str, base_name->length)))

    return 0;

  if (component.str)

  {

    /* A component is only accepted for variables that report is_struct(). */
    if (!var->is_struct())

    {

      my_error(ER_VARIABLE_IS_NOT_STRUCT, MYF(0), base_name->str);

      return 0;

    }

  }

  thd->lex->uncacheable(UNCACHEABLE_SIDEEFFECT);

 

  /* Cap the component name length at MAX_SYS_VAR_LENGTH. */
  set_if_smaller(component_name->length, MAX_SYS_VAR_LENGTH);

 

  return new Item_func_get_system_var(var, var_type, component_name,

                                      NULL, 0);

}

    由find_sys_var函数不断跟进去,我们跟到了set_var.cc,找到了如下定义:

 

?

1

static sys_var_chain vars = { NULL, NULL };

    系统变量都会挂载在此链上。在文件中,搜索到了version_comment:

 

?

 

static sys_var_const_str    sys_version_comment(&vars, "version_comment",

                                            MYSQL_COMPILATION_COMMENT);

?

1

#define MYSQL_COMPILATION_COMMENT   "Source distribution"

这便是将version_comment加载到vars的链表上。

 

OK,我们也来加一个自己的系统变量:

 

?

 

static sys_var_const_str    sys_version_comment(&vars, "version_comment",

                                            MYSQL_COMPILATION_COMMENT);

 

/**add by nocode */

static sys_var_const_str    sys_version_comment_test(&vars, "nocode_test_sysvar",

                                            MYSQL_COMPILATION_NOCODE_TEST_SYSVAR);

#define MYSQL_COMPILATION_COMMENT    "Source distribution"

#define MYSQL_COMPILATION_NOCODE_TEST_SYSVAR  "No code in heart"    /*add by nocode*/

 

?

1

 

注释add by nocode的地方,即是新添加的系统变量和宏定义,我们的系统变量叫@@nocode_test_sysvar,其值为No code in heart。OK,重新编译代码,执行SELECT语句,OK了。

?

 

mysql> select @@nocode_test_sysvar;

+----------------------+

| @@nocode_test_sysvar |

+----------------------+

| No code in heart     |

+----------------------+

1 row in set (0.01 sec)

上面添加了一个系统变量,并没有修改语法文件sql_yacc.yy,为了加深理解,我们添加一个属于自己的语法:nocode语法,为了简单化实现,我们的目标很简单,在客户端输入no_code后显示字符串"MAKE BY NOCODE"。

定义关键字

首先在sql_yacc.yy文件中添加相应的SYMBOL

?

 

%token  NO_SYM                        /* SQL-2003-R */

%token  NO_CODE_SYM                   /* add by nocode*/

%token  NO_WAIT_SYM

然后在lex.h中的symbols数组中添加nocode的字符串和符号的对应关系:

?

 

{ "NO",       SYM(NO_SYM)},

{ "NO_CODE",      SYM(NO_CODE_SYM)}, /*add by nocode*/

{ "NO_WAIT",      SYM(NO_WAIT_SYM)},

ok,至此我们关键字已经添加进去了

 

添加语法节点

我们给语法分支节点起名叫nocode,定义如下:

 

?

 

/**add by nocode*/
/*
  Statement "no_code": builds a SELECT whose select list holds a single
  string item with the text "MAKE BY NOCODE".
*/
nocode:
        NO_CODE_SYM
        {
            THD *thd= YYTHD;
            LEX *lex= Lex;
            SELECT_LEX *sel= lex->current_select;
            Item_string* field;
            /*
              Constant text kept in a const array: no conversion of a string
              literal to a non-const char*, and its length comes from sizeof
              instead of a runtime strlen().
            */
            static const char nocode_text[]= "MAKE BY NOCODE";
            CHARSET_INFO *cs_con= thd->variables.collation_connection;
            CHARSET_INFO *cs_cli= thd->variables.character_set_client;

            /* Same preparation as select_part2 performs before a select list. */
            if (sel->linkage != UNION_TYPE)
                mysql_init_select(lex);
            lex->current_select->parsing_place= SELECT_LIST;

            uint repertoire= (thd->lex->text_string_is_7bit &&
                              my_charset_is_ascii_based(cs_cli)) ?
                MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;

            field= new (thd->mem_root) Item_string(nocode_text,
                (uint) (sizeof(nocode_text) - 1), cs_con,
                DERIVATION_COERCIBLE,
                repertoire);
            if (field== NULL)
                MYSQL_YYABORT;

            if (add_item_to_list(thd, field))
                MYSQL_YYABORT;

            /* Select list finished; tag the statement as a SELECT. */
            Select->parsing_place= NO_MATTER;
            lex->sql_command= SQLCOM_SELECT;
        }
        ;

    最后要在statement的语法节点上加入nocode分支,我就不贴出来了。只要读到"no_code"便会进入这个语法分支。在这个分支里,做了一些操作,首先构造了一个SELECT类型的语句,然后对其添加了一列,这列的名称就是"MAKE BY NOCODE"…具体的细节大家自己研究吧,这都不是本文的重点。

 

    语法添加完之后,我们重新编译项目,值得说明的是,Mysql的项目组织还是非常好的,修改了语法文件之后,不需要我们自己去用bison编译,项目自动就帮我们编译好了,真是不错。重启服务器,在客户端输入no_code,结果如下:

 

?

 

mysql> no_code;

+----------------+

| MAKE BY NOCODE |

+----------------+

| MAKE BY NOCODE |

+----------------+

1 row in set (3.02 sec)

语法分析到此结束。这里只添加了一个很简单的语法分支,没啥用处,主要是介绍下添加分支的步骤,大家添加分支的时候要尽量使用已有的分支,既减少劳动量,同时也会减少语法冲突。 唠叨两句,最近项目太紧张,压力山大,每晚都被噩梦惊醒,噩梦中总会想到算法的各种BUG,写个代码都提心吊胆的,哎,搞IT的真是悲催啊。PS 终于又更新了一篇,oh yeah,-_-ps again: 第一次用windows live writer写博客,感觉比网页方便多了~~,赞一个


摘自 心中无码 bitsCN.com

Statement
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
MySQL String Types: Storage, Performance, and Best PracticesMySQL String Types: Storage, Performance, and Best PracticesMay 10, 2025 am 12:02 AM

MySQLstringtypesimpactstorageandperformanceasfollows:1)CHARisfixed-length,alwaysusingthesamestoragespace,whichcanbefasterbutlessspace-efficient.2)VARCHARisvariable-length,morespace-efficientbutpotentiallyslower.3)TEXTisforlargetext,storedoutsiderows,

Understanding MySQL String Types: VARCHAR, TEXT, CHAR, and MoreUnderstanding MySQL String Types: VARCHAR, TEXT, CHAR, and MoreMay 10, 2025 am 12:02 AM

MySQLstringtypesincludeVARCHAR,TEXT,CHAR,ENUM,andSET.1)VARCHARisversatileforvariable-lengthstringsuptoaspecifiedlimit.2)TEXTisidealforlargetextstoragewithoutadefinedlength.3)CHARisfixed-length,suitableforconsistentdatalikecodes.4)ENUMenforcesdatainte

What are the String Data Types in MySQL?What are the String Data Types in MySQL?May 10, 2025 am 12:01 AM

MySQLoffersvariousstringdatatypes:1)CHARforfixed-lengthstrings,2)VARCHARforvariable-lengthtext,3)BINARYandVARBINARYforbinarydata,4)BLOBandTEXTforlargedata,and5)ENUMandSETforcontrolledinput.Eachtypehasspecificusesandperformancecharacteristics,sochoose

How to Grant Permissions to New MySQL UsersHow to Grant Permissions to New MySQL UsersMay 09, 2025 am 12:16 AM

TograntpermissionstonewMySQLusers,followthesesteps:1)AccessMySQLasauserwithsufficientprivileges,2)CreateanewuserwiththeCREATEUSERcommand,3)UsetheGRANTcommandtospecifypermissionslikeSELECT,INSERT,UPDATE,orALLPRIVILEGESonspecificdatabasesortables,and4)

How to Add Users in MySQL: A Step-by-Step GuideHow to Add Users in MySQL: A Step-by-Step GuideMay 09, 2025 am 12:14 AM

ToaddusersinMySQLeffectivelyandsecurely,followthesesteps:1)UsetheCREATEUSERstatementtoaddanewuser,specifyingthehostandastrongpassword.2)GrantnecessaryprivilegesusingtheGRANTstatement,adheringtotheprincipleofleastprivilege.3)Implementsecuritymeasuresl

MySQL: Adding a new user with complex permissionsMySQL: Adding a new user with complex permissionsMay 09, 2025 am 12:09 AM

ToaddanewuserwithcomplexpermissionsinMySQL,followthesesteps:1)CreatetheuserwithCREATEUSER'newuser'@'localhost'IDENTIFIEDBY'password';.2)Grantreadaccesstoalltablesin'mydatabase'withGRANTSELECTONmydatabase.TO'newuser'@'localhost';.3)Grantwriteaccessto'

MySQL: String Data Types and CollationsMySQL: String Data Types and CollationsMay 09, 2025 am 12:08 AM

The string data types in MySQL include CHAR, VARCHAR, BINARY, VARBINARY, BLOB, and TEXT. The collations determine the comparison and sorting of strings. 1.CHAR is suitable for fixed-length strings, VARCHAR is suitable for variable-length strings. 2.BINARY and VARBINARY are used for binary data, and BLOB and TEXT are used for large object data. 3. Sorting rules such as utf8mb4_unicode_ci ignores upper and lower case and is suitable for user names; utf8mb4_bin is case sensitive and is suitable for fields that require precise comparison.

MySQL: What length should I use for VARCHARs?MySQL: What length should I use for VARCHARs?May 09, 2025 am 12:06 AM

The best MySQLVARCHAR column length selection should be based on data analysis, consider future growth, evaluate performance impacts, and character set requirements. 1) Analyze the data to determine typical lengths; 2) Reserve future expansion space; 3) Pay attention to the impact of large lengths on performance; 4) Consider the impact of character sets on storage. Through these steps, the efficiency and scalability of the database can be optimized.

See all articles

Hot AI Tools

Undresser.AI Undress

Undresser.AI Undress

AI-powered app for creating realistic nude photos

AI Clothes Remover

AI Clothes Remover

Online AI tool for removing clothes from photos.

Undress AI Tool

Undress AI Tool

Undress images for free

Clothoff.io

Clothoff.io

AI clothes remover

Video Face Swap

Video Face Swap

Swap faces in any video effortlessly with our completely free AI face swap tool!

Hot Tools

mPDF

mPDF

mPDF is a PHP library that can generate PDF files from UTF-8 encoded HTML. The original author, Ian Back, wrote mPDF to output PDF files "on the fly" from his website and handle different languages. It is slower than original scripts like HTML2FPDF and produces larger files when using Unicode fonts, but supports CSS styles etc. and has a lot of enhancements. Supports almost all languages, including RTL (Arabic and Hebrew) and CJK (Chinese, Japanese and Korean). Supports nested block-level elements (such as P, DIV),

MantisBT

MantisBT

Mantis is an easy-to-deploy web-based defect tracking tool designed to aid in product defect tracking. It requires PHP, MySQL and a web server. Check out our demo and hosting services.

Atom editor mac version download

Atom editor mac version download

The most popular open source editor

SublimeText3 Mac version

SublimeText3 Mac version

God-level code editing software (SublimeText3)

SublimeText3 Chinese version

SublimeText3 Chinese version

Chinese version, very easy to use