首页  >  文章  >  后端开发  >  csv2mysql

csv2mysql

PHP中文网
PHP中文网原创
2017-07-11 18:13:061524浏览
<span style="color: #0000ff">import</span><span style="color: #000000"> os
</span><span style="color: #0000ff">import</span><span style="color: #000000"> re
</span><span style="color: #0000ff">import</span><span style="color: #000000"> sys
</span><span style="color: #0000ff">import</span><span style="color: #000000"> csv
</span><span style="color: #0000ff">import</span><span style="color: #000000"> time
</span><span style="color: #0000ff">import</span><span style="color: #000000"> argparse
</span><span style="color: #0000ff">import</span><span style="color: #000000"> collections
</span><span style="color: #0000ff">import</span><span style="color: #000000"> MySQLdb
</span><span style="color: #0000ff">import</span><span style="color: #000000"> warnings 
</span><span style="color: #008000">#</span><span style="color: #008000"> suppress annoying mysql warnings</span>
warnings.filterwarnings(action=<span style="color: #800000">'</span><span style="color: #800000">ignore</span><span style="color: #800000">'</span>, category=<span style="color: #000000">MySQLdb.Warning) 



</span><span style="color: #0000ff">def</span><span style="color: #000000"> get_type(s):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Find type for this string
    </span><span style="color: #800000">"""</span>
    <span style="color: #008000">#</span><span style="color: #008000"> try integer type</span>
    <span style="color: #0000ff">try</span><span style="color: #000000">:
        v </span>=<span style="color: #000000"> int(s)
    </span><span style="color: #0000ff">except</span><span style="color: #000000"> ValueError:
        </span><span style="color: #0000ff">pass</span>
    <span style="color: #0000ff">else</span><span style="color: #000000">:
        </span><span style="color: #0000ff">if</span> abs(v) > 2147483647<span style="color: #000000">:
            </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">bigint</span><span style="color: #800000">'</span>
        <span style="color: #0000ff">else</span><span style="color: #000000">:
            </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">int</span><span style="color: #800000">'</span>
    <span style="color: #008000">#</span><span style="color: #008000"> try float type</span>
    <span style="color: #0000ff">try</span><span style="color: #000000">:
        float(s)
    </span><span style="color: #0000ff">except</span><span style="color: #000000"> ValueError:
        </span><span style="color: #0000ff">pass</span>
    <span style="color: #0000ff">else</span><span style="color: #000000">:
        </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">double</span><span style="color: #800000">'</span>

    <span style="color: #008000">#</span><span style="color: #008000"> check for timestamp</span>
    dt_formats =<span style="color: #000000"> (
        (</span><span style="color: #800000">'</span><span style="color: #800000">%Y-%m-%d %H:%M:%S</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">datetime</span><span style="color: #800000">'</span><span style="color: #000000">),
        (</span><span style="color: #800000">'</span><span style="color: #800000">%Y-%m-%d %H:%M:%S.%f</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">datetime</span><span style="color: #800000">'</span><span style="color: #000000">),
        (</span><span style="color: #800000">'</span><span style="color: #800000">%Y-%m-%d</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">date</span><span style="color: #800000">'</span><span style="color: #000000">),
        (</span><span style="color: #800000">'</span><span style="color: #800000">%H:%M:%S</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">time</span><span style="color: #800000">'</span><span style="color: #000000">),
    )
    </span><span style="color: #0000ff">for</span> dt_format, dt_type <span style="color: #0000ff">in</span><span style="color: #000000"> dt_formats:
        </span><span style="color: #0000ff">try</span><span style="color: #000000">:
            time.strptime(s, dt_format)
        </span><span style="color: #0000ff">except</span><span style="color: #000000"> ValueError:
            </span><span style="color: #0000ff">pass</span>
        <span style="color: #0000ff">else</span><span style="color: #000000">:
            </span><span style="color: #0000ff">return</span><span style="color: #000000"> dt_type
   
    </span><span style="color: #008000">#</span><span style="color: #008000"> doesn't match any other types so assume text</span>
    <span style="color: #0000ff">if</span> len(s) > 255<span style="color: #000000">:
        </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">text</span><span style="color: #800000">'</span>
    <span style="color: #0000ff">else</span><span style="color: #000000">:
        </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">varchar(255)</span><span style="color: #800000">'</span>


<span style="color: #0000ff">def</span> most_common(l, default=<span style="color: #800000">'</span><span style="color: #800000">varchar(255)</span><span style="color: #800000">'</span><span style="color: #000000">):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Return most common value from list
    </span><span style="color: #800000">"""</span>
    <span style="color: #008000">#</span><span style="color: #008000"> some formats trump others</span>
    <span style="color: #0000ff">if</span><span style="color: #000000"> l:
        </span><span style="color: #0000ff">for</span> dt_type <span style="color: #0000ff">in</span> (<span style="color: #800000">'</span><span style="color: #800000">text</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">bigint</span><span style="color: #800000">'</span><span style="color: #000000">):
            </span><span style="color: #0000ff">if</span> dt_type <span style="color: #0000ff">in</span><span style="color: #000000"> l:
                </span><span style="color: #0000ff">return</span><span style="color: #000000"> dt_type
        </span><span style="color: #0000ff">return</span> max(l, key=<span style="color: #000000">l.count)
    </span><span style="color: #0000ff">return</span><span style="color: #000000"> default


</span><span style="color: #0000ff">def</span> get_col_types(input_file, max_rows=1000<span style="color: #000000">):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Find the type for each CSV column
    </span><span style="color: #800000">"""</span><span style="color: #000000">
    csv_types </span>=<span style="color: #000000"> collections.defaultdict(list)
    reader </span>=<span style="color: #000000"> csv.reader(open(input_file))
    </span><span style="color: #008000">#</span><span style="color: #008000"> test the first few rows for their data types</span>
    <span style="color: #0000ff">for</span> row_i, row <span style="color: #0000ff">in</span><span style="color: #000000"> enumerate(reader):
        </span><span style="color: #0000ff">if</span> row_i ==<span style="color: #000000"> 0:
            header </span>=<span style="color: #000000"> row
        </span><span style="color: #0000ff">else</span><span style="color: #000000">:
            </span><span style="color: #0000ff">for</span> col_i, s <span style="color: #0000ff">in</span><span style="color: #000000"> enumerate(row):
                data_type </span>=<span style="color: #000000"> get_type(s)
                csv_types[header[col_i]].append(data_type)
 
        </span><span style="color: #0000ff">if</span> row_i ==<span style="color: #000000"> max_rows:
            </span><span style="color: #0000ff">break</span>

    <span style="color: #008000">#</span><span style="color: #008000"> take the most common data type for each row</span>
    <span style="color: #0000ff">return</span> [most_common(csv_types[col]) <span style="color: #0000ff">for</span> col <span style="color: #0000ff">in</span><span style="color: #000000"> header]


</span><span style="color: #0000ff">def</span><span style="color: #000000"> get_schema(table, header, col_types):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Generate the schema for this table from given types and columns
    </span><span style="color: #800000">"""</span><span style="color: #000000">
    schema_sql </span>= <span style="color: #800000">"""</span><span style="color: #800000">CREATE TABLE IF NOT EXISTS %s ( 
        id int NOT NULL AUTO_INCREMENT,</span><span style="color: #800000">"""</span> %<span style="color: #000000"> table 

    </span><span style="color: #0000ff">for</span> col_name, col_type <span style="color: #0000ff">in</span><span style="color: #000000"> zip(header, col_types):
        schema_sql </span>+= <span style="color: #800000">'</span><span style="color: #800000">\n%s %s,</span><span style="color: #800000">'</span> %<span style="color: #000000"> (col_name, col_type)

    schema_sql </span>+= <span style="color: #800000">"""</span><span style="color: #800000">\nPRIMARY KEY (id)
        ) DEFAULT CHARSET=utf8;</span><span style="color: #800000">"""</span>
    <span style="color: #0000ff">return</span><span style="color: #000000"> schema_sql


</span><span style="color: #0000ff">def</span><span style="color: #000000"> get_insert(table, header):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Generate the SQL for inserting rows
    </span><span style="color: #800000">"""</span><span style="color: #000000">
    field_names </span>= <span style="color: #800000">'</span><span style="color: #800000">, </span><span style="color: #800000">'</span><span style="color: #000000">.join(header)
    field_markers </span>= <span style="color: #800000">'</span><span style="color: #800000">, </span><span style="color: #800000">'</span>.join(<span style="color: #800000">'</span><span style="color: #800000">%s</span><span style="color: #800000">'</span> <span style="color: #0000ff">for</span> col <span style="color: #0000ff">in</span><span style="color: #000000"> header)
    </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">INSERT INTO %s (%s) VALUES (%s);</span><span style="color: #800000">'</span> %<span style="color: #000000"> \
        (table, field_names, field_markers)


</span><span style="color: #0000ff">def</span><span style="color: #000000"> format_header(row):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Format column names to remove illegal characters and duplicates
    </span><span style="color: #800000">"""</span><span style="color: #000000">
    safe_col </span>= <span style="color: #0000ff">lambda</span> s: re.sub(<span style="color: #800000">'</span><span style="color: #800000">\W+</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">_</span><span style="color: #800000">'</span>, s.lower()).strip(<span style="color: #800000">'</span><span style="color: #800000">_</span><span style="color: #800000">'</span><span style="color: #000000">)
    header </span>=<span style="color: #000000"> []
    counts </span>=<span style="color: #000000"> collections.defaultdict(int)
    </span><span style="color: #0000ff">for</span> col <span style="color: #0000ff">in</span><span style="color: #000000"> row:
        col </span>=<span style="color: #000000"> safe_col(col)
        counts[col] </span>+= 1
        <span style="color: #0000ff">if</span> counts[col] > 1<span style="color: #000000">:
            col </span>= <span style="color: #800000">'</span><span style="color: #800000">{}{}</span><span style="color: #800000">'</span><span style="color: #000000">.format(col, counts[col])
        header.append(col)
    </span><span style="color: #0000ff">return</span><span style="color: #000000"> header


</span><span style="color: #0000ff">def</span> main(input_file, user, password, host, table, database, max_inserts=10000<span style="color: #000000">):
    </span><span style="color: #0000ff">print</span> <span style="color: #800000">"</span><span style="color: #800000">Importing `%s' into MySQL database `%s.%s'</span><span style="color: #800000">"</span> %<span style="color: #000000"> (input_file, database, table)
    db </span>= MySQLdb.connect(host=host, user=user, passwd=password, charset=<span style="color: #800000">'</span><span style="color: #800000">utf8</span><span style="color: #800000">'</span><span style="color: #000000">)
    cursor </span>=<span style="color: #000000"> db.cursor()
    </span><span style="color: #008000">#</span><span style="color: #008000"> create database and if doesn't exist</span>
    cursor.execute(<span style="color: #800000">'</span><span style="color: #800000">CREATE DATABASE IF NOT EXISTS %s;</span><span style="color: #800000">'</span> %<span style="color: #000000"> database)
    db.select_db(database)

    </span><span style="color: #008000">#</span><span style="color: #008000"> define table</span>
    <span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Analyzing column types ...</span><span style="color: #800000">'</span><span style="color: #000000">
    col_types </span>=<span style="color: #000000"> get_col_types(input_file)
    </span><span style="color: #0000ff">print</span><span style="color: #000000"> col_types

    header </span>=<span style="color: #000000"> None
    </span><span style="color: #0000ff">for</span> i, row <span style="color: #0000ff">in</span><span style="color: #000000"> enumerate(csv.reader(open(input_file))):
        </span><span style="color: #0000ff">if</span><span style="color: #000000"> header:
            </span><span style="color: #0000ff">while</span> len(row) <<span style="color: #000000"> len(header):
                row.append(</span><span style="color: #800000">''</span>) <span style="color: #008000">#</span><span style="color: #008000"> this row is missing columns so pad blank values</span>
<span style="color: #000000">            cursor.execute(insert_sql, row)
            </span><span style="color: #0000ff">if</span> i % max_inserts ==<span style="color: #000000"> 0:
                db.commit()
                </span><span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">commit</span><span style="color: #800000">'</span>
        <span style="color: #0000ff">else</span><span style="color: #000000">:
            header </span>=<span style="color: #000000"> format_header(row)
            schema_sql </span>=<span style="color: #000000"> get_schema(table, header, col_types)
            </span><span style="color: #0000ff">print</span><span style="color: #000000"> schema_sql
            </span><span style="color: #008000">#</span><span style="color: #008000"> create table</span>
            cursor.execute(<span style="color: #800000">'</span><span style="color: #800000">DROP TABLE IF EXISTS %s;</span><span style="color: #800000">'</span> %<span style="color: #000000"> table)
            cursor.execute(schema_sql)
            </span><span style="color: #008000">#</span><span style="color: #008000"> create index for more efficient access</span>
            <span style="color: #0000ff">try</span><span style="color: #000000">:
                cursor.execute(</span><span style="color: #800000">'</span><span style="color: #800000">CREATE INDEX ids ON %s (id);</span><span style="color: #800000">'</span> %<span style="color: #000000"> table)
            </span><span style="color: #0000ff">except</span><span style="color: #000000"> MySQLdb.OperationalError:
                </span><span style="color: #0000ff">pass</span> <span style="color: #008000">#</span><span style="color: #008000"> index already exists</span>

            <span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Inserting rows ...</span><span style="color: #800000">'</span>
            <span style="color: #008000">#</span><span style="color: #008000"> SQL string for inserting data</span>
            insert_sql =<span style="color: #000000"> get_insert(table, header)

    </span><span style="color: #008000">#</span><span style="color: #008000"> commit rows to database</span>
    <span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Committing rows to database ...</span><span style="color: #800000">'</span><span style="color: #000000">
    db.commit()
    </span><span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Done!</span><span style="color: #800000">'</span>



<span style="color: #0000ff">if</span> <span style="color: #800080">__name__</span> == <span style="color: #800000">'</span><span style="color: #800000">__main__</span><span style="color: #800000">'</span><span style="color: #000000">:
    parser </span>= argparse.ArgumentParser(description=<span style="color: #800000">'</span><span style="color: #800000">Automatically insert CSV contents into MySQL</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--table</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">table</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">Set the name of the table. If not set the CSV filename will be used</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--database</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">database</span><span style="color: #800000">'</span>, default=<span style="color: #800000">'</span><span style="color: #800000">test</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">Set the name of the database. If not set the test database will be used</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--user</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">user</span><span style="color: #800000">'</span>, default=<span style="color: #800000">'</span><span style="color: #800000">root</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The MySQL login username</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--password</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">password</span><span style="color: #800000">'</span>, default=<span style="color: #800000">''</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The MySQL login password</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--host</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">host</span><span style="color: #800000">'</span>, default=<span style="color: #800000">'</span><span style="color: #800000">localhost</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The MySQL host</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">input_file</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The input CSV file</span><span style="color: #800000">'</span><span style="color: #000000">)
    args </span>= parser.parse_args(sys.argv[1<span style="color: #000000">:])
    </span><span style="color: #0000ff">if</span> <span style="color: #0000ff">not</span><span style="color: #000000"> args.table:
        </span><span style="color: #008000">#</span><span style="color: #008000"> use input file name for table</span>
        args.table =<span style="color: #000000"> os.path.splitext(os.path.basename(args.input_file))[0]
    
    main(args.input_file, args.user, args.password, args.host, args.table, args.database)</span>

具体使用例子如下

[root@server1]# python csv2mysql.py --host=172.20.197.61 --user=PdYRxGWNpVRCQfHj --password=RX5a5YsViQcDdywr --database=cf_dc61100a_92a7_43ca_81dd_2a7e3fa0808a --table=performance_history_2 performance_history_2.csv
Importing `performance_history_2.csv' into MySQL database `cf_dc61100a_92a7_43ca_81dd_2a7e3fa0808a.performance_history_2'
Analyzing column types ...
['datetime', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)']
CREATE TABLE IF NOT EXISTS performance_history_2 (
id int NOT NULL AUTO_INCREMENT,
date_time datetime,
write_bw_mb_s varchar(255),
read_bw_mb_s varchar(255),
write_iops varchar(255),
read_iops varchar(255),
write_latency_usec varchar(255),
read_latency_usec varchar(255),
avg_latency_usec varchar(255),
PRIMARY KEY (id)
) DEFAULT CHARSET=utf8;
Inserting rows ...
commit
commit
commit
commit
commit
commit
commit
commit
commit
commit
commit
Committing rows to database ...
Done!

以上是csv2mysql的详细内容。更多信息请关注PHP中文网其他相关文章!

声明:
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn