Home  >  Article  >  Backend Development  >  csv2mysql

csv2mysql

PHP中文网
PHP中文网Original
2017-07-11 18:13:061539browse
<span style="color: #0000ff">import</span><span style="color: #000000"> os
</span><span style="color: #0000ff">import</span><span style="color: #000000"> re
</span><span style="color: #0000ff">import</span><span style="color: #000000"> sys
</span><span style="color: #0000ff">import</span><span style="color: #000000"> csv
</span><span style="color: #0000ff">import</span><span style="color: #000000"> time
</span><span style="color: #0000ff">import</span><span style="color: #000000"> argparse
</span><span style="color: #0000ff">import</span><span style="color: #000000"> collections
</span><span style="color: #0000ff">import</span><span style="color: #000000"> MySQLdb
</span><span style="color: #0000ff">import</span><span style="color: #000000"> warnings 
</span><span style="color: #008000">#</span><span style="color: #008000"> suppress annoying mysql warnings</span>
warnings.filterwarnings(action=<span style="color: #800000">'</span><span style="color: #800000">ignore</span><span style="color: #800000">'</span>, category=<span style="color: #000000">MySQLdb.Warning) 



</span><span style="color: #0000ff">def</span><span style="color: #000000"> get_type(s):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Find type for this string
    </span><span style="color: #800000">"""</span>
    <span style="color: #008000">#</span><span style="color: #008000"> try integer type</span>
    <span style="color: #0000ff">try</span><span style="color: #000000">:
        v </span>=<span style="color: #000000"> int(s)
    </span><span style="color: #0000ff">except</span><span style="color: #000000"> ValueError:
        </span><span style="color: #0000ff">pass</span>
    <span style="color: #0000ff">else</span><span style="color: #000000">:
        </span><span style="color: #0000ff">if</span> abs(v) > 2147483647<span style="color: #000000">:
            </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">bigint</span><span style="color: #800000">'</span>
        <span style="color: #0000ff">else</span><span style="color: #000000">:
            </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">int</span><span style="color: #800000">'</span>
    <span style="color: #008000">#</span><span style="color: #008000"> try float type</span>
    <span style="color: #0000ff">try</span><span style="color: #000000">:
        float(s)
    </span><span style="color: #0000ff">except</span><span style="color: #000000"> ValueError:
        </span><span style="color: #0000ff">pass</span>
    <span style="color: #0000ff">else</span><span style="color: #000000">:
        </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">double</span><span style="color: #800000">'</span>

    <span style="color: #008000">#</span><span style="color: #008000"> check for timestamp</span>
    dt_formats =<span style="color: #000000"> (
        (</span><span style="color: #800000">'</span><span style="color: #800000">%Y-%m-%d %H:%M:%S</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">datetime</span><span style="color: #800000">'</span><span style="color: #000000">),
        (</span><span style="color: #800000">'</span><span style="color: #800000">%Y-%m-%d %H:%M:%S.%f</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">datetime</span><span style="color: #800000">'</span><span style="color: #000000">),
        (</span><span style="color: #800000">'</span><span style="color: #800000">%Y-%m-%d</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">date</span><span style="color: #800000">'</span><span style="color: #000000">),
        (</span><span style="color: #800000">'</span><span style="color: #800000">%H:%M:%S</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">time</span><span style="color: #800000">'</span><span style="color: #000000">),
    )
    </span><span style="color: #0000ff">for</span> dt_format, dt_type <span style="color: #0000ff">in</span><span style="color: #000000"> dt_formats:
        </span><span style="color: #0000ff">try</span><span style="color: #000000">:
            time.strptime(s, dt_format)
        </span><span style="color: #0000ff">except</span><span style="color: #000000"> ValueError:
            </span><span style="color: #0000ff">pass</span>
        <span style="color: #0000ff">else</span><span style="color: #000000">:
            </span><span style="color: #0000ff">return</span><span style="color: #000000"> dt_type
   
    </span><span style="color: #008000">#</span><span style="color: #008000"> doesn't match any other types so assume text</span>
    <span style="color: #0000ff">if</span> len(s) > 255<span style="color: #000000">:
        </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">text</span><span style="color: #800000">'</span>
    <span style="color: #0000ff">else</span><span style="color: #000000">:
        </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">varchar(255)</span><span style="color: #800000">'</span>


<span style="color: #0000ff">def</span> most_common(l, default=<span style="color: #800000">'</span><span style="color: #800000">varchar(255)</span><span style="color: #800000">'</span><span style="color: #000000">):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Return most common value from list
    </span><span style="color: #800000">"""</span>
    <span style="color: #008000">#</span><span style="color: #008000"> some formats trump others</span>
    <span style="color: #0000ff">if</span><span style="color: #000000"> l:
        </span><span style="color: #0000ff">for</span> dt_type <span style="color: #0000ff">in</span> (<span style="color: #800000">'</span><span style="color: #800000">text</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">bigint</span><span style="color: #800000">'</span><span style="color: #000000">):
            </span><span style="color: #0000ff">if</span> dt_type <span style="color: #0000ff">in</span><span style="color: #000000"> l:
                </span><span style="color: #0000ff">return</span><span style="color: #000000"> dt_type
        </span><span style="color: #0000ff">return</span> max(l, key=<span style="color: #000000">l.count)
    </span><span style="color: #0000ff">return</span><span style="color: #000000"> default


</span><span style="color: #0000ff">def</span> get_col_types(input_file, max_rows=1000<span style="color: #000000">):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Find the type for each CSV column
    </span><span style="color: #800000">"""</span><span style="color: #000000">
    csv_types </span>=<span style="color: #000000"> collections.defaultdict(list)
    reader </span>=<span style="color: #000000"> csv.reader(open(input_file))
    </span><span style="color: #008000">#</span><span style="color: #008000"> test the first few rows for their data types</span>
    <span style="color: #0000ff">for</span> row_i, row <span style="color: #0000ff">in</span><span style="color: #000000"> enumerate(reader):
        </span><span style="color: #0000ff">if</span> row_i ==<span style="color: #000000"> 0:
            header </span>=<span style="color: #000000"> row
        </span><span style="color: #0000ff">else</span><span style="color: #000000">:
            </span><span style="color: #0000ff">for</span> col_i, s <span style="color: #0000ff">in</span><span style="color: #000000"> enumerate(row):
                data_type </span>=<span style="color: #000000"> get_type(s)
                csv_types[header[col_i]].append(data_type)
 
        </span><span style="color: #0000ff">if</span> row_i ==<span style="color: #000000"> max_rows:
            </span><span style="color: #0000ff">break</span>

    <span style="color: #008000">#</span><span style="color: #008000"> take the most common data type for each row</span>
    <span style="color: #0000ff">return</span> [most_common(csv_types[col]) <span style="color: #0000ff">for</span> col <span style="color: #0000ff">in</span><span style="color: #000000"> header]


</span><span style="color: #0000ff">def</span><span style="color: #000000"> get_schema(table, header, col_types):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Generate the schema for this table from given types and columns
    </span><span style="color: #800000">"""</span><span style="color: #000000">
    schema_sql </span>= <span style="color: #800000">"""</span><span style="color: #800000">CREATE TABLE IF NOT EXISTS %s ( 
        id int NOT NULL AUTO_INCREMENT,</span><span style="color: #800000">"""</span> %<span style="color: #000000"> table 

    </span><span style="color: #0000ff">for</span> col_name, col_type <span style="color: #0000ff">in</span><span style="color: #000000"> zip(header, col_types):
        schema_sql </span>+= <span style="color: #800000">'</span><span style="color: #800000">\n%s %s,</span><span style="color: #800000">'</span> %<span style="color: #000000"> (col_name, col_type)

    schema_sql </span>+= <span style="color: #800000">"""</span><span style="color: #800000">\nPRIMARY KEY (id)
        ) DEFAULT CHARSET=utf8;</span><span style="color: #800000">"""</span>
    <span style="color: #0000ff">return</span><span style="color: #000000"> schema_sql


</span><span style="color: #0000ff">def</span><span style="color: #000000"> get_insert(table, header):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Generate the SQL for inserting rows
    </span><span style="color: #800000">"""</span><span style="color: #000000">
    field_names </span>= <span style="color: #800000">'</span><span style="color: #800000">, </span><span style="color: #800000">'</span><span style="color: #000000">.join(header)
    field_markers </span>= <span style="color: #800000">'</span><span style="color: #800000">, </span><span style="color: #800000">'</span>.join(<span style="color: #800000">'</span><span style="color: #800000">%s</span><span style="color: #800000">'</span> <span style="color: #0000ff">for</span> col <span style="color: #0000ff">in</span><span style="color: #000000"> header)
    </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">INSERT INTO %s (%s) VALUES (%s);</span><span style="color: #800000">'</span> %<span style="color: #000000"> \
        (table, field_names, field_markers)


</span><span style="color: #0000ff">def</span><span style="color: #000000"> format_header(row):
    </span><span style="color: #800000">"""</span><span style="color: #800000">Format column names to remove illegal characters and duplicates
    </span><span style="color: #800000">"""</span><span style="color: #000000">
    safe_col </span>= <span style="color: #0000ff">lambda</span> s: re.sub(<span style="color: #800000">'</span><span style="color: #800000">\W+</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">_</span><span style="color: #800000">'</span>, s.lower()).strip(<span style="color: #800000">'</span><span style="color: #800000">_</span><span style="color: #800000">'</span><span style="color: #000000">)
    header </span>=<span style="color: #000000"> []
    counts </span>=<span style="color: #000000"> collections.defaultdict(int)
    </span><span style="color: #0000ff">for</span> col <span style="color: #0000ff">in</span><span style="color: #000000"> row:
        col </span>=<span style="color: #000000"> safe_col(col)
        counts[col] </span>+= 1
        <span style="color: #0000ff">if</span> counts[col] > 1<span style="color: #000000">:
            col </span>= <span style="color: #800000">'</span><span style="color: #800000">{}{}</span><span style="color: #800000">'</span><span style="color: #000000">.format(col, counts[col])
        header.append(col)
    </span><span style="color: #0000ff">return</span><span style="color: #000000"> header


</span><span style="color: #0000ff">def</span> main(input_file, user, password, host, table, database, max_inserts=10000<span style="color: #000000">):
    </span><span style="color: #0000ff">print</span> <span style="color: #800000">"</span><span style="color: #800000">Importing `%s' into MySQL database `%s.%s'</span><span style="color: #800000">"</span> %<span style="color: #000000"> (input_file, database, table)
    db </span>= MySQLdb.connect(host=host, user=user, passwd=password, charset=<span style="color: #800000">'</span><span style="color: #800000">utf8</span><span style="color: #800000">'</span><span style="color: #000000">)
    cursor </span>=<span style="color: #000000"> db.cursor()
    </span><span style="color: #008000">#</span><span style="color: #008000"> create database and if doesn't exist</span>
    cursor.execute(<span style="color: #800000">'</span><span style="color: #800000">CREATE DATABASE IF NOT EXISTS %s;</span><span style="color: #800000">'</span> %<span style="color: #000000"> database)
    db.select_db(database)

    </span><span style="color: #008000">#</span><span style="color: #008000"> define table</span>
    <span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Analyzing column types ...</span><span style="color: #800000">'</span><span style="color: #000000">
    col_types </span>=<span style="color: #000000"> get_col_types(input_file)
    </span><span style="color: #0000ff">print</span><span style="color: #000000"> col_types

    header </span>=<span style="color: #000000"> None
    </span><span style="color: #0000ff">for</span> i, row <span style="color: #0000ff">in</span><span style="color: #000000"> enumerate(csv.reader(open(input_file))):
        </span><span style="color: #0000ff">if</span><span style="color: #000000"> header:
            </span><span style="color: #0000ff">while</span> len(row) <<span style="color: #000000"> len(header):
                row.append(</span><span style="color: #800000">''</span>) <span style="color: #008000">#</span><span style="color: #008000"> this row is missing columns so pad blank values</span>
<span style="color: #000000">            cursor.execute(insert_sql, row)
            </span><span style="color: #0000ff">if</span> i % max_inserts ==<span style="color: #000000"> 0:
                db.commit()
                </span><span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">commit</span><span style="color: #800000">'</span>
        <span style="color: #0000ff">else</span><span style="color: #000000">:
            header </span>=<span style="color: #000000"> format_header(row)
            schema_sql </span>=<span style="color: #000000"> get_schema(table, header, col_types)
            </span><span style="color: #0000ff">print</span><span style="color: #000000"> schema_sql
            </span><span style="color: #008000">#</span><span style="color: #008000"> create table</span>
            cursor.execute(<span style="color: #800000">'</span><span style="color: #800000">DROP TABLE IF EXISTS %s;</span><span style="color: #800000">'</span> %<span style="color: #000000"> table)
            cursor.execute(schema_sql)
            </span><span style="color: #008000">#</span><span style="color: #008000"> create index for more efficient access</span>
            <span style="color: #0000ff">try</span><span style="color: #000000">:
                cursor.execute(</span><span style="color: #800000">'</span><span style="color: #800000">CREATE INDEX ids ON %s (id);</span><span style="color: #800000">'</span> %<span style="color: #000000"> table)
            </span><span style="color: #0000ff">except</span><span style="color: #000000"> MySQLdb.OperationalError:
                </span><span style="color: #0000ff">pass</span> <span style="color: #008000">#</span><span style="color: #008000"> index already exists</span>

            <span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Inserting rows ...</span><span style="color: #800000">'</span>
            <span style="color: #008000">#</span><span style="color: #008000"> SQL string for inserting data</span>
            insert_sql =<span style="color: #000000"> get_insert(table, header)

    </span><span style="color: #008000">#</span><span style="color: #008000"> commit rows to database</span>
    <span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Committing rows to database ...</span><span style="color: #800000">'</span><span style="color: #000000">
    db.commit()
    </span><span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Done!</span><span style="color: #800000">'</span>



<span style="color: #0000ff">if</span> <span style="color: #800080">__name__</span> == <span style="color: #800000">'</span><span style="color: #800000">__main__</span><span style="color: #800000">'</span><span style="color: #000000">:
    parser </span>= argparse.ArgumentParser(description=<span style="color: #800000">'</span><span style="color: #800000">Automatically insert CSV contents into MySQL</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--table</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">table</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">Set the name of the table. If not set the CSV filename will be used</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--database</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">database</span><span style="color: #800000">'</span>, default=<span style="color: #800000">'</span><span style="color: #800000">test</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">Set the name of the database. If not set the test database will be used</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--user</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">user</span><span style="color: #800000">'</span>, default=<span style="color: #800000">'</span><span style="color: #800000">root</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The MySQL login username</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--password</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">password</span><span style="color: #800000">'</span>, default=<span style="color: #800000">''</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The MySQL login password</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--host</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">host</span><span style="color: #800000">'</span>, default=<span style="color: #800000">'</span><span style="color: #800000">localhost</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The MySQL host</span><span style="color: #800000">'</span><span style="color: #000000">)
    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">input_file</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The input CSV file</span><span style="color: #800000">'</span><span style="color: #000000">)
    args </span>= parser.parse_args(sys.argv[1<span style="color: #000000">:])
    </span><span style="color: #0000ff">if</span> <span style="color: #0000ff">not</span><span style="color: #000000"> args.table:
        </span><span style="color: #008000">#</span><span style="color: #008000"> use input file name for table</span>
        args.table =<span style="color: #000000"> os.path.splitext(os.path.basename(args.input_file))[0]
    
    main(args.input_file, args.user, args.password, args.host, args.table, args.database)</span>

具体使用例子如下

[root@server1]# python csv2mysql.py --host=172.20.197.61 --user=PdYRxGWNpVRCQfHj --password=RX5a5YsViQcDdywr --database=cf_dc61100a_92a7_43ca_81dd_2a7e3fa0808a --table=performance_history_2 performance_history_2.csv
Importing `performance_history_2.csv' into MySQL database `cf_dc61100a_92a7_43ca_81dd_2a7e3fa0808a.performance_history_2'
Analyzing column types ...
['datetime', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)']
CREATE TABLE IF NOT EXISTS performance_history_2 (
id int NOT NULL AUTO_INCREMENT,
date_time datetime,
write_bw_mb_s varchar(255),
read_bw_mb_s varchar(255),
write_iops varchar(255),
read_iops varchar(255),
write_latency_usec varchar(255),
read_latency_usec varchar(255),
avg_latency_usec varchar(255),
PRIMARY KEY (id)
) DEFAULT CHARSET=utf8;
Inserting rows ...
commit
commit
commit
commit
commit
commit
commit
commit
commit
commit
commit
Committing rows to database ...
Done!

The above is the detailed content of csv2mysql. For more information, please follow other related articles on the PHP Chinese website!

Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn