[donghua@cdh-vm source]$ cat newline.txt
id,text
1,"a
b"
2,"c"
3,"新年快乐"
[donghua@cdh-vm source]$ cat convert_csv_to_parquet.py
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
df = pd.read_csv('newline.txt')
# Convert from pandas to Arrow
table = pa.Table.from_pandas(df)
pq.write_table(table,'newline.parquet')
[donghua@cdh-vm source]$ python convert_csv_to_parquet.py
[donghua@cdh-vm source]$ parquet-tools cat -j newline.parquet
{"id":1,"text":"YQpi","__index_level_0__":0}
{"id":2,"text":"Yw==","__index_level_0__":1}
{"id":3,"text":"5paw5bm05b+r5LmQ","__index_level_0__":2}
[donghua@cdh-vm source]$ hdfs dfs -mkdir tbl_newline_parquet
[donghua@cdh-vm source]$ hdfs dfs -put newline.parquet tbl_newline_parquet/
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> -- Hive syntax
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> create external table tbl_newline_3
. . . . . . . . . . . . . . . . . . . . . . .> (id bigint, text string)
. . . . . . . . . . . . . . . . . . . . . . .> stored as parquet
. . . . . . . . . . . . . . . . . . . . . . .> location '/user/donghua/tbl_newline_parquet';
No rows affected (0.114 seconds)
0: jdbc:hive2://cdh-vm.dbaglobe.com:10000/def> select * from tbl_newline_3;
+-------------------+---------------------+--+
| tbl_newline_3.id | tbl_newline_3.text |
+-------------------+---------------------+--+
| 1 | a
b |
| 2 | c |
| 3 | 新年快乐 |
+-------------------+---------------------+--+
3 rows selected (0.132 seconds)
[cdh-vm.dbaglobe.com:21000] > -- impala syntax
> create external table tbl_newline_2
> LIKE PARQUET '/user/donghua/tbl_newline_parquet/newline.parquet'
> stored as parquet
> location '/user/donghua/tbl_newline_parquet';
[cdh-vm.dbaglobe.com:21000] > desc tbl_newline_2;
+-------------------+--------+-----------------------------+
| name | type | comment |
+-------------------+--------+-----------------------------+
| id | bigint | Inferred from Parquet file. |
| text | string | Inferred from Parquet file. |
| __index_level_0__ | bigint | Inferred from Parquet file. |
+-------------------+--------+-----------------------------+
Fetched 3 row(s) in 0.02s
[cdh-vm.dbaglobe.com:21000] > select * from tbl_newline_2;
+----+----------+-------------------+
| id | text | __index_level_0__ |
+----+----------+-------------------+
| 1 | a | 0 |
| | b | |
| 2 | c | 1 |
| 3 | 新年快乐 | 2 |
+----+----------+-------------------+
Fetched 3 row(s) in 5.25s
No comments:
Post a Comment