1. Download or create a sample CSV
vi test.csv
id,name,age
1,Ola McGee,40
2,Callie Taylor,65
3,Jesus Kennedy,43
4,Irene Freeman,56
5,Essie Carson,40
6,Marc McCarthy,62
7,Nicholas Snyder,37
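The seven rows above are just a preview; the file used in the rest of this walkthrough holds 1,000,000 rows (about 24 MB, as the listings below show). A one-liner like the following can generate a file of that size (a sketch: the user names and the age range are invented for illustration):
# Sketch: generate a 1,000,000-row CSV like the one used below.
# The "user<i>" names and the 20-69 age range are made up for illustration.
awk 'BEGIN {
  print "id,name,age"; srand();
  for (i = 1; i <= 1000000; i++)
    printf "%d,user%d,%d\n", i, i, 20 + int(rand() * 50)
}' > test.csv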
2. Put test.csv into the HDFS directory /testing
#Create the /testing directory in HDFS
hduser@bd:/home/sachin/Downloads$ hdfs dfs -mkdir /testing
hduser@bd:/home/sachin/Downloads$ hdfs dfs -ls /
Found 6 items
drwxr-xr-x   - hduser supergroup          0 2020-07-04 03:01 /hadoop
drwxr-xr-x   - hduser supergroup          0 2020-07-04 00:24 /sample
drwxr-xr-x   - hduser supergroup          0 2020-07-07 22:38 /test
drwxr-xr-x   - hduser supergroup          0 2020-07-08 20:33 /testing
drwxrwxr-x   - hduser supergroup          0 2020-07-04 02:55 /tmp
drwxr-xr-x   - hduser supergroup          0 2020-07-04 13:31 /user
hduser@bd:/home/sachin/Downloads$ hdfs dfs -put test.csv /testing/
hduser@bd:/home/sachin/Downloads$ hdfs dfs -ls /testing/
Found 1 items
-rw-r--r--   1 hduser supergroup   24579945 2020-07-08 20:33 /testing/test.csv
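An optional sanity check on the uploaded file, using -du -h to print sizes in human-readable units:
#Optional: human-readable size of everything under /testing
hduser@bd:/home/sachin/Downloads$ hdfs dfs -du -h /testing/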
3. Create four tables in Hive, one for each file format, and load test.csv into them
Plain TEXTFILE Format
#LOG IN TO HIVE VIA BEELINE
hduser@bd:~/hiveserver2log$ beeline
beeline> !connect jdbc:hive2://localhost:10000
Connecting to jdbc:hive2://localhost:10000
Enter username for jdbc:hive2://localhost:10000:
Enter password for jdbc:hive2://localhost:10000:
Connected to: Apache Hive (version 3.1.2)
Driver: Hive JDBC (version 3.1.2)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://localhost:10000>
#CREATE TABLE plain_text
0: jdbc:hive2://localhost:10000> CREATE EXTERNAL TABLE plain_text
(
id INT,
name STRING,
age INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/testing/'
TBLPROPERTIES ('skip.header.line.count'='1');
INFO : Starting task [Stage-0:DDL] in serial mode
OK
INFO : Completed executing command(queryId=hduser_20200708204042_62691f18-d1ea-4bf9-9e77-a57bea02f91e); Time taken: 1.786 seconds
INFO : OK
INFO : Concurrency mode is disabled
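Before comparing sizes, a quick sanity check that the header row is skipped and the data is readable (a minimal check; output omitted here):
-- Sanity check: the header row should be skipped, so only data rows are counted
0: jdbc:hive2://localhost:10000> SELECT COUNT(*) FROM plain_text;
0: jdbc:hive2://localhost:10000> SELECT * FROM plain_text LIMIT 5;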
#CHECK THE TABLE SIZE
0: jdbc:hive2://localhost:10000> show tblproperties plain_text;
OK
+-------------------------+-------------+
| prpt_name | prpt_value |
+-------------------------+-------------+
| EXTERNAL | TRUE |
| bucketing_version | 2 |
| numFiles | 1 |
| skip.header.line.count | 1 |
| totalSize | 24579945 |
| transient_lastDdlTime | 1594221042 |
+-------------------------+-------------+
The table size is the same as the CSV file, 24,579,945 bytes: TEXTFILE stores the data as-is, uncompressed, and the external table simply points at the raw file.
Avro File Format
0: jdbc:hive2://localhost:10000> CREATE TABLE test_avro
(
id INT,
name STRING,
age INT
)
STORED AS AVRO;
Insert data into test_avro from the plain_text table
0: jdbc:hive2://localhost:10000> INSERT INTO TABLE test_avro SELECT * FROM plain_text;
Check the size of the table
0: jdbc:hive2://localhost:10000> show tblproperties test_avro;
+------------------------+----------------------------------------------------+
| prpt_name              | prpt_value                                         |
+------------------------+----------------------------------------------------+
| COLUMN_STATS_ACCURATE  | {"BASIC_STATS":"true","COLUMN_STATS":{"age":"true","id":"true","name":"true"}} |
| bucketing_version      | 2                                                  |
| numFiles               | 1                                                  |
| numRows                | 1000000                                            |
| rawDataSize            | 0                                                  |
| totalSize              | 20731661                                           |
| transient_lastDdlTime  | 1594225213                                         |
+------------------------+----------------------------------------------------+
The size drops to 20,731,661 bytes (~20.7 MB): Avro is a compact binary row format, so it is smaller than plain text even before any codec-level compression.
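Avro output can be shrunk further with a block codec; a sketch assuming the standard avro.output.codec setting (the exact savings depend on the data):
-- Sketch: rewrite the table as Snappy-compressed Avro; savings vary with the data
0: jdbc:hive2://localhost:10000> SET hive.exec.compress.output=true;
0: jdbc:hive2://localhost:10000> SET avro.output.codec=snappy;
0: jdbc:hive2://localhost:10000> INSERT OVERWRITE TABLE test_avro SELECT * FROM plain_text;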
Parquet File Format
As with Avro, we create the Parquet and ORC tables and insert data from the plain_text table.
0: jdbc:hive2://localhost:10000> CREATE TABLE test_parquet
(
id INT,
name STRING,
age INT
)
STORED AS PARQUET;
0: jdbc:hive2://localhost:10000> INSERT INTO TABLE test_parquet SELECT * FROM plain_text;
0: jdbc:hive2://localhost:10000> show tblproperties test_parquet;
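The Parquet table's size can also be read directly from HDFS; a sketch assuming the default warehouse path (/user/hive/warehouse), which may differ on your installation:
#Assumes the default Hive warehouse location; adjust the path for your setup
hduser@bd:~$ hdfs dfs -du -h /user/hive/warehouse/test_parquet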
ORC File Format (high compression, fast reads)
0: jdbc:hive2://localhost:10000> CREATE TABLE test_orc
(
id INT,
name STRING,
age INT
)
STORED AS ORC;
0: jdbc:hive2://localhost:10000> INSERT INTO TABLE test_orc SELECT * FROM plain_text;
Check the size of the table
0: jdbc:hive2://localhost:10000> show tblproperties test_orc;
+------------------------+----------------------------------------------------+
| prpt_name | prpt_value |
+------------------------+----------------------------------------------------+
| COLUMN_STATS_ACCURATE | {"BASIC_STATS":"true","COLUMN_STATS":{"age":"true","id":"true","name":"true"}} |
| bucketing_version | 2 |
| numFiles | 1 |
| numRows | 1000000 |
| rawDataSize | 104000000 |
| totalSize | 6149821 |
| transient_lastDdlTime | 1594225900 |
+------------------------+----------------------------------------------------+
The size drops to 6,149,821 bytes, i.e. about 6.1 MB, roughly a quarter of the original CSV: ORC's columnar layout and its default ZLIB compression give the best ratio of the four formats tested here.
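The ORC codec is also configurable per table; a sketch using the orc.compress table property with a hypothetical table name (SNAPPY trades some compression ratio for faster reads and writes; the default used above is ZLIB):
-- Sketch: hypothetical ORC table using Snappy instead of the default ZLIB
0: jdbc:hive2://localhost:10000> CREATE TABLE test_orc_snappy
(
id INT,
name STRING,
age INT
)
STORED AS ORC
TBLPROPERTIES ('orc.compress'='SNAPPY');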