Hive 4.x on Docker: Loading Data

Description

This guide walks through standing up a Hive 4.x environment with Docker Compose and loading a CSV dataset into a Hive table.

Prerequisites

  • Docker

Launching Hive

# Hive image tag used by both the metastore and hiveserver2 services below.
export HIVE_VERSION=4.0.0
# Absolute host path to the PostgreSQL JDBC driver jar; it is bind-mounted
# into the metastore container as /opt/hive/lib/postgres.jar (see compose file).
export POSTGRES_LOCAL_PATH=your_local_path_to_postgres_driver
  • docker-compose.yml

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: the indentation below is structural — YAML nesting defines the
# compose services. (The flattened version of this file does not parse.)
version: '3.9'
services:
  # PostgreSQL backing database for the Hive metastore.
  postgres:
    image: postgres
    restart: unless-stopped
    container_name: postgres
    hostname: postgres
    environment:
      POSTGRES_DB: 'metastore_db'
      POSTGRES_USER: 'hive'
      POSTGRES_PASSWORD: 'password'
    ports:
      - '5432:5432'
    volumes:
      - hive-db:/var/lib/postgresql
    networks:
      - hive

  # Hive Metastore service, storing table metadata in Postgres.
  metastore:
    image: apache/hive:${HIVE_VERSION}
    depends_on:
      - postgres
    restart: unless-stopped
    container_name: metastore
    hostname: metastore
    environment:
      DB_DRIVER: postgres
      SERVICE_NAME: 'metastore'
      # JDO connection settings pointing at the postgres service above.
      SERVICE_OPTS: '-Xmx1G -Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
                     -Djavax.jdo.option.ConnectionURL=jdbc:postgresql://postgres:5432/metastore_db
                     -Djavax.jdo.option.ConnectionUserName=hive
                     -Djavax.jdo.option.ConnectionPassword=password'
    ports:
      - '9083:9083'
    volumes:
      - warehouse:/opt/hive/data/warehouse
      # The Postgres JDBC driver jar from the host (see POSTGRES_LOCAL_PATH).
      - type: bind
        source: ${POSTGRES_LOCAL_PATH}
        target: /opt/hive/lib/postgres.jar
    networks:
      - hive

  # HiveServer2: the JDBC/Thrift endpoint that beeline connects to.
  hiveserver2:
    image: apache/hive:${HIVE_VERSION}
    depends_on:
      - metastore
    restart: unless-stopped
    container_name: hiveserver2
    environment:
      HIVE_SERVER2_THRIFT_PORT: 10000
      SERVICE_OPTS: '-Xmx1G -Dhive.metastore.uris=thrift://metastore:9083'
      IS_RESUME: 'true'
      SERVICE_NAME: 'hiveserver2'
    ports:
      - '10000:10000'   # Thrift/JDBC
      - '10002:10002'   # Web UI
    volumes:
      - warehouse:/opt/hive/data/warehouse
    networks:
      - hive

volumes:
  hive-db:
  warehouse:

networks:
  hive:
    name: hive
  • docker compose up -d

Dataset

  • customers-100.csv

  • Customer (customers-100.csv) Schema

    • Index
    • Customer Id
    • First Name
    • Last Name
    • Company
    • City
    • Country
    • Phone 1
    • Phone 2
    • Email
    • Subscription Date
    • Website

Loading Dataset

docker exec -it hiveserver2 bash

Create the directory my_examples inside the hiveserver2 container

hive@088e0373719e:/opt/hive$ hdfs dfs -mkdir -p ./my_examples

Copy customers-100.csv from the host into the hiveserver2 container


$ docker cp [YOUR PATH]/customers-100.csv hiveserver2:/opt/hive/my_examples
Successfully copied 18.9kB to hiveserver2:/opt/hive/my_examples

Launch Beeline

docker exec -it hiveserver2 beeline -u 'jdbc:hive2://hiveserver2:10000/'

Create database customers_100

-- IF NOT EXISTS makes this script idempotent: re-running it no longer
-- fails with "Database customers_100 already exists".
CREATE DATABASE IF NOT EXISTS customers_100;
SHOW DATABASES;
USE customers_100;

Create Table customers_100

To create a new Hive table, you can use the CREATE TABLE statement in HiveQL.

-- IF EXISTS avoids an error on the first run. EXTERNAL (below) means DROP
-- removes only metadata, not the CSV files under the LOCATION directory —
-- with a managed table, dropping it would delete the source CSV.
DROP TABLE IF EXISTS customers_100;

-- The CSV has 12 columns, beginning with a running "Index" column, plus a
-- header line. Declaring only 11 columns shifts every field left by one
-- (the sample output shows "1" under customer_id and drops the website
-- column), and without skip.header.line.count the header line "Index,
-- Customer Id,..." is ingested as a data row.
CREATE EXTERNAL TABLE customers_100 (
  `index` INT COMMENT 'Running row number from the CSV',  -- backticked: INDEX is a reserved word
  customer_id STRING COMMENT 'Unique ID',
  first_name STRING COMMENT 'First Name',
  last_name STRING COMMENT 'Last Name',
  company STRING COMMENT 'Company',
  city STRING COMMENT 'City',
  country STRING COMMENT 'Country',
  phone_1 STRING COMMENT 'phone_1',
  phone_2 STRING COMMENT 'phone_2',
  email STRING COMMENT 'email',
  subscription_date STRING COMMENT 'subscription_date',
  website STRING COMMENT 'Website'
)
COMMENT 'Table to store information about customers'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/opt/hive/my_examples'
TBLPROPERTIES ('skip.header.line.count'='1');  -- do not load the CSV header as a row
SHOW TABLES;

Output:

+----------------+
| tab_name |
+----------------+
| customers_100 |
+----------------+

Insert Data

    -- Hive multi-insert syntax: the FROM clause comes first, followed by one
    -- or more INSERT clauses reading from it.
    -- NOTE(review): source and target are the same table here, so this
    -- overwrites customers_100 with a copy of its own current contents; it
    -- loads no new data. Presumably included to demonstrate the syntax —
    -- confirm against the tutorial's intent before relying on it.
    FROM customers_100
INSERT OVERWRITE TABLE customers_100
SELECT *;

select

-- Fetch every row to verify the table contents.
SELECT * FROM customers_100;

Output:

INFO  : Compiling command(queryId=hive_20250524073506_204cf7a3-a179-4103-8ba5-42e047f22996): select * from customers_100
INFO : Semantic Analysis Completed (retrial = false)
INFO : Created Hive schema: Schema(fieldSchemas:[FieldSchema(name:customers_100.customer_id, type:string, comment:null), FieldSchema(name:customers_100.first_name, type:string, comment:null), FieldSchema(name:customers_100.last_name, type:string, comment:null), FieldSchema(name:customers_100.company, type:string, comment:null), FieldSchema(name:customers_100.city, type:string, comment:null), FieldSchema(name:customers_100.country, type:string, comment:null), FieldSchema(name:customers_100.phone_1, type:string, comment:null), FieldSchema(name:customers_100.phone_2, type:string, comment:null), FieldSchema(name:customers_100.email, type:string, comment:null), FieldSchema(name:customers_100.subscription_date, type:string, comment:null), FieldSchema(name:customers_100.website, type:string, comment:null)], properties:null)
INFO : Completed compiling command(queryId=hive_20250524073506_204cf7a3-a179-4103-8ba5-42e047f22996); Time taken: 0.128 seconds
INFO : Concurrency mode is disabled, not creating a lock manager
INFO : Executing command(queryId=hive_20250524073506_204cf7a3-a179-4103-8ba5-42e047f22996): select * from customers_100
INFO : Completed executing command(queryId=hive_20250524073506_204cf7a3-a179-4103-8ba5-42e047f22996); Time taken: 0.001 seconds
+----------------------------+---------------------------+--------------------------+------------------------+-----------------------+---------------------------+-----------------------------------------------+-----------------------------------------------+-------------------------+-------------------------------------+------------------------------------+
| customers_100.customer_id | customers_100.first_name | customers_100.last_name | customers_100.company | customers_100.city | customers_100.country | customers_100.phone_1 | customers_100.phone_2 | customers_100.email | customers_100.subscription_date | customers_100.website |
+----------------------------+---------------------------+--------------------------+------------------------+-----------------------+---------------------------+-----------------------------------------------+-----------------------------------------------+-------------------------+-------------------------------------+------------------------------------+
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date |
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda
...
...

Load data


-- LOAD DATA LOCAL INPATH moves the file from HiveServer2's local filesystem
-- into the table's storage location.
-- NOTE(review): the table's LOCATION ('/opt/hive/my_examples') is the same
-- directory this CSV already sits in, so the earlier SELECT already returned
-- its rows; presumably this step is shown for completeness — confirm.
LOAD DATA LOCAL INPATH '/opt/hive/my_examples/customers-100.csv' INTO TABLE customers_100;

More Examples: https://hive.apache.org/docs/latest/tutorial_27362061/#usage-and-examples