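The short tutorial below demonstrates Hive. It uses the Google NGrams dataset, which is available in HDFS at /var/ngrams. In every command, replace "your-uniqname" with your own uniqname (the placeholder is not a valid table name as written, because of the hyphen).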
# Open the interactive Hive console (run this from the shell prompt)
hive
# Create an external table over the tab-separated Google NGrams files in /var/ngrams
# (string literals must use plain ASCII single quotes, not typographic quotes)
CREATE EXTERNAL TABLE ngrams_your-uniqname (
    ngram   STRING,
    year    INT,
    count   BIGINT,
    volumes BIGINT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/var/ngrams';
# Look at the schema of the table (its column names and types)
DESCRIBE ngrams_your-uniqname;
# Count every row in the table (the result should be 1430731493)
SELECT COUNT(*)
FROM ngrams_your-uniqname;
# For each year, count the ngrams that appeared in only a single volume
SELECT
    year,
    COUNT(ngram)
FROM ngrams_your-uniqname
WHERE volumes = 1
GROUP BY year;
# Optional: delete your ngrams table (it is EXTERNAL, so only the table definition is removed; the files in /var/ngrams stay in place)
DROP TABLE ngrams_your-uniqname;
# Leave the Hive console when you are done
QUIT;