MapReduce: Inverted Index

Based on...

Description

An inverted index maps terms (words) to their locations in a set of documents. This type of index allows for fast full-text searches, making it ideal for applications such as search engines and document retrieval systems.

This example builds an inverted index for looking up words across a set of files.

Workflow:

  • Mapper: fetch the file name of each record and split the record into words.
    input:  <offset, line>
    output: <word, fileName>

  • Reducer: for each word, count how many times each file name occurs among its values.
    input:  <word, (file1, file2, file1, ...)>
    output: <word, {file1=count1, file2=count2, ...}>
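
A short trace with the sample files created below: the line "I love big data" in sample1.txt yields the mapper outputs <i, sample1.txt>, <love, sample1.txt>, <big, sample1.txt>, <data, sample1.txt>. After the shuffle, the reducer for the key "love" receives <love, (sample1.txt, sample2.txt)> and writes <love, {sample1.txt=1, sample2.txt=1}>.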

Prerequisites

  • Linux
  • JDK 11
  • Maven
  • Hadoop 3.4.1

HDFS

Explanations

Create files in HDFS

su - hadoop
start-dfs.sh
start-yarn.sh
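
If the daemons started correctly, jps should list (among others) NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager:

jps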

Create a local file:

/home/hadoop/examples/InvertedIndex/input/sample1.txt

with data:

I love big data
Finally AI

and

/home/hadoop/examples/InvertedIndex/input/sample2.txt

with data:

I love hello world
Finally AI
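
One way to create both files from the shell (any text editor works just as well):

mkdir -p /home/hadoop/examples/InvertedIndex/input
printf 'I love big data\nFinally AI\n' > /home/hadoop/examples/InvertedIndex/input/sample1.txt
printf 'I love hello world\nFinally AI\n' > /home/hadoop/examples/InvertedIndex/input/sample2.txt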

Copy the above local files to the HDFS file system.
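
One possible sequence (paths taken from the steps above):

hdfs dfs -mkdir -p /examples/InvertedIndex/input
hdfs dfs -put /home/hadoop/examples/InvertedIndex/input/sample1.txt /examples/InvertedIndex/input/
hdfs dfs -put /home/hadoop/examples/InvertedIndex/input/sample2.txt /examples/InvertedIndex/input/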

Then check that both files have been copied:

hdfs dfs -ls /examples/InvertedIndex/input/

Java Program

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>InvertedIndex</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <target.java.version>11</target.java.version>
        <maven.compiler.source>${target.java.version}</maven.compiler.source>
        <maven.compiler.target>${target.java.version}</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>3.4.1</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <!-- hadoop-common matching the cluster version provides Configuration, Tool,
             ToolRunner, Path, and Text; the old monolithic hadoop-core (1.x) should
             not be mixed with 3.x artifacts. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.4.1</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>

            <!-- Java Compiler -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>${target.java.version}</source>
                    <target>${target.java.version}</target>
                </configuration>
            </plugin>

            <!-- We use the maven-shade plugin to create a fat jar that contains all necessary dependencies. -->
            <!-- Change the value of <mainClass>...</mainClass> if your program entry point changes. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.0.0</version>
                <executions>
                    <!-- Run the shade goal in the package phase -->
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <artifactSet>
                                <excludes>
                                    <exclude>com.google.code.findbugs:jsr305</exclude>
                                    <exclude>org.slf4j:*</exclude>
                                    <exclude>org.apache.logging.log4j:*</exclude>
                                </excludes>
                            </artifactSet>
                            <filters>
                                <filter>
                                    <!-- Do not copy the signatures in the META-INF folder.
                                         Otherwise, this might cause SecurityExceptions when using the JAR. -->
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>Driver</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>

    </build>
</project>

InvertedIndex.java

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class InvertedIndex {

    public static class InvertedIndexMapper extends Mapper<Object, Text, Text, Text> {

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // input:  <offset, line>
            // output: <word, fileName>

            // split this record into words
            String[] words = value.toString().trim().split("\\s+");

            // fetch the file name of this record
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();

            for (String word : words) {
                // skip the empty token that splitting a blank line produces
                if (word.isEmpty()) {
                    continue;
                }
                context.write(new Text(word.toLowerCase()), new Text(fileName));
            }
        }
    }

    public static class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // input:  <word, (file1, file2, file1, ...)>
            // output: <word, {file1=count1, file2=count2, ...}>

            // histogram of occurrences per file name
            Map<String, Integer> hist = new HashMap<>();

            for (Text value : values) {
                String fileName = value.toString();
                hist.put(fileName, hist.getOrDefault(fileName, 0) + 1);
            }

            context.write(key, new Text(hist.toString()));
        }
    }

}

Driver.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/*
 * This driver implements the Tool interface (which calls GenericOptionsParser)
 * to help interpret common Hadoop command-line options.
 *
 * All implementations of Tool need to implement Configurable (since Tool extends it),
 * and the easiest way to do this is by subclassing Configured.
 *
 * ToolRunner is a utility for running classes that implement the Tool interface.
 * It works in conjunction with GenericOptionsParser to parse the generic Hadoop
 * command-line arguments and modify the Configuration of the Tool. Application-specific
 * options are passed along unmodified.
 *
 * Although it is unnecessary in this example, this driver format is very useful for
 * customizing configuration at run time, not compile time!
 */

public class Driver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // check the run parameters
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        // create a configuration
        Configuration conf = getConf();

        // instantiate a job
        Job job = Job.getInstance(conf, "Inverted Index");

        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(InvertedIndex.InvertedIndexMapper.class);
        job.setReducerClass(InvertedIndex.InvertedIndexReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // specify input and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // use ToolRunner to run the Tool
        int exitCode = ToolRunner.run(new Driver(), args);
        System.exit(exitCode);
    }
}
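
Because the driver runs through ToolRunner and GenericOptionsParser, generic options can adjust the job configuration at run time without recompiling. For example, -D sets an arbitrary job property; the reducer count below is only an illustration:

hadoop jar InvertedIndex-1.0-SNAPSHOT.jar -D mapreduce.job.reduces=2 /examples/InvertedIndex/input/ /examples/InvertedIndex/output/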

Build jar

mvn clean package
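
Maven writes the shaded jar to target/InvertedIndex-1.0-SNAPSHOT.jar. The run command below expects it directly under /home/hadoop/examples/InvertedIndex/, so copy it there if the build ran elsewhere:

cp target/InvertedIndex-1.0-SNAPSHOT.jar /home/hadoop/examples/InvertedIndex/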

Run jar

hadoop jar /home/hadoop/examples/InvertedIndex/InvertedIndex-1.0-SNAPSHOT.jar /examples/InvertedIndex/input/ /examples/InvertedIndex/output/
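
Note that FileOutputFormat refuses to write into an existing directory, so remove the output path before re-running the job:

hdfs dfs -rm -r /examples/InvertedIndex/output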

Verify result

hdfs dfs -ls /examples/InvertedIndex/output/

Output:

-rw-r--r-- 1 hadoop supergroup 0 2025-03-13 13:52 /examples/InvertedIndex/output/_SUCCESS
-rw-r--r-- 1 hadoop supergroup 227 2025-03-13 13:52 /examples/InvertedIndex/output/part-r-00000

Display content

hdfs dfs -cat /examples/InvertedIndex/output/part-r-00000

Output:

ai {sample2.txt=1, sample1.txt=1}
big {sample1.txt=1}
data {sample1.txt=1}
finally {sample2.txt=1, sample1.txt=1}
hello {sample2.txt=1}
i {sample2.txt=1, sample1.txt=1}
love {sample2.txt=1, sample1.txt=1}
world {sample2.txt=1}

Source of the Java program:
https://github.com/ZbCiok/zjc-examples/tree/main/streams/hadoop/InvertedIndex