Create an Algolia index by crawling WebHelp documentation
How do you create a crawler that collects a documentation set's information and pushes it to an Algolia index?
In this task we'll create a basic crawler that collects each page's title, keywords, short description, and content, and pushes that information to an Algolia index.
- Prepare a Maven project.
- Add the dependencies to pom.xml. We'll need JSoup, the Algolia Java client, and a logging framework (SLF4J with Logback):
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.2</version>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-core</artifactId>
        <version>1.2.6</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.36</version>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>1.2.6</version>
    </dependency>
    <dependency>
        <groupId>com.algolia</groupId>
        <artifactId>algoliasearch-core</artifactId>
        <version>3.16.5</version>
    </dependency>
    <dependency>
        <groupId>com.algolia</groupId>
        <artifactId>algoliasearch-java-net</artifactId>
        <version>3.16.5</version>
    </dependency>
    <dependency>
        <groupId>org.json</groupId>
        <artifactId>json</artifactId>
        <version>20220320</version>
    </dependency>
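If you want to confirm that the dependencies resolve before writing the crawler, a quick smoke test can parse a small HTML fragment and log the result. The SmokeTest class below is only an illustration and is not part of the crawler:

package ro.sync.search;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Quick sanity check that the JSoup and logging dependencies resolve.
 */
public class SmokeTest {
    private static final Logger logger = LoggerFactory.getLogger(SmokeTest.class);

    public static void main(String[] args) {
        // Parse a tiny HTML fragment with JSoup and log the extracted title.
        Document doc = Jsoup.parse("<html><head><title>Hello</title></head><body></body></html>");
        logger.info("Parsed title: {}", doc.title());
    }
}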
- Create a basic Page class that Algolia will use to create JSON records.
package ro.sync.search;

import java.util.List;

import com.fasterxml.jackson.annotation.JsonProperty;

/**
 * The class that represents a page model. It contains all the data crawled
 * from a certain URL.
 *
 * @author Artiom Bozieac
 */
public class PageBase {
    /** URL from which the data was collected. */
    @JsonProperty("objectID")
    protected String url;
    /** Page's title collected from metadata. */
    protected String title;
    /** Page's short description. */
    protected String shortDescription;
    /** Page's keywords collected from metadata. */
    @JsonProperty("_tags")
    protected List<String> keywords;
    /** Page's content collected from the body section. */
    protected String content;

    /**
     * @param url is the URL from which the data should be collected.
     * @return reference to the current instance.
     */
    protected PageBase setUrl(final String url) {
        this.url = url;
        return this;
    }

    /**
     * @param title is the page's title.
     * @return reference to the current instance.
     */
    protected PageBase setTitle(final String title) {
        this.title = title;
        return this;
    }

    /**
     * @param shortDescription is the page's short description.
     * @return reference to the current instance.
     */
    protected PageBase setShortDescription(final String shortDescription) {
        this.shortDescription = shortDescription;
        return this;
    }

    /**
     * @param keywords is the page's keywords collected from metadata.
     * @return reference to the current instance.
     */
    protected PageBase setKeywords(final List<String> keywords) {
        this.keywords = keywords;
        return this;
    }

    /**
     * @param content is the page's content that represents the body.
     * @return reference to the current instance.
     */
    protected PageBase setContent(final String content) {
        this.content = content;
        return this;
    }

    /** @return URL from which the data was collected. */
    public String getUrl() {
        return this.url;
    }

    /** @return page's collected title. */
    public String getTitle() {
        return this.title;
    }

    /** @return page's short description. */
    public String getShortDescription() {
        return this.shortDescription;
    }

    /** @return page's collected keywords. */
    public List<String> getKeywords() {
        return this.keywords;
    }

    /** @return page's content collected from the body section. */
    public String getContent() {
        return this.content;
    }
}
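For reference, the snippet below prints roughly the JSON record Algolia will store for one page. It is only an illustration (the PageBaseJsonDemo class name and the sample values are made up) and assumes jackson-databind is on the classpath, which the Algolia client normally pulls in transitively:

package ro.sync.search;

import java.util.Arrays;

import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * Illustration only: prints the JSON record that would be pushed to Algolia
 * for a single PageBase instance.
 */
public class PageBaseJsonDemo {
    public static void main(String[] args) throws Exception {
        PageBase page = new PageBase()
                .setUrl("https://example.com/docs/topic.html")
                .setTitle("Sample topic")
                .setShortDescription("A short description.")
                .setKeywords(Arrays.asList("sample", "topic"))
                .setContent("Body text of the topic.");

        // The @JsonProperty annotations rename "url" to "objectID" and "keywords" to "_tags".
        System.out.println(new ObjectMapper().writeValueAsString(page));
    }
}

Thanks to the @JsonProperty annotations, url is serialized as objectID (Algolia's record identifier) and keywords as _tags.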
- Create an abstract crawler that provides the necessary methods and fields for any use case.
package ro.sync.search;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Abstract crawler class that provides the basic functionality.
 *
 * @author Bozieac Artiom
 *
 * @param <T> is the version of page to be used: PageBase, PageFaceting or
 *            PageMultipleDocumentations.
 */
public abstract class AbstractCrawler<T extends PageBase> {
    /** Logger to inform the user about certain actions such as errors. */
    protected static final Logger logger = LoggerFactory.getLogger(AbstractCrawler.class);
    /** The URL to be crawled. */
    protected String url;
    /**
     * The base URL of the URL to be crawled. It is used in order to not leave the
     * website and crawl data indefinitely. For example, if the base URL is
     * "https://google.com/search/index.html", then the crawler won't go to any
     * other site that doesn't start with "https://google.com/search".
     */
    protected String baseUrl;
    /**
     * Represents the state of the URL. If the URL has the "http://" or "https://"
     * protocol then it's a website; if "file://" then it's a file.
     */
    protected boolean isFile;
    /** Class and attribute that represent the short description in the DOM. */
    static final String SHORT_DESCRIPTION_SELECTOR = "p[class=\"- topic/shortdesc shortdesc\"]";
    /** File that lists the classes and attributes that should be ignored during collection. */
    static final String NODES_TO_IGNORE_PATH = "nodesToIgnore.csv";
    /** Selectors of elements that should be ignored during the crawling process. */
    protected final List<String> nodesToIgnore = new ArrayList<>();
    /** Stores all the visited URLs in order to not crawl them more than once. */
    protected List<String> visitedUrls = new ArrayList<>();
    /** Serves as the queue that is used to perform the BFS algorithm. */
    protected List<String> queue = new ArrayList<>();
    /** Stores all crawled pages. */
    protected List<T> pages = new ArrayList<>();

    /**
     * Constructor with url and baseUrl parameters.
     *
     * @param url     is the page that should be crawled for data.
     * @param baseUrl is the parent that is used to not go out of bounds.
     * @param isFile  is the flag that indicates whether you passed a URL to a file
     *                or to a website.
     *
     * @throws IOException if problems with initialization of the URL or accessing
     *                     the nodesToIgnore.csv file occurred.
     */
    protected AbstractCrawler(final String url, final String baseUrl, final boolean isFile) throws IOException {
        this.url = url;
        this.baseUrl = baseUrl;
        this.isFile = isFile;

        StringTokenizer tokenizer = new StringTokenizer(Files.readString(Path.of(NODES_TO_IGNORE_PATH)), ",");
        while (tokenizer.hasMoreTokens()) {
            nodesToIgnore.add(tokenizer.nextToken());
        }
    }

    /** @return list of crawled pages. */
    public List<T> getCrawledPages() {
        return this.pages;
    }

    /** @return start URL that should be crawled for data. */
    public String getUrl() {
        return this.url;
    }

    /** @return base URL that is used to not go out of the parent's bounds. */
    public String getBaseUrl() {
        return this.baseUrl;
    }

    /** @return list of visited URLs after the crawl. */
    public List<String> getVisitedUrls() {
        return this.visitedUrls;
    }

    /**
     * Using the URL given in the constructor, visits every resource that has the
     * same host and crawls its data.
     *
     * @throws IOException if a problem with reading the HTML file occurred.
     */
    public void crawl() throws IOException {
        visitedUrls.clear();
        // Add the starting URL to the queue so the crawl starts with it.
        queue.add(url);

        while (!queue.isEmpty()) {
            String currentUrl = queue.remove(0);
            Document page = readHtml(currentUrl);

            findUrls(page, currentUrl);
            if (!(currentUrl.endsWith("index.html") || currentUrl.equals(this.url)))
                collectData(page);
        }
        logger.info("The crawling went successfully! {} page(s) has/have been crawled!", getCrawledPages().size());
    }

    /**
     * Reads the HTML code from a URL.
     *
     * @param url is the page's URL whose HTML code should be extracted.
     * @return an HTML document.
     * @throws IOException when a problem with reading the HTML code occurred.
     */
    protected Document readHtml(final String url) throws IOException {
        return this.isFile ? Jsoup.parse(new File(url.substring(5)), "UTF-8") : Jsoup.connect(url).get();
    }

    /**
     * Finds appropriate URLs among all the matches and adds them to the queue.
     *
     * @param page    is the Document whose hrefs should be collected.
     * @param pageUrl is the current page's URL that is used to construct the next
     *                URLs.
     * @throws MalformedURLException when a problem with initialization of a URL
     *                               occurred.
     */
    protected void findUrls(final Document page, final String pageUrl) throws MalformedURLException {
        // Select all "a" tags.
        Elements links = page.select("a");

        // Search for ".html" hrefs.
        for (Element link : links) {
            if (link.attr("href").endsWith(".html")) {
                String currentUrl = new URL(new URL(pageUrl), link.attr("href")).toString();

                if (!visitedUrls.contains(currentUrl) && currentUrl.startsWith(this.baseUrl)
                        && !(currentUrl.endsWith("index.html") || currentUrl.equals(this.url))) {
                    visitedUrls.add(currentUrl);
                    queue.add(currentUrl);
                }
            }
        }
    }

    /**
     * Collects all the data (titles, keywords and content) from the visited URLs
     * and creates a new Page object.
     *
     * @param page is the desired document whose data should be collected.
     */
    protected abstract void collectData(final Document page);

    /**
     * Collects the title of the page.
     *
     * @param page is the desired document whose data should be collected.
     * @return page's title collected from metadata.
     */
    protected String collectTitle(final Document page) {
        return page.title();
    }

    /**
     * Collects the short description of the page.
     *
     * @param page is the desired document whose data should be collected.
     * @return short description of the page.
     */
    protected String collectShortDescription(final Document page) {
        return page.select(SHORT_DESCRIPTION_SELECTOR).text();
    }

    /**
     * Collects the keywords of the page from metadata.
     *
     * @param page is the desired document whose data should be collected.
     * @return page's keywords collected from metadata.
     */
    protected List<String> collectKeywords(final Document page) {
        Element element = page.select("meta[name=keywords]").first();

        if (element != null)
            return Arrays.asList(element.attr("content").split(","));

        return new ArrayList<>();
    }

    /**
     * Collects the content of the page from the body. The content consists of
     * texts, titles, paragraphs and others.
     *
     * @param page is the desired document whose data should be collected.
     * @return page's content collected from the body section.
     */
    protected String collectContent(final Document page) {
        // Remove from the DOM every selector listed in "nodesToIgnore.csv".
        for (String selector : this.nodesToIgnore)
            page.select(selector).remove();

        // Return the remaining text from the body.
        return page.body().text();
    }
}
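The nodesToIgnore.csv file is simply a comma-separated list of CSS selectors to strip from each page before collecting its text (the selectors used below, such as "nav", are hypothetical examples). The sketch below illustrates what collectContent() does with them; the IgnoredNodesDemo class is only for illustration:

package ro.sync.search;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Illustration only: shows how a selector read from nodesToIgnore.csv is
 * stripped from the DOM before the body text is collected.
 */
public class IgnoredNodesDemo {
    public static void main(String[] args) {
        Document page = Jsoup.parse(
                "<html><body><nav>Site menu</nav><p>Actual topic content.</p></body></html>");

        // Hypothetical selector that a nodesToIgnore.csv entry might contain.
        page.select("nav").remove();

        // Prints only "Actual topic content."
        System.out.println(page.body().text());
    }
}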
- Create a basic crawler that collects the bare minimum: title, keywords, short description, and content.
package ro.sync.search;

import java.io.IOException;
import java.util.List;

import org.jsoup.nodes.Document;

/**
 * Base class for a crawler that crawls a URL for its data.
 *
 * @author Bozieac Artiom
 */
public class BasicCrawler extends AbstractCrawler<PageBase> {
    /**
     * Constructor with url and baseUrl parameters.
     *
     * @param url     is the page that should be crawled for data.
     * @param baseUrl is the parent that is used to not go out of bounds.
     * @param isFile  is the flag that indicates whether you passed a URL to a file
     *                or to a website.
     *
     * @throws IOException if problems with initialization of the URL or accessing
     *                     the nodesToIgnore.csv file occurred.
     */
    protected BasicCrawler(String url, String baseUrl, boolean isFile) throws IOException {
        super(url, baseUrl, isFile);
    }

    /** @return list of crawled pages. */
    @Override
    public List<PageBase> getCrawledPages() {
        return this.pages;
    }

    /**
     * Collects all the data (titles, keywords and content) from the visited URLs
     * and creates a new Page object.
     *
     * @param page is the desired document whose data should be collected.
     */
    @Override
    protected void collectData(final Document page) {
        pages.add(new PageBase().setTitle(collectTitle(page)).setShortDescription(collectShortDescription(page))
                .setKeywords(collectKeywords(page)).setContent(collectContent(page)).setUrl(page.baseUri()));

        logger.info("Page {} was crawled!", page.title());
    }
}
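If you want to try the crawler on its own before involving Algolia, a minimal driver could look like the sketch below. The BasicCrawlerDemo class and the URLs are placeholders, and the crawler expects a nodesToIgnore.csv file in the working directory:

package ro.sync.search;

import java.io.IOException;

/**
 * Illustration only: runs the BasicCrawler on its own and prints what it
 * collected. The URLs are placeholders for a deployed WebHelp output.
 */
public class BasicCrawlerDemo {
    public static void main(String[] args) throws IOException {
        BasicCrawler crawler = new BasicCrawler(
                "https://example.com/webhelp/index.html", // hypothetical start URL
                "https://example.com/webhelp/",           // hypothetical base URL
                false);                                   // crawling a website, not a local file

        crawler.crawl();

        crawler.getCrawledPages()
                .forEach(page -> System.out.println(page.getTitle() + " -> " + page.getUrl()));
    }
}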
- Create a basic Algolia client that takes the basic crawler's collected data and pushes it to an Algolia index.
package ro.sync.search;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.algolia.search.DefaultSearchClient;
import com.algolia.search.SearchClient;
import com.algolia.search.SearchIndex;

/**
 * Class that handles the Algolia API calls for the basic use case, when you
 * need to collect only the title, keywords, short description and content.
 *
 * @author Artiom Bozieac
 */
public class BasicAlgolia {
    /** Logger to inform the user about certain actions such as errors. */
    private static final Logger logger = LoggerFactory.getLogger(BasicAlgolia.class);
    /** Algolia application's id. */
    protected String appId;
    /** Algolia admin key used to perform actions. */
    protected String adminApiKey;
    /** Client that performs operations such as indices management. */
    protected SearchClient client;
    /** The index that actions are currently performed on. */
    protected SearchIndex<PageBase> basicIndex;

    /**
     * Constructor that sets up all the necessary data, such as the properties for
     * the Algolia connection.
     *
     * @throws IOException if a problem with loading the config properties occurred.
     */
    public BasicAlgolia() throws IOException {
        try (InputStream input = new FileInputStream("config.properties")) {
            Properties properties = new Properties();

            // Load the properties file.
            properties.load(input);

            appId = properties.getProperty("algolia.appId");
            adminApiKey = properties.getProperty("algolia.adminApiKey");

            client = DefaultSearchClient.create(appId, adminApiKey);
        }
    }

    /**
     * Adds the pages crawled by the Crawler object to the index.
     *
     * @param url     is the URL whose pages should be added to the index.
     * @param baseUrl is the base URL that is used to not go out of bounds.
     *
     * @throws IOException if the Crawler failed to initialize or the HTML file
     *                     couldn't be read.
     */
    protected void populateIndex(final String url, final String baseUrl) throws IOException {
        BasicCrawler crawler = new BasicCrawler(url, baseUrl, false);
        crawler.crawl();

        basicIndex.saveObjects(crawler.getCrawledPages());

        logger.info("{} Page object(s) successfully added to {} index!", crawler.getCrawledPages().size(),
                basicIndex.getUrlEncodedIndexName());
    }

    /**
     * Uses the arguments to crawl the documentation and push it to the Algolia
     * index.
     *
     * @param args is the array with indexName, url and baseUrl.
     * @throws IOException              if the config.properties file is not set,
     *                                  the path to the documents is wrong or the
     *                                  profilingPath is invalid.
     * @throws IllegalArgumentException if the passed arguments are invalid.
     */
    public void useArguments(final String... args) throws IOException, IllegalArgumentException {
        String url = "";
        String baseUrl = "";
        String indexName = "";

        for (String arg : args) {
            if (arg.startsWith("-url="))
                url = arg.substring(5, arg.length());
            else if (arg.startsWith("-baseUrl="))
                baseUrl = arg.substring(9, arg.length());
            else if (arg.startsWith("-indexName="))
                indexName = arg.substring(11, arg.length());
        }

        if (url.isEmpty() || baseUrl.isEmpty() || indexName.isEmpty())
            throw new IllegalArgumentException();

        basicIndex = client.initIndex(indexName, PageBase.class);

        basicIndex.clearObjects();
        populateIndex(url, baseUrl);
    }
}
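BasicAlgolia reads its credentials from a config.properties file in the working directory, using the keys algolia.appId and algolia.adminApiKey. The sketch below simply writes a template of that file; the ConfigTemplateWriter class and the placeholder values are only illustrative:

package ro.sync.search;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Properties;

/**
 * Illustration only: writes a template config.properties with the keys that
 * BasicAlgolia expects. Replace the placeholder values with your own Algolia
 * credentials.
 */
public class ConfigTemplateWriter {
    public static void main(String[] args) throws IOException {
        Properties properties = new Properties();
        properties.setProperty("algolia.appId", "YOUR_APP_ID");
        properties.setProperty("algolia.adminApiKey", "YOUR_ADMIN_API_KEY");

        try (OutputStream output = new FileOutputStream("config.properties")) {
            properties.store(output, "Algolia credentials used by BasicAlgolia");
        }
    }
}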
With this we have created a BasicCrawler and a basic Algolia client. Most of the time we'll be using BasicAlgolia, which receives the -url, -baseUrl, and -indexName arguments that are used to crawl a website and push its data to an index, as shown below.
BasicAlgolia algolia = new BasicAlgolia();
// args = -url=URL -baseUrl=BASE_URL -indexName=INDEX_NAME
algolia.useArguments(args);
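For completeness, a hypothetical entry point that forwards the command-line arguments could look like this (the MainDriver class name and the example argument values in the comment are assumptions):

package ro.sync.search;

import java.io.IOException;

/**
 * Hypothetical entry point that forwards the command-line arguments to
 * BasicAlgolia, for example:
 * -url=https://example.com/webhelp/index.html
 * -baseUrl=https://example.com/webhelp/
 * -indexName=webhelp-docs
 */
public class MainDriver {
    public static void main(String[] args) throws IOException {
        BasicAlgolia algolia = new BasicAlgolia();
        algolia.useArguments(args);
    }
}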
