This article assumes you have already set up an Elasticsearch, Logstash, and Kibana (ELK) stack and generated API keys.
I have a library of hundreds of eBook PDFs. One problem I kept running into was quickly finding a search term across the entire library: Adobe Reader can search within a single PDF, but there's no easy way to search many documents, or an entire library, at once. To solve this, I used Elasticsearch, NEST (the official Elasticsearch .NET client), and a .NET Core console application to build a full-text index of my PDF library. Here's how you can do the same:
1. Create a new .NET Core console application in either Visual Studio or VS Code.
2. Add the following NuGet package reference to your project file:
<PackageReference Include="NEST" Version="7.17.5" />
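Alternatively, both steps can be done from the command line with the dotnet CLI (the project name ElasticSearch is just a suggestion that matches the namespace used in the code below):

dotnet new console -n ElasticSearch
cd ElasticSearch
dotnet add package NEST --version 7.17.5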
3. Create a new class file called Document.cs with the following code:
using Nest;

namespace ElasticSearch
{
    public class Document
    {
        public int Id { get; set; }

        // Full path of the PDF on disk, analyzed with the path-hierarchy analyzer defined in step 4
        public string Path { get; set; }

        // Temporarily holds the base64-encoded file; the ingest pipeline removes it after extraction
        public string Content { get; set; }

        // Populated by the attachment processor with the extracted text and metadata
        public Attachment Attachment { get; set; }
    }
}
4. Replace the contents of Program.cs with the following code:
using System;
using System.IO;
using Elasticsearch.Net;
using Nest;

namespace ElasticSearch
{
    internal class Program
    {
        static void Main(string[] args)
        {
            // Replace the placeholders below with your cluster details
            string esUri = "{your Elasticsearch URI}";
            string index = "library";
            string id = "{your API id}";
            string key = "{your API key}";

            ConnectionSettings settings = new ConnectionSettings(new Uri(esUri));
            settings.ApiKeyAuthentication(id, key);
            settings.DefaultIndex(index);
            ElasticClient client = new ElasticClient(settings);

            // Start from a clean slate: drop the index if it already exists
            if (client.Indices.Exists(index).Exists)
            {
                var deleteIndexResponse = client.Indices.Delete(index);
            }

            // Create the index with a custom path-hierarchy analyzer so Windows file
            // paths can be searched by directory segment, and raise the highlighting
            // limit (default 1,000,000 characters) so large extracted documents can
            // still be highlighted
            var indexResponse = client.Indices.Create(index, c => c
                .Settings(s => s
                    .Setting("index.highlight.max_analyzed_offset", 10000000)
                    .NumberOfReplicas(0)
                    .Analysis(a => a
                        .Analyzers(ad => ad
                            .Custom("windows_path_hierarchy_analyzer", ca => ca
                                .Tokenizer("windows_path_hierarchy_tokenizer")))
                        .Tokenizers(t => t
                            .PathHierarchy("windows_path_hierarchy_tokenizer", ph => ph
                                .Delimiter('\\')))))
                .Map<Document>(mp => mp
                    .AutoMap()
                    .Properties(ps => ps
                        .Text(s => s
                            .Name(n => n.Path)
                            .Analyzer("windows_path_hierarchy_analyzer"))
                        .Object<Attachment>(a => a
                            .Name(n => n.Attachment)
                            .AutoMap()))));

            // Define an ingest pipeline that extracts the text of the base64-encoded
            // PDF into the Attachment field, then removes the raw Content field.
            // Note: the attachment processor requires the ingest-attachment plugin
            // to be installed on the Elasticsearch node.
            var putPipelineResponse = client.Ingest.PutPipeline("attachments", p => p
                .Description("Document attachment pipeline")
                .Processors(pr => pr
                    .Attachment<Document>(a => a
                        .Field(f => f.Content)
                        .IndexedCharacters(-1) // -1 = extract the whole document, not just the default first 100,000 characters
                        .TargetField(f => f.Attachment))
                    .Remove<Document>(r => r
                        .Field(ff => ff
                            .Field(f => f.Content)))));

            // Index every PDF in the library directory through the pipeline
            string directory = "{your PDF files location}";
            var files = Directory.EnumerateFiles(directory, "*.pdf");

            int x = 0;
            foreach (var file in files)
            {
                var base64File = Convert.ToBase64String(File.ReadAllBytes(file));
                var response = client.Index(new Document
                {
                    Id = x++,
                    Path = file,
                    Content = base64File
                }, i => i
                    .Pipeline("attachments")
                    .Refresh(Refresh.WaitFor));

                if (response.IsValid)
                {
                    Console.WriteLine($"Added document: {file}");
                }
                else
                {
                    Console.WriteLine($"Failed to index {file}: {response.DebugInformation}");
                }
            }
        }
    }
}
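After the program runs, each PDF is stored with its extracted text and metadata under the attachment field (the raw Content field has been removed by the pipeline). A stored document looks roughly like this; the values shown are illustrative only:

{
  "id": 0,
  "path": "C:\\eBooks\\example.pdf",
  "attachment": {
    "content": "…the extracted text of the PDF…",
    "content_type": "application/pdf",
    "language": "en",
    "content_length": 123456
  }
}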
5. Using Kibana, run a search against the index.
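For example, a match query against the extracted text, with highlighting enabled to show the matching passages, might look like this in Kibana's Dev Tools console ("your search term" is a placeholder):

GET library/_search
{
  "query": {
    "match": {
      "attachment.content": "your search term"
    }
  },
  "highlight": {
    "fields": {
      "attachment.content": {}
    }
  }
}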