Aggiunta file di configurazione Tika

This commit is contained in:
andrea.terzani
2025-03-22 20:13:26 +01:00
parent 8919d075b7
commit d6b5458e59
7 changed files with 121 additions and 88 deletions

View File

@@ -103,6 +103,8 @@
<artifactId>spring-ai-tika-document-reader</artifactId>
</dependency>
<dependency>
<groupId>org.springdoc</groupId>
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>

View File

@@ -2,24 +2,29 @@ package com.olympus.apollo.controllers;
import java.util.List;
import com.olympus.apollo.feign.services.REModuleService;
import com.olympus.dto.ResultDTO;
import com.olympus.apollo.feign.services.ParserModuleService;
import com.olympus.apollo.services.GitService;
import com.olympus.dto.CommonParseRequest;
import com.olympus.dto.ApolloParseRequestDTO;
import com.olympus.feign.JavaParserModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import com.olympus.dto.IngestionOutput;
import com.olympus.apollo.feign.services.ParserModuleService;
import com.olympus.apollo.feign.services.REModuleService;
import com.olympus.apollo.services.GitRepositoryIngestor;
import com.olympus.apollo.services.GitService;
import com.olympus.apollo.services.KSIngestor;
import org.springframework.ai.document.Document;
import com.olympus.dto.ApolloParseRequestDTO;
import com.olympus.dto.CommonParseRequest;
import com.olympus.dto.IngestionOutput;
import com.olympus.dto.ResultDTO;
import com.olympus.feign.JavaParserModule;
@RestController
@@ -52,7 +57,7 @@ public class TestController {
/**
 * Handles GET requests on {@code test/ingest_document/{id}} by delegating to {@code ksIngestor}.
 *
 * @param id identifier taken from the request path
 * @return the {@link IngestionOutput} returned by the ingestor call
 */
@GetMapping("test/ingest_document/{id}")
public IngestionOutput ingestDocumentById(@PathVariable String id) {
// NOTE(review): two consecutive return statements appear here. This looks like the removed and
// the added line of a diff rendered without +/- markers (presumably the second call is the new
// code) -- confirm against the actual file, since only one of them can be reachable.
return ksIngestor.ingestDocumentById(id);
return ksIngestor.ingestDocumentByIdAsync(id);
}
@GetMapping("test/query_vector")

View File

@@ -1,29 +1,30 @@
package com.olympus.apollo.services;
import com.olympus.dto.DeleteGitRepoDetailsRequest;
import com.olympus.dto.ResultDTO;
import com.olympus.apollo.exception.vectorStoreMetaDetailsEmptyException;
import com.olympus.apollo.repository.*;
import com.olympus.model.apollo.KSGitInfo;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Date;
import com.olympus.dto.DeletionRequest;
import org.springframework.ai.document.Document;
import org.springframework.ai.vectorstore.SearchRequest;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.messaging.simp.SimpMessagingTemplate;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.ai.vectorstore.VectorStore;
import com.olympus.apollo.exception.vectorStoreMetaDetailsEmptyException;
import com.olympus.apollo.repository.KSDocumentRepository;
import com.olympus.apollo.repository.KSGitInfoRepository;
import com.olympus.apollo.repository.KSGitIngestionInfoRepository;
import com.olympus.apollo.repository.KSIngestionInfoRepository;
import com.olympus.apollo.repository.KSTextsRepository;
import com.olympus.apollo.repository.VectorStoreRepository;
import com.olympus.dto.DeleteGitRepoDetailsRequest;
import com.olympus.dto.DeletionRequest;
import com.olympus.dto.ResultDTO;
import com.olympus.model.apollo.KSDocument;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import com.olympus.model.apollo.KSGitInfo;
@Service
public class DeletionService {
@@ -58,21 +59,6 @@ public class DeletionService {
public void deleteRecords(DeletionRequest deletionRequest) {
try {
//TODO: COMPLETE REFACTOR REQUIRED TO DELETE RECORD FROM AZURE SEARCH
// NOT WORKING AT THE MOMENT
// boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null &&
// !deletionRequest.getKsDocumentId().isEmpty() &&
// ksDocumentRepository.existsById(deletionRequest.getKsDocumentId());
// if(KSDocumentExists){
// SearchRequest searchRequest = SearchRequest.defaults()
// .withQuery("a").withTopK(1000)
// .withSimilarityThreshold(0.0)
// .withFilterExpression("KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'");
// List<Document> docs = vectorStore.similaritySearch(searchRequest);
// List<String> ids = docs.stream().map(Document::getId).toList();
// vectorStore.delete(ids);
String rag_filter = "KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'";
logger.info("Starting deletion");
vectorStore.delete(rag_filter);
@@ -92,21 +78,7 @@ public class DeletionService {
public void deleteRecordsOnlyFromVectorStore(DeletionRequest deletionRequest) {
try {
//TODO: COMPLETE REFACTOR REQUIRED TO DELETE RECORD FROM AZURE SEARCH
// NOT WORKING AT THE MOMENT
// boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null &&
// !deletionRequest.getKsDocumentId().isEmpty() &&
// ksDocumentRepository.existsById(deletionRequest.getKsDocumentId());
// if(KSDocumentExists){
// SearchRequest searchRequest = SearchRequest.defaults()
// .withQuery("a").withTopK(1000)
// .withSimilarityThreshold(0.0)
// .withFilterExpression("KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'");
// List<Document> docs = vectorStore.similaritySearch(searchRequest);
// List<String> ids = docs.stream().map(Document::getId).toList();
// vectorStore.delete(ids);
String rag_filter = "KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'";
logger.info("Starting deletion");
vectorStore.delete(rag_filter);

View File

@@ -5,7 +5,6 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import org.apache.tomcat.util.openssl.openssl_h;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
@@ -18,7 +17,6 @@ import org.springframework.security.core.context.SecurityContextHolder;
import org.springframework.stereotype.Service;
import com.olympus.apollo.repository.KSDocumentRepository;
import com.olympus.apollo.repository.ProjectRepository;
import com.olympus.apollo.security.entity.User;
import com.olympus.model.apollo.KSDocument;
@@ -31,7 +29,6 @@ public class KSDocumentService {
private KSDocumentRepository ksdocRepo;
public List<KSDocument> findByProjectNameAndApplicationName() {
logger.info("findByProjectNameAndApplicationName function:");
User principal = (User) SecurityContextHolder.getContext().getAuthentication().getPrincipal();
try {

View File

@@ -1,28 +1,33 @@
package com.olympus.apollo.services;
import java.util.*;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import com.olympus.dto.IngestionOutput;
import com.olympus.model.apollo.KSDocument;
import com.olympus.model.apollo.KSTexts;
import com.olympus.apollo.repository.KSTextsRepository;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;
import org.springframework.ai.vectorstore.SearchRequest.Builder;
import org.springframework.ai.reader.tika.TikaDocumentReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.vectorstore.SearchRequest;
import org.springframework.ai.vectorstore.SearchRequest.Builder;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import com.olympus.model.apollo.KSIngestionInfo;
import com.olympus.apollo.repository.KSDocumentRepository;
import com.olympus.apollo.repository.KSIngestionInfoRepository;
import com.olympus.apollo.repository.KSTextsRepository;
import com.olympus.dto.IngestionOutput;
import com.olympus.model.apollo.KSDocument;
import com.olympus.model.apollo.KSIngestionInfo;
import com.olympus.model.apollo.KSTexts;
@Service
@@ -93,39 +98,83 @@ public class KSIngestor {
}
}
/**
 * Looks up the document with the given id and, if its ingestion status allows it,
 * hands it over to {@code ingestDocumentAsync}.
 *
 * <p>Only documents whose ingestion status is {@code "LOADED"} or {@code "ERROR"} are
 * processed; in that case the returned output carries the status {@code "IN PROGRESS"}.
 * Otherwise only a message is set on the output (document in another state, or not found).
 *
 * @param id identifier passed to {@code ksDocumentRepository.findById}
 * @return an {@link IngestionOutput} describing whether ingestion was triggered
 */
public IngestionOutput ingestDocumentByIdAsync(String id) {
    IngestionOutput result = new IngestionOutput();
    Optional<KSDocument> lookup = ksDocumentRepository.findById(id);

    // Guard: nothing stored under this id.
    if (!lookup.isPresent()) {
        result.setMessage("OOPS: Document Not found");
        return result;
    }

    KSDocument document = lookup.get();
    String currentStatus = document.getIngestionStatus();
    boolean eligible = "LOADED".equals(currentStatus) || "ERROR".equals(currentStatus);

    // Guard: any other status is reported back without starting a new ingestion.
    if (!eligible) {
        result.setMessage("OOPS: Document is already Injected");
        return result;
    }

    result.setStatus("IN PROGRESS");
    ingestDocumentAsync(document);
    return result;
}
/**
 * Runs {@code ingestDocument} for the given document and signals completion through a future.
 *
 * <p>The future is always returned already completed (never {@code null}), so callers can
 * safely chain or join on it. The {@link IngestionOutput} produced by {@code ingestDocument}
 * is not propagated.
 *
 * <p>NOTE(review): with Spring's default proxy-based {@code @Async} support, the annotation is
 * not applied to private methods or to calls made from inside the same bean, so this method
 * presumably runs on the caller's thread when invoked from {@code ingestDocumentByIdAsync}.
 * Making it truly asynchronous would require invoking it through the Spring proxy (for example
 * from a separate bean) -- confirm the intended behaviour.
 *
 * @param ksDocument the document to ingest
 * @return a completed {@link CompletableFuture} once {@code ingestDocument} has returned
 */
@Async
private CompletableFuture<Void> ingestDocumentAsync(KSDocument ksDocument) {
    ingestDocument(ksDocument);
    // Returning null from a future-typed method would make any chaining/joining caller throw an NPE.
    return CompletableFuture.completedFuture(null);
}
private IngestionOutput ingestDocument(KSDocument ksDocument) {
IngestionOutput ingestionLoopOutput = new IngestionOutput();
try {
ksDocument.setIngestionStatus("IN PROGRESS");
ksDocumentRepository.save(ksDocument);
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
List<Document> docs = tikaDocumentReader.read();
logger.info("Ingested document: " + ksDocument.getFilePath());
logger.info("Number of documents: " + docs.size());
KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo();
List<Document> docs = null;
try {
ksDocument.setIngestionMessage("Reading document: " + ksDocument.getFilePath());
ksDocumentRepository.save(ksDocument);
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
docs = tikaDocumentReader.read();
logger.info("Ingested document: " + ksDocument.getFilePath());
logger.info("Number of documents: " + docs.size());
ksDocument.setIngestionMessage("Document read successfully");
ksDocumentRepository.save(ksDocument);
} catch (Exception e) {
logger.error("Error reading document: " + e.getMessage());
ksDocument.setIngestionStatus("ERROR");
ksDocument.setIngestionMessage("Error reading document: " + e.getMessage());
ksDocumentRepository.save(ksDocument);
ingestionLoopOutput.setStatus("ERROR");
ingestionLoopOutput.setMessage("Error reading document: " + e.getMessage());
return ingestionLoopOutput;
}
TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(),
ingestionInfo.getMinChunkSize(),
ingestionInfo.getMinChunkSizeToEmbed(),
ingestionInfo.getMaxNumberOfChunks(),
true);
ingestionInfo.getMinChunkSize(),
ingestionInfo.getMinChunkSizeToEmbed(),
ingestionInfo.getMaxNumberOfChunks(),
true);
HashMap<String, String> metadata = ingestionInfo.getMetadata();
metadata.put("KsDocumentId",ksDocument.getId());
docs.forEach(doc -> {
List<Document> splitDocs = splitter.split(doc);
logger.info("Number of documents: " + splitDocs.size());
for (Document splitDoc : splitDocs) {
splitDoc.getMetadata().putAll(metadata);
}
}
ksDocument.setIngestionMessage("Embedding documents");
ksDocumentRepository.save(ksDocument);
embedDocuments(splitDocs, ingestionInfo);
});
ksDocument.setIngestionStatus("INGESTED");
ksDocument.setIngestionDate(new Date());
ksDocument.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(new Date()));
@@ -135,12 +184,17 @@ public class KSIngestor {
ingestionLoopOutput.setStatus("OK");
ingestionLoopOutput.setMessage("OK");
}catch (Exception e){
ksDocument.setIngestionStatus("ERROR");
ksDocument.setIngestionMessage("Error ingesting document: " + e.getMessage());
ksDocumentRepository.save(ksDocument);
ingestionLoopOutput.setStatus("ERROR");
ingestionLoopOutput.setMessage(e.getMessage());
}
return ingestionLoopOutput;
}
public IngestionOutput ingestTextById(String id,String textToBeEmbed,String KsExternalDocUniqueID) {
IngestionOutput ingestionOutput= new IngestionOutput();
Optional<KSTexts> optionalDocument = ksTextsRepository.findById(id);
@@ -218,9 +272,6 @@ public class KSIngestor {
private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) {
logger.info("Embedding documents");
docs.forEach(doc -> logger.info("Document metadata: " + doc.getMetadata()));
int batchSize = embDocsBatchSize;
for (int i = 0; i < docs.size(); i += batchSize) {
int end = Math.min(i + batchSize, docs.size());
@@ -228,7 +279,7 @@ public class KSIngestor {
try {
Thread.sleep(embDocRetryTime);
vectorStore.add(currentList);
logger.info("Documents embedded - Progress: Batch from {} to {} completed", i, end);
logger.info("Documents embedded - Progress: Batch from {} to {} completed of {} total chunks", i, end, docs.size());
} catch (Exception e) {
logger.error("Error embedding documents from {} to {}: {}", i, end, e.getMessage());
}

View File

@@ -66,4 +66,6 @@ logging:
#org.springframework.web.client: DEBUG
java-re-module:
url: "http://localhost:8084"
url: "http://localhost:8084"
tika.config: "tika-config.xml"

4
tika-config.xml Normal file
View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<properties>
</properties>