Aggiunta file di configurazione Tika
This commit is contained in:
2
pom.xml
2
pom.xml
@@ -103,6 +103,8 @@
|
||||
<artifactId>spring-ai-tika-document-reader</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.springdoc</groupId>
|
||||
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>
|
||||
|
||||
@@ -2,24 +2,29 @@ package com.olympus.apollo.controllers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.olympus.apollo.feign.services.REModuleService;
|
||||
import com.olympus.dto.ResultDTO;
|
||||
import com.olympus.apollo.feign.services.ParserModuleService;
|
||||
import com.olympus.apollo.services.GitService;
|
||||
import com.olympus.dto.CommonParseRequest;
|
||||
import com.olympus.dto.ApolloParseRequestDTO;
|
||||
import com.olympus.feign.JavaParserModule;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.PathVariable;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestBody;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
|
||||
import com.olympus.dto.IngestionOutput;
|
||||
import com.olympus.apollo.feign.services.ParserModuleService;
|
||||
import com.olympus.apollo.feign.services.REModuleService;
|
||||
import com.olympus.apollo.services.GitRepositoryIngestor;
|
||||
import com.olympus.apollo.services.GitService;
|
||||
import com.olympus.apollo.services.KSIngestor;
|
||||
import org.springframework.ai.document.Document;
|
||||
import com.olympus.dto.ApolloParseRequestDTO;
|
||||
import com.olympus.dto.CommonParseRequest;
|
||||
import com.olympus.dto.IngestionOutput;
|
||||
import com.olympus.dto.ResultDTO;
|
||||
import com.olympus.feign.JavaParserModule;
|
||||
|
||||
|
||||
@RestController
|
||||
@@ -52,7 +57,7 @@ public class TestController {
|
||||
|
||||
@GetMapping("test/ingest_document/{id}")
|
||||
public IngestionOutput ingestDocumentById(@PathVariable String id) {
|
||||
return ksIngestor.ingestDocumentById(id);
|
||||
return ksIngestor.ingestDocumentByIdAsync(id);
|
||||
}
|
||||
|
||||
@GetMapping("test/query_vector")
|
||||
|
||||
@@ -1,29 +1,30 @@
|
||||
package com.olympus.apollo.services;
|
||||
|
||||
import com.olympus.dto.DeleteGitRepoDetailsRequest;
|
||||
import com.olympus.dto.ResultDTO;
|
||||
import com.olympus.apollo.exception.vectorStoreMetaDetailsEmptyException;
|
||||
import com.olympus.apollo.repository.*;
|
||||
import com.olympus.model.apollo.KSGitInfo;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import java.util.Date;
|
||||
|
||||
import com.olympus.dto.DeletionRequest;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.vectorstore.SearchRequest;
|
||||
import org.springframework.ai.vectorstore.VectorStore;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.messaging.simp.SimpMessagingTemplate;
|
||||
import org.springframework.scheduling.annotation.Async;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.ai.vectorstore.VectorStore;
|
||||
|
||||
import com.olympus.apollo.exception.vectorStoreMetaDetailsEmptyException;
|
||||
import com.olympus.apollo.repository.KSDocumentRepository;
|
||||
import com.olympus.apollo.repository.KSGitInfoRepository;
|
||||
import com.olympus.apollo.repository.KSGitIngestionInfoRepository;
|
||||
import com.olympus.apollo.repository.KSIngestionInfoRepository;
|
||||
import com.olympus.apollo.repository.KSTextsRepository;
|
||||
import com.olympus.apollo.repository.VectorStoreRepository;
|
||||
import com.olympus.dto.DeleteGitRepoDetailsRequest;
|
||||
import com.olympus.dto.DeletionRequest;
|
||||
import com.olympus.dto.ResultDTO;
|
||||
import com.olympus.model.apollo.KSDocument;
|
||||
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import com.olympus.model.apollo.KSGitInfo;
|
||||
|
||||
@Service
|
||||
public class DeletionService {
|
||||
@@ -58,21 +59,6 @@ public class DeletionService {
|
||||
public void deleteRecords(DeletionRequest deletionRequest) {
|
||||
try {
|
||||
|
||||
//TODO: COMPLETE REFACTOR REQUIRED TO DELETE RECORD FROM AZURE SEARCH
|
||||
// NOT WORKING AT THE MOMENT
|
||||
// boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null &&
|
||||
// !deletionRequest.getKsDocumentId().isEmpty() &&
|
||||
// ksDocumentRepository.existsById(deletionRequest.getKsDocumentId());
|
||||
// if(KSDocumentExists){
|
||||
// SearchRequest searchRequest = SearchRequest.defaults()
|
||||
// .withQuery("a").withTopK(1000)
|
||||
// .withSimilarityThreshold(0.0)
|
||||
// .withFilterExpression("KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'");
|
||||
|
||||
|
||||
// List<Document> docs = vectorStore.similaritySearch(searchRequest);
|
||||
// List<String> ids = docs.stream().map(Document::getId).toList();
|
||||
// vectorStore.delete(ids);
|
||||
String rag_filter = "KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'";
|
||||
logger.info("Starting deletion");
|
||||
vectorStore.delete(rag_filter);
|
||||
@@ -92,21 +78,7 @@ public class DeletionService {
|
||||
public void deleteRecordsOnlyFromVectorStore(DeletionRequest deletionRequest) {
|
||||
try {
|
||||
|
||||
//TODO: COMPLETE REFACTOR REQUIRED TO DELETE RECORD FROM AZURE SEARCH
|
||||
// NOT WORKING AT THE MOMENT
|
||||
// boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null &&
|
||||
// !deletionRequest.getKsDocumentId().isEmpty() &&
|
||||
// ksDocumentRepository.existsById(deletionRequest.getKsDocumentId());
|
||||
// if(KSDocumentExists){
|
||||
// SearchRequest searchRequest = SearchRequest.defaults()
|
||||
// .withQuery("a").withTopK(1000)
|
||||
// .withSimilarityThreshold(0.0)
|
||||
// .withFilterExpression("KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'");
|
||||
|
||||
|
||||
// List<Document> docs = vectorStore.similaritySearch(searchRequest);
|
||||
// List<String> ids = docs.stream().map(Document::getId).toList();
|
||||
// vectorStore.delete(ids);
|
||||
String rag_filter = "KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'";
|
||||
logger.info("Starting deletion");
|
||||
vectorStore.delete(rag_filter);
|
||||
|
||||
@@ -5,7 +5,6 @@ import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.tomcat.util.openssl.openssl_h;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
@@ -18,7 +17,6 @@ import org.springframework.security.core.context.SecurityContextHolder;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.olympus.apollo.repository.KSDocumentRepository;
|
||||
import com.olympus.apollo.repository.ProjectRepository;
|
||||
import com.olympus.apollo.security.entity.User;
|
||||
import com.olympus.model.apollo.KSDocument;
|
||||
|
||||
@@ -31,7 +29,6 @@ public class KSDocumentService {
|
||||
private KSDocumentRepository ksdocRepo;
|
||||
|
||||
public List<KSDocument> findByProjectNameAndApplicationName() {
|
||||
logger.info("findByProjectNameAndApplicationName function:");
|
||||
User principal = (User) SecurityContextHolder.getContext().getAuthentication().getPrincipal();
|
||||
|
||||
try {
|
||||
|
||||
@@ -1,28 +1,33 @@
|
||||
package com.olympus.apollo.services;
|
||||
|
||||
import java.util.*;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import com.olympus.dto.IngestionOutput;
|
||||
import com.olympus.model.apollo.KSDocument;
|
||||
import com.olympus.model.apollo.KSTexts;
|
||||
import com.olympus.apollo.repository.KSTextsRepository;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.vectorstore.SearchRequest.Builder;
|
||||
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
||||
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
||||
import org.springframework.ai.vectorstore.SearchRequest;
|
||||
import org.springframework.ai.vectorstore.SearchRequest.Builder;
|
||||
import org.springframework.ai.vectorstore.VectorStore;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.core.io.Resource;
|
||||
import org.springframework.scheduling.annotation.Async;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.olympus.model.apollo.KSIngestionInfo;
|
||||
import com.olympus.apollo.repository.KSDocumentRepository;
|
||||
import com.olympus.apollo.repository.KSIngestionInfoRepository;
|
||||
import com.olympus.apollo.repository.KSTextsRepository;
|
||||
import com.olympus.dto.IngestionOutput;
|
||||
import com.olympus.model.apollo.KSDocument;
|
||||
import com.olympus.model.apollo.KSIngestionInfo;
|
||||
import com.olympus.model.apollo.KSTexts;
|
||||
|
||||
|
||||
@Service
|
||||
@@ -93,20 +98,62 @@ public class KSIngestor {
|
||||
}
|
||||
}
|
||||
|
||||
public IngestionOutput ingestDocumentByIdAsync(String id) {
|
||||
IngestionOutput ingestionOutput= new IngestionOutput();
|
||||
Optional<KSDocument> optionalDocument = ksDocumentRepository.findById(id);
|
||||
if (optionalDocument.isPresent()) {
|
||||
KSDocument ksDocument = optionalDocument.get();
|
||||
if ("LOADED".equals(ksDocument.getIngestionStatus()) || "ERROR".equals(ksDocument.getIngestionStatus())) {
|
||||
ingestionOutput.setStatus("IN PROGRESS");
|
||||
ingestDocumentAsync(ksDocument);
|
||||
return ingestionOutput;
|
||||
} else {
|
||||
ingestionOutput.setMessage("OOPS: Document is already Injected");
|
||||
return ingestionOutput;
|
||||
}
|
||||
} else {
|
||||
ingestionOutput.setMessage("OOPS: Document Not found");
|
||||
return ingestionOutput;
|
||||
}
|
||||
}
|
||||
|
||||
@Async
|
||||
private CompletableFuture<Void> ingestDocumentAsync(KSDocument ksDocument) {
|
||||
ingestDocument(ksDocument);
|
||||
return null;
|
||||
}
|
||||
|
||||
private IngestionOutput ingestDocument(KSDocument ksDocument) {
|
||||
IngestionOutput ingestionLoopOutput = new IngestionOutput();
|
||||
try {
|
||||
ksDocument.setIngestionStatus("IN PROGRESS");
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
|
||||
KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo();
|
||||
List<Document> docs = null;
|
||||
try {
|
||||
ksDocument.setIngestionMessage("Reading document: " + ksDocument.getFilePath());
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
|
||||
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
|
||||
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
|
||||
|
||||
List<Document> docs = tikaDocumentReader.read();
|
||||
docs = tikaDocumentReader.read();
|
||||
|
||||
logger.info("Ingested document: " + ksDocument.getFilePath());
|
||||
logger.info("Number of documents: " + docs.size());
|
||||
ksDocument.setIngestionMessage("Document read successfully");
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
|
||||
KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo();
|
||||
} catch (Exception e) {
|
||||
logger.error("Error reading document: " + e.getMessage());
|
||||
ksDocument.setIngestionStatus("ERROR");
|
||||
ksDocument.setIngestionMessage("Error reading document: " + e.getMessage());
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
ingestionLoopOutput.setStatus("ERROR");
|
||||
ingestionLoopOutput.setMessage("Error reading document: " + e.getMessage());
|
||||
return ingestionLoopOutput;
|
||||
}
|
||||
|
||||
TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(),
|
||||
ingestionInfo.getMinChunkSize(),
|
||||
@@ -119,13 +166,15 @@ public class KSIngestor {
|
||||
|
||||
docs.forEach(doc -> {
|
||||
List<Document> splitDocs = splitter.split(doc);
|
||||
|
||||
logger.info("Number of documents: " + splitDocs.size());
|
||||
for (Document splitDoc : splitDocs) {
|
||||
splitDoc.getMetadata().putAll(metadata);
|
||||
}
|
||||
|
||||
ksDocument.setIngestionMessage("Embedding documents");
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
embedDocuments(splitDocs, ingestionInfo);
|
||||
});
|
||||
|
||||
ksDocument.setIngestionStatus("INGESTED");
|
||||
ksDocument.setIngestionDate(new Date());
|
||||
ksDocument.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(new Date()));
|
||||
@@ -135,12 +184,17 @@ public class KSIngestor {
|
||||
ingestionLoopOutput.setStatus("OK");
|
||||
ingestionLoopOutput.setMessage("OK");
|
||||
}catch (Exception e){
|
||||
ksDocument.setIngestionStatus("ERROR");
|
||||
ksDocument.setIngestionMessage("Error ingesting document: " + e.getMessage());
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
|
||||
ingestionLoopOutput.setStatus("ERROR");
|
||||
ingestionLoopOutput.setMessage(e.getMessage());
|
||||
}
|
||||
return ingestionLoopOutput;
|
||||
}
|
||||
|
||||
|
||||
public IngestionOutput ingestTextById(String id,String textToBeEmbed,String KsExternalDocUniqueID) {
|
||||
IngestionOutput ingestionOutput= new IngestionOutput();
|
||||
Optional<KSTexts> optionalDocument = ksTextsRepository.findById(id);
|
||||
@@ -218,9 +272,6 @@ public class KSIngestor {
|
||||
private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) {
|
||||
|
||||
logger.info("Embedding documents");
|
||||
|
||||
docs.forEach(doc -> logger.info("Document metadata: " + doc.getMetadata()));
|
||||
|
||||
int batchSize = embDocsBatchSize;
|
||||
for (int i = 0; i < docs.size(); i += batchSize) {
|
||||
int end = Math.min(i + batchSize, docs.size());
|
||||
@@ -228,7 +279,7 @@ public class KSIngestor {
|
||||
try {
|
||||
Thread.sleep(embDocRetryTime);
|
||||
vectorStore.add(currentList);
|
||||
logger.info("Documents embedded - Progress: Batch from {} to {} completed", i, end);
|
||||
logger.info("Documents embedded - Progress: Batch from {} to {} completed of {} total chunks", i, end, docs.size());
|
||||
} catch (Exception e) {
|
||||
logger.error("Error embedding documents from {} to {}: {}", i, end, e.getMessage());
|
||||
}
|
||||
|
||||
@@ -67,3 +67,5 @@ logging:
|
||||
|
||||
java-re-module:
|
||||
url: "http://localhost:8084"
|
||||
|
||||
tika.config: "tika-config.xml"
|
||||
|
||||
4
tika-config.xml
Normal file
4
tika-config.xml
Normal file
@@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<properties>
|
||||
|
||||
</properties>
|
||||
Reference in New Issue
Block a user