diff --git a/pom.xml b/pom.xml index a90a3e3..b793e50 100644 --- a/pom.xml +++ b/pom.xml @@ -103,6 +103,8 @@ spring-ai-tika-document-reader + + org.springdoc springdoc-openapi-starter-webmvc-ui diff --git a/src/main/java/com/olympus/apollo/controllers/TestController.java b/src/main/java/com/olympus/apollo/controllers/TestController.java index 2ca3835..e3739d4 100644 --- a/src/main/java/com/olympus/apollo/controllers/TestController.java +++ b/src/main/java/com/olympus/apollo/controllers/TestController.java @@ -2,24 +2,29 @@ package com.olympus.apollo.controllers; import java.util.List; -import com.olympus.apollo.feign.services.REModuleService; -import com.olympus.dto.ResultDTO; -import com.olympus.apollo.feign.services.ParserModuleService; -import com.olympus.apollo.services.GitService; -import com.olympus.dto.CommonParseRequest; -import com.olympus.dto.ApolloParseRequestDTO; -import com.olympus.feign.JavaParserModule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.ai.document.Document; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; -import org.springframework.web.bind.annotation.*; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.PathVariable; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestBody; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; -import com.olympus.dto.IngestionOutput; +import com.olympus.apollo.feign.services.ParserModuleService; +import com.olympus.apollo.feign.services.REModuleService; import com.olympus.apollo.services.GitRepositoryIngestor; +import com.olympus.apollo.services.GitService; import com.olympus.apollo.services.KSIngestor; -import org.springframework.ai.document.Document; +import com.olympus.dto.ApolloParseRequestDTO; +import com.olympus.dto.CommonParseRequest; +import com.olympus.dto.IngestionOutput; +import com.olympus.dto.ResultDTO; +import com.olympus.feign.JavaParserModule; @RestController @@ -52,7 +57,7 @@ public class TestController { @GetMapping("test/ingest_document/{id}") public IngestionOutput ingestDocumentById(@PathVariable String id) { - return ksIngestor.ingestDocumentById(id); + return ksIngestor.ingestDocumentByIdAsync(id); } @GetMapping("test/query_vector") diff --git a/src/main/java/com/olympus/apollo/services/DeletionService.java b/src/main/java/com/olympus/apollo/services/DeletionService.java index 1229437..ad80796 100644 --- a/src/main/java/com/olympus/apollo/services/DeletionService.java +++ b/src/main/java/com/olympus/apollo/services/DeletionService.java @@ -1,29 +1,30 @@ package com.olympus.apollo.services; -import com.olympus.dto.DeleteGitRepoDetailsRequest; -import com.olympus.dto.ResultDTO; -import com.olympus.apollo.exception.vectorStoreMetaDetailsEmptyException; -import com.olympus.apollo.repository.*; -import com.olympus.model.apollo.KSGitInfo; +import java.util.Date; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Date; - -import com.olympus.dto.DeletionRequest; -import org.springframework.ai.document.Document; -import org.springframework.ai.vectorstore.SearchRequest; +import org.springframework.ai.vectorstore.VectorStore; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.messaging.simp.SimpMessagingTemplate; import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; -import org.springframework.ai.vectorstore.VectorStore; + +import com.olympus.apollo.exception.vectorStoreMetaDetailsEmptyException; +import com.olympus.apollo.repository.KSDocumentRepository; +import com.olympus.apollo.repository.KSGitInfoRepository; +import com.olympus.apollo.repository.KSGitIngestionInfoRepository; +import com.olympus.apollo.repository.KSIngestionInfoRepository; +import com.olympus.apollo.repository.KSTextsRepository; +import com.olympus.apollo.repository.VectorStoreRepository; +import com.olympus.dto.DeleteGitRepoDetailsRequest; +import com.olympus.dto.DeletionRequest; +import com.olympus.dto.ResultDTO; import com.olympus.model.apollo.KSDocument; - - -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.CompletableFuture; +import com.olympus.model.apollo.KSGitInfo; @Service public class DeletionService { @@ -58,21 +59,6 @@ public class DeletionService { public void deleteRecords(DeletionRequest deletionRequest) { try { - //TODO: COMPLETE REFACTOR REQUIRED TO DELETE RECORD FROM AZURE SEARCH - // NOT WORKING AT THE MOMENT - // boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null && - // !deletionRequest.getKsDocumentId().isEmpty() && - // ksDocumentRepository.existsById(deletionRequest.getKsDocumentId()); - // if(KSDocumentExists){ - // SearchRequest searchRequest = SearchRequest.defaults() - // .withQuery("a").withTopK(1000) - // .withSimilarityThreshold(0.0) - // .withFilterExpression("KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'"); - - - // List docs = vectorStore.similaritySearch(searchRequest); - // List ids = docs.stream().map(Document::getId).toList(); - // vectorStore.delete(ids); String rag_filter = "KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'"; logger.info("Starting deletion"); vectorStore.delete(rag_filter); @@ -92,21 +78,7 @@ public class DeletionService { public void deleteRecordsOnlyFromVectorStore(DeletionRequest deletionRequest) { try { - //TODO: COMPLETE REFACTOR REQUIRED TO DELETE RECORD FROM AZURE SEARCH - // NOT WORKING AT THE MOMENT - // boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null && - // !deletionRequest.getKsDocumentId().isEmpty() && - // ksDocumentRepository.existsById(deletionRequest.getKsDocumentId()); - // if(KSDocumentExists){ - // SearchRequest searchRequest = SearchRequest.defaults() - // .withQuery("a").withTopK(1000) - // .withSimilarityThreshold(0.0) - // .withFilterExpression("KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'"); - - // List docs = vectorStore.similaritySearch(searchRequest); - // List ids = docs.stream().map(Document::getId).toList(); - // vectorStore.delete(ids); String rag_filter = "KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'"; logger.info("Starting deletion"); vectorStore.delete(rag_filter); diff --git a/src/main/java/com/olympus/apollo/services/KSDocumentService.java b/src/main/java/com/olympus/apollo/services/KSDocumentService.java index 0ab6ddd..67a5d1d 100644 --- a/src/main/java/com/olympus/apollo/services/KSDocumentService.java +++ b/src/main/java/com/olympus/apollo/services/KSDocumentService.java @@ -5,7 +5,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.List; -import org.apache.tomcat.util.openssl.openssl_h; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; @@ -18,7 +17,6 @@ import org.springframework.security.core.context.SecurityContextHolder; import org.springframework.stereotype.Service; import com.olympus.apollo.repository.KSDocumentRepository; -import com.olympus.apollo.repository.ProjectRepository; import com.olympus.apollo.security.entity.User; import com.olympus.model.apollo.KSDocument; @@ -31,7 +29,6 @@ public class KSDocumentService { private KSDocumentRepository ksdocRepo; public List findByProjectNameAndApplicationName() { - logger.info("findByProjectNameAndApplicationName function:"); User principal = (User) SecurityContextHolder.getContext().getAuthentication().getPrincipal(); try { diff --git a/src/main/java/com/olympus/apollo/services/KSIngestor.java b/src/main/java/com/olympus/apollo/services/KSIngestor.java index 20b866f..e45dbe6 100644 --- a/src/main/java/com/olympus/apollo/services/KSIngestor.java +++ b/src/main/java/com/olympus/apollo/services/KSIngestor.java @@ -1,28 +1,33 @@ package com.olympus.apollo.services; -import java.util.*; import java.text.SimpleDateFormat; +import java.util.Collections; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; -import com.olympus.dto.IngestionOutput; -import com.olympus.model.apollo.KSDocument; -import com.olympus.model.apollo.KSTexts; -import com.olympus.apollo.repository.KSTextsRepository; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.ai.document.Document; -import org.springframework.ai.vectorstore.SearchRequest.Builder; import org.springframework.ai.reader.tika.TikaDocumentReader; import org.springframework.ai.transformer.splitter.TokenTextSplitter; import org.springframework.ai.vectorstore.SearchRequest; +import org.springframework.ai.vectorstore.SearchRequest.Builder; import org.springframework.ai.vectorstore.VectorStore; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.core.io.Resource; +import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; -import com.olympus.model.apollo.KSIngestionInfo; import com.olympus.apollo.repository.KSDocumentRepository; -import com.olympus.apollo.repository.KSIngestionInfoRepository; +import com.olympus.apollo.repository.KSTextsRepository; +import com.olympus.dto.IngestionOutput; +import com.olympus.model.apollo.KSDocument; +import com.olympus.model.apollo.KSIngestionInfo; +import com.olympus.model.apollo.KSTexts; @Service @@ -93,39 +98,83 @@ public class KSIngestor { } } + public IngestionOutput ingestDocumentByIdAsync(String id) { + IngestionOutput ingestionOutput= new IngestionOutput(); + Optional optionalDocument = ksDocumentRepository.findById(id); + if (optionalDocument.isPresent()) { + KSDocument ksDocument = optionalDocument.get(); + if ("LOADED".equals(ksDocument.getIngestionStatus()) || "ERROR".equals(ksDocument.getIngestionStatus())) { + ingestionOutput.setStatus("IN PROGRESS"); + ingestDocumentAsync(ksDocument); + return ingestionOutput; + } else { + ingestionOutput.setMessage("OOPS: Document is already Injected"); + return ingestionOutput; + } + } else { + ingestionOutput.setMessage("OOPS: Document Not found"); + return ingestionOutput; + } + } + + @Async + private CompletableFuture ingestDocumentAsync(KSDocument ksDocument) { + ingestDocument(ksDocument); + return null; + } + private IngestionOutput ingestDocument(KSDocument ksDocument) { IngestionOutput ingestionLoopOutput = new IngestionOutput(); try { ksDocument.setIngestionStatus("IN PROGRESS"); ksDocumentRepository.save(ksDocument); - - Resource file = storageService.loadAsResource(ksDocument.getFilePath()); - TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file); - - List docs = tikaDocumentReader.read(); - logger.info("Ingested document: " + ksDocument.getFilePath()); - logger.info("Number of documents: " + docs.size()); - + KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo(); + List docs = null; + try { + ksDocument.setIngestionMessage("Reading document: " + ksDocument.getFilePath()); + ksDocumentRepository.save(ksDocument); + + Resource file = storageService.loadAsResource(ksDocument.getFilePath()); + TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file); + + docs = tikaDocumentReader.read(); + + logger.info("Ingested document: " + ksDocument.getFilePath()); + logger.info("Number of documents: " + docs.size()); + ksDocument.setIngestionMessage("Document read successfully"); + ksDocumentRepository.save(ksDocument); + + } catch (Exception e) { + logger.error("Error reading document: " + e.getMessage()); + ksDocument.setIngestionStatus("ERROR"); + ksDocument.setIngestionMessage("Error reading document: " + e.getMessage()); + ksDocumentRepository.save(ksDocument); + ingestionLoopOutput.setStatus("ERROR"); + ingestionLoopOutput.setMessage("Error reading document: " + e.getMessage()); + return ingestionLoopOutput; + } TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(), - ingestionInfo.getMinChunkSize(), - ingestionInfo.getMinChunkSizeToEmbed(), - ingestionInfo.getMaxNumberOfChunks(), - true); + ingestionInfo.getMinChunkSize(), + ingestionInfo.getMinChunkSizeToEmbed(), + ingestionInfo.getMaxNumberOfChunks(), + true); HashMap metadata = ingestionInfo.getMetadata(); metadata.put("KsDocumentId",ksDocument.getId()); docs.forEach(doc -> { List splitDocs = splitter.split(doc); - - logger.info("Number of documents: " + splitDocs.size()); for (Document splitDoc : splitDocs) { splitDoc.getMetadata().putAll(metadata); - } + } + + ksDocument.setIngestionMessage("Embedding documents"); + ksDocumentRepository.save(ksDocument); embedDocuments(splitDocs, ingestionInfo); }); + ksDocument.setIngestionStatus("INGESTED"); ksDocument.setIngestionDate(new Date()); ksDocument.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(new Date())); @@ -135,12 +184,17 @@ public class KSIngestor { ingestionLoopOutput.setStatus("OK"); ingestionLoopOutput.setMessage("OK"); }catch (Exception e){ + ksDocument.setIngestionStatus("ERROR"); + ksDocument.setIngestionMessage("Error ingesting document: " + e.getMessage()); + ksDocumentRepository.save(ksDocument); + ingestionLoopOutput.setStatus("ERROR"); ingestionLoopOutput.setMessage(e.getMessage()); } return ingestionLoopOutput; } + public IngestionOutput ingestTextById(String id,String textToBeEmbed,String KsExternalDocUniqueID) { IngestionOutput ingestionOutput= new IngestionOutput(); Optional optionalDocument = ksTextsRepository.findById(id); @@ -218,9 +272,6 @@ public class KSIngestor { private void embedDocuments(List docs, KSIngestionInfo ingestionInfo) { logger.info("Embedding documents"); - - docs.forEach(doc -> logger.info("Document metadata: " + doc.getMetadata())); - int batchSize = embDocsBatchSize; for (int i = 0; i < docs.size(); i += batchSize) { int end = Math.min(i + batchSize, docs.size()); @@ -228,7 +279,7 @@ public class KSIngestor { try { Thread.sleep(embDocRetryTime); vectorStore.add(currentList); - logger.info("Documents embedded - Progress: Batch from {} to {} completed", i, end); + logger.info("Documents embedded - Progress: Batch from {} to {} completed of {} total chunks", i, end, docs.size()); } catch (Exception e) { logger.error("Error embedding documents from {} to {}: {}", i, end, e.getMessage()); } diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 32b80c4..14bd3c8 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -66,4 +66,6 @@ logging: #org.springframework.web.client: DEBUG java-re-module: - url: "http://localhost:8084" \ No newline at end of file + url: "http://localhost:8084" + +tika.config: "tika-config.xml" diff --git a/tika-config.xml b/tika-config.xml new file mode 100644 index 0000000..50d22a4 --- /dev/null +++ b/tika-config.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file