diff --git a/pom.xml b/pom.xml
index a90a3e3..b793e50 100644
--- a/pom.xml
+++ b/pom.xml
@@ -103,6 +103,8 @@
spring-ai-tika-document-reader
+
+
org.springdoc
springdoc-openapi-starter-webmvc-ui
diff --git a/src/main/java/com/olympus/apollo/controllers/TestController.java b/src/main/java/com/olympus/apollo/controllers/TestController.java
index 2ca3835..e3739d4 100644
--- a/src/main/java/com/olympus/apollo/controllers/TestController.java
+++ b/src/main/java/com/olympus/apollo/controllers/TestController.java
@@ -2,24 +2,29 @@ package com.olympus.apollo.controllers;
import java.util.List;
-import com.olympus.apollo.feign.services.REModuleService;
-import com.olympus.dto.ResultDTO;
-import com.olympus.apollo.feign.services.ParserModuleService;
-import com.olympus.apollo.services.GitService;
-import com.olympus.dto.CommonParseRequest;
-import com.olympus.dto.ApolloParseRequestDTO;
-import com.olympus.feign.JavaParserModule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.springframework.ai.document.Document;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
-import org.springframework.web.bind.annotation.*;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.PathVariable;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestBody;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.RestController;
-import com.olympus.dto.IngestionOutput;
+import com.olympus.apollo.feign.services.ParserModuleService;
+import com.olympus.apollo.feign.services.REModuleService;
import com.olympus.apollo.services.GitRepositoryIngestor;
+import com.olympus.apollo.services.GitService;
import com.olympus.apollo.services.KSIngestor;
-import org.springframework.ai.document.Document;
+import com.olympus.dto.ApolloParseRequestDTO;
+import com.olympus.dto.CommonParseRequest;
+import com.olympus.dto.IngestionOutput;
+import com.olympus.dto.ResultDTO;
+import com.olympus.feign.JavaParserModule;
@RestController
@@ -52,7 +57,7 @@ public class TestController {
@GetMapping("test/ingest_document/{id}")
public IngestionOutput ingestDocumentById(@PathVariable String id) {
- return ksIngestor.ingestDocumentById(id);
+ return ksIngestor.ingestDocumentByIdAsync(id);
}
@GetMapping("test/query_vector")
diff --git a/src/main/java/com/olympus/apollo/services/DeletionService.java b/src/main/java/com/olympus/apollo/services/DeletionService.java
index 1229437..ad80796 100644
--- a/src/main/java/com/olympus/apollo/services/DeletionService.java
+++ b/src/main/java/com/olympus/apollo/services/DeletionService.java
@@ -1,29 +1,30 @@
package com.olympus.apollo.services;
-import com.olympus.dto.DeleteGitRepoDetailsRequest;
-import com.olympus.dto.ResultDTO;
-import com.olympus.apollo.exception.vectorStoreMetaDetailsEmptyException;
-import com.olympus.apollo.repository.*;
-import com.olympus.model.apollo.KSGitInfo;
+import java.util.Date;
+import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.util.Date;
-
-import com.olympus.dto.DeletionRequest;
-import org.springframework.ai.document.Document;
-import org.springframework.ai.vectorstore.SearchRequest;
+import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.messaging.simp.SimpMessagingTemplate;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
-import org.springframework.ai.vectorstore.VectorStore;
+
+import com.olympus.apollo.exception.vectorStoreMetaDetailsEmptyException;
+import com.olympus.apollo.repository.KSDocumentRepository;
+import com.olympus.apollo.repository.KSGitInfoRepository;
+import com.olympus.apollo.repository.KSGitIngestionInfoRepository;
+import com.olympus.apollo.repository.KSIngestionInfoRepository;
+import com.olympus.apollo.repository.KSTextsRepository;
+import com.olympus.apollo.repository.VectorStoreRepository;
+import com.olympus.dto.DeleteGitRepoDetailsRequest;
+import com.olympus.dto.DeletionRequest;
+import com.olympus.dto.ResultDTO;
import com.olympus.model.apollo.KSDocument;
-
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Optional;
-import java.util.concurrent.CompletableFuture;
+import com.olympus.model.apollo.KSGitInfo;
@Service
public class DeletionService {
@@ -58,21 +59,6 @@ public class DeletionService {
public void deleteRecords(DeletionRequest deletionRequest) {
try {
- //TODO: COMPLETE REFACTOR REQUIRED TO DELETE RECORD FROM AZURE SEARCH
- // NOT WORKING AT THE MOMENT
- // boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null &&
- // !deletionRequest.getKsDocumentId().isEmpty() &&
- // ksDocumentRepository.existsById(deletionRequest.getKsDocumentId());
- // if(KSDocumentExists){
- // SearchRequest searchRequest = SearchRequest.defaults()
- // .withQuery("a").withTopK(1000)
- // .withSimilarityThreshold(0.0)
- // .withFilterExpression("KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'");
-
-
- // List docs = vectorStore.similaritySearch(searchRequest);
- // List ids = docs.stream().map(Document::getId).toList();
- // vectorStore.delete(ids);
String rag_filter = "KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'";
logger.info("Starting deletion");
vectorStore.delete(rag_filter);
@@ -92,21 +78,7 @@ public class DeletionService {
public void deleteRecordsOnlyFromVectorStore(DeletionRequest deletionRequest) {
try {
- //TODO: COMPLETE REFACTOR REQUIRED TO DELETE RECORD FROM AZURE SEARCH
- // NOT WORKING AT THE MOMENT
- // boolean KSDocumentExists = deletionRequest.getKsDocumentId() != null &&
- // !deletionRequest.getKsDocumentId().isEmpty() &&
- // ksDocumentRepository.existsById(deletionRequest.getKsDocumentId());
- // if(KSDocumentExists){
- // SearchRequest searchRequest = SearchRequest.defaults()
- // .withQuery("a").withTopK(1000)
- // .withSimilarityThreshold(0.0)
- // .withFilterExpression("KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'");
-
- // List docs = vectorStore.similaritySearch(searchRequest);
- // List ids = docs.stream().map(Document::getId).toList();
- // vectorStore.delete(ids);
String rag_filter = "KsDocumentId=='"+deletionRequest.getKsDocumentId()+"'";
logger.info("Starting deletion");
vectorStore.delete(rag_filter);
diff --git a/src/main/java/com/olympus/apollo/services/KSDocumentService.java b/src/main/java/com/olympus/apollo/services/KSDocumentService.java
index 0ab6ddd..67a5d1d 100644
--- a/src/main/java/com/olympus/apollo/services/KSDocumentService.java
+++ b/src/main/java/com/olympus/apollo/services/KSDocumentService.java
@@ -5,7 +5,6 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
-import org.apache.tomcat.util.openssl.openssl_h;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
@@ -18,7 +17,6 @@ import org.springframework.security.core.context.SecurityContextHolder;
import org.springframework.stereotype.Service;
import com.olympus.apollo.repository.KSDocumentRepository;
-import com.olympus.apollo.repository.ProjectRepository;
import com.olympus.apollo.security.entity.User;
import com.olympus.model.apollo.KSDocument;
@@ -31,7 +29,6 @@ public class KSDocumentService {
private KSDocumentRepository ksdocRepo;
public List findByProjectNameAndApplicationName() {
- logger.info("findByProjectNameAndApplicationName function:");
User principal = (User) SecurityContextHolder.getContext().getAuthentication().getPrincipal();
try {
diff --git a/src/main/java/com/olympus/apollo/services/KSIngestor.java b/src/main/java/com/olympus/apollo/services/KSIngestor.java
index 20b866f..e45dbe6 100644
--- a/src/main/java/com/olympus/apollo/services/KSIngestor.java
+++ b/src/main/java/com/olympus/apollo/services/KSIngestor.java
@@ -1,28 +1,33 @@
package com.olympus.apollo.services;
-import java.util.*;
import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
-import com.olympus.dto.IngestionOutput;
-import com.olympus.model.apollo.KSDocument;
-import com.olympus.model.apollo.KSTexts;
-import com.olympus.apollo.repository.KSTextsRepository;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;
-import org.springframework.ai.vectorstore.SearchRequest.Builder;
import org.springframework.ai.reader.tika.TikaDocumentReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.vectorstore.SearchRequest;
+import org.springframework.ai.vectorstore.SearchRequest.Builder;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
+import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
-import com.olympus.model.apollo.KSIngestionInfo;
import com.olympus.apollo.repository.KSDocumentRepository;
-import com.olympus.apollo.repository.KSIngestionInfoRepository;
+import com.olympus.apollo.repository.KSTextsRepository;
+import com.olympus.dto.IngestionOutput;
+import com.olympus.model.apollo.KSDocument;
+import com.olympus.model.apollo.KSIngestionInfo;
+import com.olympus.model.apollo.KSTexts;
@Service
@@ -93,39 +98,83 @@ public class KSIngestor {
}
}
+    /**
+     * Looks up the stored document with the given id and, when it is eligible,
+     * hands it to {@code ingestDocumentAsync}; the value returned by that call
+     * is not used here.
+     *
+     * <p>A document is eligible only when its ingestion status is {@code LOADED}
+     * or {@code ERROR}; the reply then carries the status {@code IN PROGRESS}.
+     * In every other case (document missing, or any other status) only a message
+     * is set on the reply.
+     *
+     * @param id identifier used to look the document up in the document repository
+     * @return a reply telling whether ingestion was requested
+     */
+    public IngestionOutput ingestDocumentByIdAsync(String id) {
+        IngestionOutput reply = new IngestionOutput();
+
+        // Guard: unknown id.
+        KSDocument target = ksDocumentRepository.findById(id).orElse(null);
+        if (target == null) {
+            reply.setMessage("OOPS: Document Not found");
+            return reply;
+        }
+
+        // Guard: only freshly loaded or previously failed documents may be (re)ingested.
+        String currentStatus = target.getIngestionStatus();
+        boolean ingestable = "LOADED".equals(currentStatus) || "ERROR".equals(currentStatus);
+        if (!ingestable) {
+            reply.setMessage("OOPS: Document is already Injected");
+            return reply;
+        }
+
+        reply.setStatus("IN PROGRESS");
+        ingestDocumentAsync(target);
+        return reply;
+    }
+
+    /**
+     * Runs {@code ingestDocument} for the given document on a background thread.
+     *
+     * <p>This method is private and is invoked directly from within this class, so
+     * a Spring {@code @Async} annotation placed on it is not applied by the default
+     * proxy-based async support: the call never passes through the proxy and the
+     * ingestion would run on the caller's thread. The work is therefore submitted
+     * explicitly, and a real future is returned instead of {@code null}.
+     *
+     * @param ksDocument the document to ingest
+     * @return a future that completes with the value returned by {@code ingestDocument}
+     */
+    private CompletableFuture<IngestionOutput> ingestDocumentAsync(KSDocument ksDocument) {
+        // NOTE(review): supplyAsync without an executor argument runs on
+        // ForkJoinPool.commonPool(); ingestion blocks on I/O, so consider passing
+        // a dedicated executor here — confirm which executor the project provides.
+        return CompletableFuture.supplyAsync(() -> ingestDocument(ksDocument));
+    }
+
private IngestionOutput ingestDocument(KSDocument ksDocument) {
IngestionOutput ingestionLoopOutput = new IngestionOutput();
try {
ksDocument.setIngestionStatus("IN PROGRESS");
ksDocumentRepository.save(ksDocument);
-
- Resource file = storageService.loadAsResource(ksDocument.getFilePath());
- TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
-
- List docs = tikaDocumentReader.read();
- logger.info("Ingested document: " + ksDocument.getFilePath());
- logger.info("Number of documents: " + docs.size());
-
+
KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo();
+ List docs = null;
+ try {
+ ksDocument.setIngestionMessage("Reading document: " + ksDocument.getFilePath());
+ ksDocumentRepository.save(ksDocument);
+
+ Resource file = storageService.loadAsResource(ksDocument.getFilePath());
+ TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
+
+ docs = tikaDocumentReader.read();
+
+ logger.info("Ingested document: " + ksDocument.getFilePath());
+ logger.info("Number of documents: " + docs.size());
+ ksDocument.setIngestionMessage("Document read successfully");
+ ksDocumentRepository.save(ksDocument);
+
+ } catch (Exception e) {
+ logger.error("Error reading document: " + e.getMessage());
+ ksDocument.setIngestionStatus("ERROR");
+ ksDocument.setIngestionMessage("Error reading document: " + e.getMessage());
+ ksDocumentRepository.save(ksDocument);
+ ingestionLoopOutput.setStatus("ERROR");
+ ingestionLoopOutput.setMessage("Error reading document: " + e.getMessage());
+ return ingestionLoopOutput;
+ }
TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(),
- ingestionInfo.getMinChunkSize(),
- ingestionInfo.getMinChunkSizeToEmbed(),
- ingestionInfo.getMaxNumberOfChunks(),
- true);
+ ingestionInfo.getMinChunkSize(),
+ ingestionInfo.getMinChunkSizeToEmbed(),
+ ingestionInfo.getMaxNumberOfChunks(),
+ true);
HashMap metadata = ingestionInfo.getMetadata();
metadata.put("KsDocumentId",ksDocument.getId());
docs.forEach(doc -> {
List splitDocs = splitter.split(doc);
-
- logger.info("Number of documents: " + splitDocs.size());
for (Document splitDoc : splitDocs) {
splitDoc.getMetadata().putAll(metadata);
- }
+ }
+
+ ksDocument.setIngestionMessage("Embedding documents");
+ ksDocumentRepository.save(ksDocument);
embedDocuments(splitDocs, ingestionInfo);
});
+
ksDocument.setIngestionStatus("INGESTED");
ksDocument.setIngestionDate(new Date());
ksDocument.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(new Date()));
@@ -135,12 +184,17 @@ public class KSIngestor {
ingestionLoopOutput.setStatus("OK");
ingestionLoopOutput.setMessage("OK");
}catch (Exception e){
+ ksDocument.setIngestionStatus("ERROR");
+ ksDocument.setIngestionMessage("Error ingesting document: " + e.getMessage());
+ ksDocumentRepository.save(ksDocument);
+
ingestionLoopOutput.setStatus("ERROR");
ingestionLoopOutput.setMessage(e.getMessage());
}
return ingestionLoopOutput;
}
+
public IngestionOutput ingestTextById(String id,String textToBeEmbed,String KsExternalDocUniqueID) {
IngestionOutput ingestionOutput= new IngestionOutput();
Optional optionalDocument = ksTextsRepository.findById(id);
@@ -218,9 +272,6 @@ public class KSIngestor {
private void embedDocuments(List docs, KSIngestionInfo ingestionInfo) {
logger.info("Embedding documents");
-
- docs.forEach(doc -> logger.info("Document metadata: " + doc.getMetadata()));
-
int batchSize = embDocsBatchSize;
for (int i = 0; i < docs.size(); i += batchSize) {
int end = Math.min(i + batchSize, docs.size());
@@ -228,7 +279,7 @@ public class KSIngestor {
try {
Thread.sleep(embDocRetryTime);
vectorStore.add(currentList);
- logger.info("Documents embedded - Progress: Batch from {} to {} completed", i, end);
+ logger.info("Documents embedded - Progress: Batch from {} to {} completed of {} total chunks", i, end, docs.size());
} catch (Exception e) {
logger.error("Error embedding documents from {} to {}: {}", i, end, e.getMessage());
}
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
index 32b80c4..14bd3c8 100644
--- a/src/main/resources/application.yml
+++ b/src/main/resources/application.yml
@@ -66,4 +66,6 @@ logging:
#org.springframework.web.client: DEBUG
java-re-module:
- url: "http://localhost:8084"
\ No newline at end of file
+ url: "http://localhost:8084"
+
+tika.config: "tika-config.xml"
diff --git a/tika-config.xml b/tika-config.xml
new file mode 100644
index 0000000..50d22a4
--- /dev/null
+++ b/tika-config.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file