diff --git a/src/main/java/com/olympus/apollo/controllers/FeApi/KsDocumentController.java b/src/main/java/com/olympus/apollo/controllers/FeApi/KsDocumentController.java index ccb94c7..0abd934 100644 --- a/src/main/java/com/olympus/apollo/controllers/FeApi/KsDocumentController.java +++ b/src/main/java/com/olympus/apollo/controllers/FeApi/KsDocumentController.java @@ -4,18 +4,14 @@ import java.util.List; import java.util.Map; import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.web.bind.annotation.CrossOrigin; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RequestParam; -import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.bind.annotation.*; import com.olympus.apollo.models.KSDocument; import com.olympus.apollo.repository.KSDocumentRepository; @RestController @RequestMapping("/fe-api/ksdocuments") -@CrossOrigin +@CrossOrigin(origins = "http://localhost:5173") public class KsDocumentController { @Autowired @@ -30,7 +26,7 @@ public class KsDocumentController { return result; } @GetMapping("/{id}") - public KSDocument getDocument(@RequestParam String id) { + public KSDocument getDocument(@PathVariable String id) { KSDocument result = ksDocumentREpository.findById(id).get(); diff --git a/src/main/java/com/olympus/apollo/controllers/KSFileController.java b/src/main/java/com/olympus/apollo/controllers/KSFileController.java index 25efcfb..94fced1 100644 --- a/src/main/java/com/olympus/apollo/controllers/KSFileController.java +++ b/src/main/java/com/olympus/apollo/controllers/KSFileController.java @@ -3,16 +3,13 @@ package com.olympus.apollo.controllers; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Date; import org.codelibs.jhighlight.fastutil.Hash; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.ResponseEntity; import org.springframework.stereotype.Controller; -import org.springframework.web.bind.annotation.ExceptionHandler; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.PostMapping; -import org.springframework.web.bind.annotation.RequestParam; -import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.bind.annotation.*; import org.springframework.web.multipart.MultipartFile; import org.springframework.web.servlet.mvc.support.RedirectAttributes; @@ -22,7 +19,8 @@ import com.olympus.apollo.repository.KSDocumentRepository; import com.olympus.apollo.repository.KSIngestionInfoRepository; import com.olympus.apollo.services.StorageFileNotFoundException; import com.olympus.apollo.services.StorageService; - +import com.olympus.apollo.models.FileUploadDTO; +@CrossOrigin @RestController public class KSFileController { @Autowired @@ -33,44 +31,48 @@ public class KSFileController { private KSIngestionInfoRepository ksIngestionInfoRepository; - - @PostMapping("/upload") - public String handleFileUpload(@RequestParam("file") MultipartFile file) { + public String handleFileUpload( + @RequestParam("file") MultipartFile file, + @ModelAttribute FileUploadDTO fileUploadDTO + ) { + String filePath = storageService.store(file); - String filePath = storageService.store(file); KSDocument ksDocument = new KSDocument(); ksDocument.setFilePath(filePath); ksDocument.setFileName(file.getOriginalFilename()); ksDocument.setName(file.getOriginalFilename()); - ksDocument.setDescription("Uploaded file"); + ksDocument.setDescription(fileUploadDTO.getDescription()); ksDocument.setIngestionStatus("NEW"); - KSIngestionInfo ksIngestionInfo = new KSIngestionInfo(); - ksIngestionInfo.setType("MD_DOCUMENT"); //TODO: This should be dynamic + Date now = new Date(); + ksDocument.setIngestionDate(now); + + + KSIngestionInfo ksIngestionInfo = new KSIngestionInfo(); + ksIngestionInfo.setType(fileUploadDTO.getType()); // != null ? type : "MD_DOCUMENT" + + HashMap metadata = new HashMap<>(); + metadata.put("KsApplicationName", fileUploadDTO.getKsApplicationName()); + metadata.put("KsDoctype", fileUploadDTO.getKsDocType()); + metadata.put("KsDocSource", fileUploadDTO.getKsDocType()); + metadata.put("Source", file.getOriginalFilename()); - HashMap metadata = new HashMap<>(); - - metadata.put("KsApplicatioName","atf"); - metadata.put("KsDoctype","documentation"); - metadata.put("KsDoSource","wiki"); - metadata.put("Source",file.getOriginalFilename()); - ksIngestionInfo.setMetadata(metadata); - ksIngestionInfo.setDefaultChunkSize(1000); - ksIngestionInfo.setMinChunkSize(200); - ksIngestionInfo.setMaxNumberOfChunks(1000); - ksIngestionInfo.setMinChunkSizeToEmbed(20); + ksIngestionInfo.setDefaultChunkSize(fileUploadDTO.getDefaultChunkSize()); + ksIngestionInfo.setMinChunkSize(fileUploadDTO.getMinChunkSize()); + ksIngestionInfo.setMaxNumberOfChunks(fileUploadDTO.getMaxNumberOfChunks()); + ksIngestionInfo.setMinChunkSizeToEmbed(fileUploadDTO.getMinChunkSizeToEmbed()); ksIngestionInfoRepository.save(ksIngestionInfo); ksDocument.setIngestionInfo(ksIngestionInfo); ksDocumentREpository.save(ksDocument); - return "OK"; - } + return "OK"; + } - @ExceptionHandler(StorageFileNotFoundException.class) - public ResponseEntity handleStorageFileNotFound(StorageFileNotFoundException exc) { - return ResponseEntity.notFound().build(); - } -} + @ExceptionHandler(StorageFileNotFoundException.class) + public ResponseEntity handleStorageFileNotFound(StorageFileNotFoundException exc) { + return ResponseEntity.notFound().build(); + } +} \ No newline at end of file diff --git a/src/main/java/com/olympus/apollo/controllers/TestController.java b/src/main/java/com/olympus/apollo/controllers/TestController.java index f5159f1..761aabd 100644 --- a/src/main/java/com/olympus/apollo/controllers/TestController.java +++ b/src/main/java/com/olympus/apollo/controllers/TestController.java @@ -3,16 +3,16 @@ package com.olympus.apollo.controllers; import java.util.HashMap; import java.util.List; +import com.olympus.apollo.dto.IngestionOutput; import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.web.bind.annotation.RestController; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.*; import com.olympus.apollo.models.KSIngestionInfo; import com.olympus.apollo.services.GitRepositoryIngestor; import com.olympus.apollo.services.KSIngestor; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.RequestParam; - +@CrossOrigin(origins = "http://localhost:5173") @RestController public class TestController { @@ -21,16 +21,21 @@ public class TestController { @Autowired GitRepositoryIngestor gitRepositoryIngestor; + @GetMapping("test/ingestion_loop") - public String testIngestionLoop() { - ksIngestor.ingestLoop(); - return "Ingestion Loop Completed"; + public IngestionOutput testIngestionLoop() { + return ksIngestor.ingestLoop(); + } + + @GetMapping("test/ingest_document/{id}") + public IngestionOutput ingestDocumentById(@PathVariable String id) { + return ksIngestor.ingestDocumentById(id); } @GetMapping("test/query_vector") - public List testSimilaritySearch(@RequestParam String query) { - return ksIngestor.testSimilaritySearch(query,"documentation"); + public List testSimilaritySearch(@RequestParam String query, @RequestParam String type) { + return ksIngestor.testSimilaritySearch(query,type); } @GetMapping("test/delete") diff --git a/src/main/java/com/olympus/apollo/dto/IngestionOutput.java b/src/main/java/com/olympus/apollo/dto/IngestionOutput.java new file mode 100644 index 0000000..fd256c1 --- /dev/null +++ b/src/main/java/com/olympus/apollo/dto/IngestionOutput.java @@ -0,0 +1,20 @@ +package com.olympus.apollo.dto; + +import lombok.Getter; +import lombok.Setter; + +import java.util.ArrayList; +import java.util.List; + +@Getter @Setter +public class IngestionOutput { + private String status; + private String message; + private List ingestedDocumentId; + + public IngestionOutput(){ + ingestedDocumentId=new ArrayList(); + } + + +} diff --git a/src/main/java/com/olympus/apollo/models/FileUploadDTO.java b/src/main/java/com/olympus/apollo/models/FileUploadDTO.java new file mode 100644 index 0000000..26e6c8e --- /dev/null +++ b/src/main/java/com/olympus/apollo/models/FileUploadDTO.java @@ -0,0 +1,21 @@ +package com.olympus.apollo.models; + +import lombok.Getter; +import lombok.Setter; + +import java.util.Date; + +@Getter @Setter +public class FileUploadDTO { + private String description; + private String ingestionStatus; + private String type; + private String ksApplicationName; + private String ksDocType; + private String ksDocSource; + private int defaultChunkSize; + private int minChunkSize; + private int maxNumberOfChunks; + private int minChunkSizeToEmbed; + private Date ingestionDate; +} diff --git a/src/main/java/com/olympus/apollo/models/KSDocument.java b/src/main/java/com/olympus/apollo/models/KSDocument.java index 82bad74..411adf8 100644 --- a/src/main/java/com/olympus/apollo/models/KSDocument.java +++ b/src/main/java/com/olympus/apollo/models/KSDocument.java @@ -23,7 +23,7 @@ public class KSDocument { private String ingestionStatus; - private String ingestionMessage; + //private String ingestionMessage; private Date ingestionDate; private KSIngestionInfo ingestionInfo; diff --git a/src/main/java/com/olympus/apollo/repository/KSIngestionInfoRepository.java b/src/main/java/com/olympus/apollo/repository/KSIngestionInfoRepository.java index 0052b90..ae6c8a4 100644 --- a/src/main/java/com/olympus/apollo/repository/KSIngestionInfoRepository.java +++ b/src/main/java/com/olympus/apollo/repository/KSIngestionInfoRepository.java @@ -4,7 +4,9 @@ import org.springframework.data.repository.CrudRepository; import org.springframework.stereotype.Repository; import com.olympus.apollo.models.KSIngestionInfo; +import org.springframework.web.bind.annotation.CrossOrigin; +@CrossOrigin @Repository public interface KSIngestionInfoRepository extends CrudRepository { diff --git a/src/main/java/com/olympus/apollo/services/KSIngestor.java b/src/main/java/com/olympus/apollo/services/KSIngestor.java index 618775e..654177c 100644 --- a/src/main/java/com/olympus/apollo/services/KSIngestor.java +++ b/src/main/java/com/olympus/apollo/services/KSIngestor.java @@ -1,16 +1,13 @@ package com.olympus.apollo.services; -import java.util.ArrayList; -import java.util.Date; -import java.util.HashMap; -import java.util.List; +import java.util.*; -import org.codelibs.jhighlight.fastutil.Hash; +import com.olympus.apollo.dto.IngestionOutput; +import com.olympus.apollo.models.KSDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.ai.document.Document; import org.springframework.ai.reader.tika.TikaDocumentReader; -import org.springframework.ai.transformer.KeywordMetadataEnricher; import org.springframework.ai.transformer.splitter.TokenTextSplitter; import org.springframework.ai.vectorstore.SearchRequest; import org.springframework.ai.vectorstore.VectorStore; @@ -35,43 +32,107 @@ public class KSIngestor { @Autowired private VectorStore vectorStore; - Logger logger = LoggerFactory.getLogger(KSIngestor.class); public void deleteAll(String document_file_name) { List docToDelete = vectorStore.similaritySearch(SearchRequest.defaults().withQuery("*") - .withSimilarityThreshold(0.0) - .withFilterExpression("'source'=='3-automated-test-framework---atf.md'")); + .withSimilarityThreshold(0.0) + .withFilterExpression("'source'=='3-automated-test-framework---atf.md'")); logger.info("Number of documents to delete: " + docToDelete.size()); } - public void ingestLoop() { + public IngestionOutput ingestLoop() { - - ksDocumentRepository.findAllByIngestionStatus("NEW").forEach(ksDocument -> { - // ingest the document + IngestionOutput ingestionLoopOutput = new IngestionOutput(); + try { + ksDocumentRepository.findAllByIngestionStatus("NEW").forEach(ksDocument -> { + logger.info("Processing document: " + ksDocument.getFilePath()); + // ingest the document + ksDocument.setIngestionStatus("IN PROGRESS"); + ksDocumentRepository.save(ksDocument); + + Resource file = storageService.loadAsResource(ksDocument.getFilePath()); + TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file); + + List docs = tikaDocumentReader.read(); + logger.info("Ingested document: " + ksDocument.getFilePath()); + logger.info("Number of documents: " + docs.size()); + + KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo(); + + + TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(), + ingestionInfo.getMinChunkSize(), + ingestionInfo.getMinChunkSizeToEmbed(), + ingestionInfo.getMaxNumberOfChunks(), + true); + + + docs.forEach(doc -> { + List splitDocs = splitter.split(doc); + + logger.info("Number of documents: " + splitDocs.size()); + for (Document splitDoc : splitDocs) { + logger.info("Split before put document metadata: " + splitDoc.getMetadata()); + splitDoc.getMetadata().putAll(getMetadata(ingestionInfo)); + logger.info("Split after put document metadata: " + splitDoc.getMetadata()); + } + embedDocuments(splitDocs, ingestionInfo); + }); + ksDocument.setIngestionStatus("INGESTED");//we have to set to DONE + ksDocument.setIngestionDate(new Date()); + + ksDocumentRepository.save(ksDocument); + + ingestionLoopOutput.getIngestedDocumentId().add(ksDocument.getId()); + }); + ingestionLoopOutput.setStatus("OK"); + }catch (Exception e){ + ingestionLoopOutput.setStatus("ERROR"); + ingestionLoopOutput.setMessage(e.getMessage()); + } + return ingestionLoopOutput; + } + + public IngestionOutput ingestDocumentById(String id) { + IngestionOutput ingestionOutput= new IngestionOutput(); + Optional optionalDocument = ksDocumentRepository.findById(id); + if (optionalDocument.isPresent()) { + KSDocument ksDocument = optionalDocument.get(); + if ("NEW".equals(ksDocument.getIngestionStatus())) { + return ingestDocument(ksDocument); + } else { + ingestionOutput.setMessage("OOPS: Document is already Injected"); + return ingestionOutput; + } + } else { + ingestionOutput.setMessage("OOPS: Document Not found"); + return ingestionOutput; + } + } + + private IngestionOutput ingestDocument(KSDocument ksDocument) { + IngestionOutput ingestionLoopOutput = new IngestionOutput(); + try { ksDocument.setIngestionStatus("IN PROGRESS"); ksDocumentRepository.save(ksDocument); - Resource file = storageService.loadAsResource(ksDocument.getFilePath()); + Resource file = storageService.loadAsResource(ksDocument.getFilePath()); TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file); - List docs = tikaDocumentReader.read(); - + List docs = tikaDocumentReader.read(); logger.info("Ingested document: " + ksDocument.getFilePath()); logger.info("Number of documents: " + docs.size()); KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo(); - TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(), - ingestionInfo.getMinChunkSize(), - ingestionInfo.getMinChunkSizeToEmbed(), - ingestionInfo.getMaxNumberOfChunks(), - true); - - + ingestionInfo.getMinChunkSize(), + ingestionInfo.getMinChunkSizeToEmbed(), + ingestionInfo.getMaxNumberOfChunks(), + true); + docs.forEach(doc -> { List splitDocs = splitter.split(doc); @@ -81,40 +142,51 @@ public class KSIngestor { } embedDocuments(splitDocs, ingestionInfo); }); - ksDocument.setIngestionStatus("NEW"); + ksDocument.setIngestionStatus("INGESTED"); ksDocument.setIngestionDate(new Date()); - - ksDocumentRepository.save(ksDocument); - - }); - - + ksDocumentRepository.save(ksDocument); + + ingestionLoopOutput.getIngestedDocumentId().add(ksDocument.getId()); + ingestionLoopOutput.setStatus("OK"); + ingestionLoopOutput.setMessage("OK"); + }catch (Exception e){ + ingestionLoopOutput.setStatus("ERROR"); + ingestionLoopOutput.setMessage(e.getMessage()); + } + return ingestionLoopOutput; } - + + + private void embedDocuments(List docs, KSIngestionInfo ingestionInfo) { - - logger.info("Embedding documents"); - - vectorStore.add(docs); - logger.info("Documents embedded"); + + logger.info("Embedding documents"); + + docs.forEach(doc -> logger.info("Document metadata: " + doc.getMetadata())); + try { + vectorStore.add(docs); + logger.info("Documents embedded"); + } catch (Exception e) { + logger.error("Error embedding documents: ", e); + } } public List testSimilaritySearch(String query,String filter_doc_type) { List docs = vectorStore.similaritySearch( - SearchRequest.defaults() - .withQuery(query) - .withTopK(5).withSimilarityThreshold(0.8) - .withFilterExpression("'ks_document_type'=='"+filter_doc_type+"'")); + SearchRequest.defaults() + .withQuery(query) + .withTopK(5).withSimilarityThreshold(0.8) + .withFilterExpression("'KsDoctype'=='"+filter_doc_type+"'")); List result = new ArrayList(); for (Document doc : docs) { result.add(doc.getContent()); } return result; - } + } + - private HashMap getMetadata(KSIngestionInfo ingestionInfo) { return ingestionInfo.getMetadata(); @@ -130,4 +202,4 @@ public class KSIngestor { } -} +} \ No newline at end of file diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index d7eda1e..b9f6efe 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -23,3 +23,7 @@ spring.ai.vectorstore.mongodb.initialize-schema=false # API key if needed, e.g. OpenAI spring.ai.openai.api-key=sk-proj-j3TFJ0h348DIzMrYYfyUT3BlbkFJjk4HMc8A2ux2Asg8Y7H1 +#size filter +spring.servlet.multipart.max-file-size=10MB +spring.servlet.multipart.max-request-size=10MB +