ingestion api working fine

This commit is contained in:
sumedh
2024-08-02 22:16:33 +05:30
parent 1fdd4f52ec
commit 0c386d4d06
9 changed files with 213 additions and 91 deletions

View File

@@ -4,18 +4,14 @@ import java.util.List;
import java.util.Map;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.bind.annotation.*;
import com.olympus.apollo.models.KSDocument;
import com.olympus.apollo.repository.KSDocumentRepository;
@RestController
@RequestMapping("/fe-api/ksdocuments")
@CrossOrigin
@CrossOrigin(origins = "http://localhost:5173")
public class KsDocumentController {
@Autowired
@@ -30,7 +26,7 @@ public class KsDocumentController {
return result;
}
@GetMapping("/{id}")
public KSDocument getDocument(@RequestParam String id) {
public KSDocument getDocument(@PathVariable String id) {
KSDocument result = ksDocumentREpository.findById(id).get();

View File

@@ -3,16 +3,13 @@ package com.olympus.apollo.controllers;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Date;
import org.codelibs.jhighlight.fastutil.Hash;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.ExceptionHandler;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.mvc.support.RedirectAttributes;
@@ -22,7 +19,8 @@ import com.olympus.apollo.repository.KSDocumentRepository;
import com.olympus.apollo.repository.KSIngestionInfoRepository;
import com.olympus.apollo.services.StorageFileNotFoundException;
import com.olympus.apollo.services.StorageService;
import com.olympus.apollo.models.FileUploadDTO;
@CrossOrigin
@RestController
public class KSFileController {
@Autowired
@@ -33,44 +31,48 @@ public class KSFileController {
private KSIngestionInfoRepository ksIngestionInfoRepository;
@PostMapping("/upload")
public String handleFileUpload(@RequestParam("file") MultipartFile file) {
public String handleFileUpload(
@RequestParam("file") MultipartFile file,
@ModelAttribute FileUploadDTO fileUploadDTO
) {
String filePath = storageService.store(file);
String filePath = storageService.store(file);
KSDocument ksDocument = new KSDocument();
ksDocument.setFilePath(filePath);
ksDocument.setFileName(file.getOriginalFilename());
ksDocument.setName(file.getOriginalFilename());
ksDocument.setDescription("Uploaded file");
ksDocument.setDescription(fileUploadDTO.getDescription());
ksDocument.setIngestionStatus("NEW");
KSIngestionInfo ksIngestionInfo = new KSIngestionInfo();
ksIngestionInfo.setType("MD_DOCUMENT"); //TODO: This should be dynamic
Date now = new Date();
ksDocument.setIngestionDate(now);
KSIngestionInfo ksIngestionInfo = new KSIngestionInfo();
ksIngestionInfo.setType(fileUploadDTO.getType()); // != null ? type : "MD_DOCUMENT"
HashMap<String, String> metadata = new HashMap<>();
metadata.put("KsApplicationName", fileUploadDTO.getKsApplicationName());
metadata.put("KsDoctype", fileUploadDTO.getKsDocType());
metadata.put("KsDocSource", fileUploadDTO.getKsDocType());
metadata.put("Source", file.getOriginalFilename());
HashMap<String, String> metadata = new HashMap<>();
metadata.put("KsApplicatioName","atf");
metadata.put("KsDoctype","documentation");
metadata.put("KsDoSource","wiki");
metadata.put("Source",file.getOriginalFilename());
ksIngestionInfo.setMetadata(metadata);
ksIngestionInfo.setDefaultChunkSize(1000);
ksIngestionInfo.setMinChunkSize(200);
ksIngestionInfo.setMaxNumberOfChunks(1000);
ksIngestionInfo.setMinChunkSizeToEmbed(20);
ksIngestionInfo.setDefaultChunkSize(fileUploadDTO.getDefaultChunkSize());
ksIngestionInfo.setMinChunkSize(fileUploadDTO.getMinChunkSize());
ksIngestionInfo.setMaxNumberOfChunks(fileUploadDTO.getMaxNumberOfChunks());
ksIngestionInfo.setMinChunkSizeToEmbed(fileUploadDTO.getMinChunkSizeToEmbed());
ksIngestionInfoRepository.save(ksIngestionInfo);
ksDocument.setIngestionInfo(ksIngestionInfo);
ksDocumentREpository.save(ksDocument);
return "OK";
}
return "OK";
}
@ExceptionHandler(StorageFileNotFoundException.class)
public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) {
return ResponseEntity.notFound().build();
}
}
@ExceptionHandler(StorageFileNotFoundException.class)
public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) {
return ResponseEntity.notFound().build();
}
}

View File

@@ -3,16 +3,16 @@ package com.olympus.apollo.controllers;
import java.util.HashMap;
import java.util.List;
import com.olympus.apollo.dto.IngestionOutput;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import com.olympus.apollo.models.KSIngestionInfo;
import com.olympus.apollo.services.GitRepositoryIngestor;
import com.olympus.apollo.services.KSIngestor;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestParam;
@CrossOrigin(origins = "http://localhost:5173")
@RestController
public class TestController {
@@ -21,16 +21,21 @@ public class TestController {
@Autowired
GitRepositoryIngestor gitRepositoryIngestor;
@GetMapping("test/ingestion_loop")
public String testIngestionLoop() {
ksIngestor.ingestLoop();
return "Ingestion Loop Completed";
public IngestionOutput testIngestionLoop() {
return ksIngestor.ingestLoop();
}
@GetMapping("test/ingest_document/{id}")
public IngestionOutput ingestDocumentById(@PathVariable String id) {
return ksIngestor.ingestDocumentById(id);
}
@GetMapping("test/query_vector")
public List<String> testSimilaritySearch(@RequestParam String query) {
return ksIngestor.testSimilaritySearch(query,"documentation");
public List<String> testSimilaritySearch(@RequestParam String query, @RequestParam String type) {
return ksIngestor.testSimilaritySearch(query,type);
}
@GetMapping("test/delete")

View File

@@ -0,0 +1,20 @@
package com.olympus.apollo.dto;
import lombok.Getter;
import lombok.Setter;
import java.util.ArrayList;
import java.util.List;

/**
 * Result of an ingestion run (see KSIngestor.ingestLoop / ingestDocumentById):
 * an overall status, a human-readable message, and the ids of the documents
 * that were successfully ingested during the run.
 */
@Getter @Setter
public class IngestionOutput {
    // "OK" on success, "ERROR" on failure (set by KSIngestor); may be null
    // when only a message is set (e.g. "document not found").
    private String status;
    // Detail message; on ERROR this carries the exception message.
    private String message;
    // Ids of the KSDocuments ingested in this run. Initialized eagerly so
    // callers can always call getIngestedDocumentId().add(...) without a
    // null check (KSIngestor relies on this).
    private List<String> ingestedDocumentId;

    public IngestionOutput() {
        // Diamond operator instead of the redundant new ArrayList<String>().
        ingestedDocumentId = new ArrayList<>();
    }
}

View File

@@ -0,0 +1,21 @@
package com.olympus.apollo.models;
import lombok.Getter;
import lombok.Setter;
import java.util.Date;
/**
 * Form-backing object for the multipart upload endpoint: bound via
 * {@code @ModelAttribute} in KSFileController.handleFileUpload and used to
 * populate the new KSDocument and its KSIngestionInfo.
 */
@Getter @Setter
public class FileUploadDTO {
// Free-text description stored on the created KSDocument.
private String description;
// NOTE(review): not read by the visible controller code — confirm it is
// actually used before keeping it on the form.
private String ingestionStatus;
// Ingestion type copied to KSIngestionInfo.setType (e.g. "MD_DOCUMENT").
private String type;
// Written into the vector-store metadata under key "KsApplicationName".
private String ksApplicationName;
// Written into metadata under "KsDoctype" (and, in the visible controller,
// also under "KsDocSource" — presumably a copy/paste bug there; verify).
private String ksDocType;
// NOTE(review): never read by the visible controller (which reuses
// ksDocType for the "KsDocSource" metadata key) — confirm intent.
private String ksDocSource;
// Chunking parameters copied verbatim onto KSIngestionInfo.
private int defaultChunkSize;
private int minChunkSize;
private int maxNumberOfChunks;
private int minChunkSizeToEmbed;
// NOTE(review): not read by the visible controller; KSDocument's ingestion
// date is set server-side with `new Date()` — confirm this field is needed.
// Legacy java.util.Date kept to preserve the Lombok accessor signature.
private Date ingestionDate;
}

View File

@@ -23,7 +23,7 @@ public class KSDocument {
private String ingestionStatus;
private String ingestionMessage;
//private String ingestionMessage;
private Date ingestionDate;
private KSIngestionInfo ingestionInfo;

View File

@@ -4,7 +4,9 @@ import org.springframework.data.repository.CrudRepository;
import org.springframework.stereotype.Repository;
import com.olympus.apollo.models.KSIngestionInfo;
import org.springframework.web.bind.annotation.CrossOrigin;
@CrossOrigin
@Repository
public interface KSIngestionInfoRepository extends CrudRepository<KSIngestionInfo, String> {

View File

@@ -1,16 +1,13 @@
package com.olympus.apollo.services;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.*;
import org.codelibs.jhighlight.fastutil.Hash;
import com.olympus.apollo.dto.IngestionOutput;
import com.olympus.apollo.models.KSDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.tika.TikaDocumentReader;
import org.springframework.ai.transformer.KeywordMetadataEnricher;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.vectorstore.SearchRequest;
import org.springframework.ai.vectorstore.VectorStore;
@@ -35,43 +32,107 @@ public class KSIngestor {
@Autowired
private VectorStore vectorStore;
Logger logger = LoggerFactory.getLogger(KSIngestor.class);
public void deleteAll(String document_file_name) {
List<Document> docToDelete = vectorStore.similaritySearch(SearchRequest.defaults().withQuery("*")
.withSimilarityThreshold(0.0)
.withFilterExpression("'source'=='3-automated-test-framework---atf.md'"));
.withSimilarityThreshold(0.0)
.withFilterExpression("'source'=='3-automated-test-framework---atf.md'"));
logger.info("Number of documents to delete: " + docToDelete.size());
}
public void ingestLoop() {
public IngestionOutput ingestLoop() {
ksDocumentRepository.findAllByIngestionStatus("NEW").forEach(ksDocument -> {
// ingest the document
IngestionOutput ingestionLoopOutput = new IngestionOutput();
try {
ksDocumentRepository.findAllByIngestionStatus("NEW").forEach(ksDocument -> {
logger.info("Processing document: " + ksDocument.getFilePath());
// ingest the document
ksDocument.setIngestionStatus("IN PROGRESS");
ksDocumentRepository.save(ksDocument);
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
List<Document> docs = tikaDocumentReader.read();
logger.info("Ingested document: " + ksDocument.getFilePath());
logger.info("Number of documents: " + docs.size());
KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo();
TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(),
ingestionInfo.getMinChunkSize(),
ingestionInfo.getMinChunkSizeToEmbed(),
ingestionInfo.getMaxNumberOfChunks(),
true);
docs.forEach(doc -> {
List<Document> splitDocs = splitter.split(doc);
logger.info("Number of documents: " + splitDocs.size());
for (Document splitDoc : splitDocs) {
logger.info("Split before put document metadata: " + splitDoc.getMetadata());
splitDoc.getMetadata().putAll(getMetadata(ingestionInfo));
logger.info("Split after put document metadata: " + splitDoc.getMetadata());
}
embedDocuments(splitDocs, ingestionInfo);
});
ksDocument.setIngestionStatus("INGESTED");//we have to set to DONE
ksDocument.setIngestionDate(new Date());
ksDocumentRepository.save(ksDocument);
ingestionLoopOutput.getIngestedDocumentId().add(ksDocument.getId());
});
ingestionLoopOutput.setStatus("OK");
}catch (Exception e){
ingestionLoopOutput.setStatus("ERROR");
ingestionLoopOutput.setMessage(e.getMessage());
}
return ingestionLoopOutput;
}
public IngestionOutput ingestDocumentById(String id) {
IngestionOutput ingestionOutput= new IngestionOutput();
Optional<KSDocument> optionalDocument = ksDocumentRepository.findById(id);
if (optionalDocument.isPresent()) {
KSDocument ksDocument = optionalDocument.get();
if ("NEW".equals(ksDocument.getIngestionStatus())) {
return ingestDocument(ksDocument);
} else {
ingestionOutput.setMessage("OOPS: Document is already Injected");
return ingestionOutput;
}
} else {
ingestionOutput.setMessage("OOPS: Document Not found");
return ingestionOutput;
}
}
private IngestionOutput ingestDocument(KSDocument ksDocument) {
IngestionOutput ingestionLoopOutput = new IngestionOutput();
try {
ksDocument.setIngestionStatus("IN PROGRESS");
ksDocumentRepository.save(ksDocument);
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
List<Document> docs = tikaDocumentReader.read();
List<Document> docs = tikaDocumentReader.read();
logger.info("Ingested document: " + ksDocument.getFilePath());
logger.info("Number of documents: " + docs.size());
KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo();
TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(),
ingestionInfo.getMinChunkSize(),
ingestionInfo.getMinChunkSizeToEmbed(),
ingestionInfo.getMaxNumberOfChunks(),
true);
ingestionInfo.getMinChunkSize(),
ingestionInfo.getMinChunkSizeToEmbed(),
ingestionInfo.getMaxNumberOfChunks(),
true);
docs.forEach(doc -> {
List<Document> splitDocs = splitter.split(doc);
@@ -81,40 +142,51 @@ public class KSIngestor {
}
embedDocuments(splitDocs, ingestionInfo);
});
ksDocument.setIngestionStatus("NEW");
ksDocument.setIngestionStatus("INGESTED");
ksDocument.setIngestionDate(new Date());
ksDocumentRepository.save(ksDocument);
});
ksDocumentRepository.save(ksDocument);
ingestionLoopOutput.getIngestedDocumentId().add(ksDocument.getId());
ingestionLoopOutput.setStatus("OK");
ingestionLoopOutput.setMessage("OK");
}catch (Exception e){
ingestionLoopOutput.setStatus("ERROR");
ingestionLoopOutput.setMessage(e.getMessage());
}
return ingestionLoopOutput;
}
private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) {
logger.info("Embedding documents");
vectorStore.add(docs);
logger.info("Documents embedded");
logger.info("Embedding documents");
docs.forEach(doc -> logger.info("Document metadata: " + doc.getMetadata()));
try {
vectorStore.add(docs);
logger.info("Documents embedded");
} catch (Exception e) {
logger.error("Error embedding documents: ", e);
}
}
public List<String> testSimilaritySearch(String query,String filter_doc_type) {
List<Document> docs = vectorStore.similaritySearch(
SearchRequest.defaults()
.withQuery(query)
.withTopK(5).withSimilarityThreshold(0.8)
.withFilterExpression("'ks_document_type'=='"+filter_doc_type+"'"));
SearchRequest.defaults()
.withQuery(query)
.withTopK(5).withSimilarityThreshold(0.8)
.withFilterExpression("'KsDoctype'=='"+filter_doc_type+"'"));
List<String> result = new ArrayList<String>();
for (Document doc : docs) {
result.add(doc.getContent());
}
return result;
}
}
private HashMap<String, String> getMetadata(KSIngestionInfo ingestionInfo) {
return ingestionInfo.getMetadata();
@@ -130,4 +202,4 @@ public class KSIngestor {
}
}
}

View File

@@ -23,3 +23,7 @@ spring.ai.vectorstore.mongodb.initialize-schema=false
# API key if needed, e.g. OpenAI
# SECURITY: a real secret key was committed here — it is exposed in VCS history
# and MUST be rotated. Supply the key via environment variable instead.
spring.ai.openai.api-key=${OPENAI_API_KEY}
#size filter
spring.servlet.multipart.max-file-size=10MB
spring.servlet.multipart.max-request-size=10MB