ingestion api working fine
This commit is contained in:
@@ -4,18 +4,14 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.CrossOrigin;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
|
||||
import com.olympus.apollo.models.KSDocument;
|
||||
import com.olympus.apollo.repository.KSDocumentRepository;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/fe-api/ksdocuments")
|
||||
@CrossOrigin
|
||||
@CrossOrigin(origins = "http://localhost:5173")
|
||||
public class KsDocumentController {
|
||||
|
||||
@Autowired
|
||||
@@ -30,7 +26,7 @@ public class KsDocumentController {
|
||||
return result;
|
||||
}
|
||||
@GetMapping("/{id}")
|
||||
public KSDocument getDocument(@RequestParam String id) {
|
||||
public KSDocument getDocument(@PathVariable String id) {
|
||||
|
||||
KSDocument result = ksDocumentREpository.findById(id).get();
|
||||
|
||||
|
||||
@@ -3,16 +3,13 @@ package com.olympus.apollo.controllers;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Date;
|
||||
|
||||
import org.codelibs.jhighlight.fastutil.Hash;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.stereotype.Controller;
|
||||
import org.springframework.web.bind.annotation.ExceptionHandler;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import org.springframework.web.servlet.mvc.support.RedirectAttributes;
|
||||
|
||||
@@ -22,7 +19,8 @@ import com.olympus.apollo.repository.KSDocumentRepository;
|
||||
import com.olympus.apollo.repository.KSIngestionInfoRepository;
|
||||
import com.olympus.apollo.services.StorageFileNotFoundException;
|
||||
import com.olympus.apollo.services.StorageService;
|
||||
|
||||
import com.olympus.apollo.models.FileUploadDTO;
|
||||
@CrossOrigin
|
||||
@RestController
|
||||
public class KSFileController {
|
||||
@Autowired
|
||||
@@ -33,44 +31,48 @@ public class KSFileController {
|
||||
private KSIngestionInfoRepository ksIngestionInfoRepository;
|
||||
|
||||
|
||||
|
||||
|
||||
@PostMapping("/upload")
|
||||
public String handleFileUpload(@RequestParam("file") MultipartFile file) {
|
||||
public String handleFileUpload(
|
||||
@RequestParam("file") MultipartFile file,
|
||||
@ModelAttribute FileUploadDTO fileUploadDTO
|
||||
) {
|
||||
String filePath = storageService.store(file);
|
||||
|
||||
String filePath = storageService.store(file);
|
||||
KSDocument ksDocument = new KSDocument();
|
||||
ksDocument.setFilePath(filePath);
|
||||
ksDocument.setFileName(file.getOriginalFilename());
|
||||
ksDocument.setName(file.getOriginalFilename());
|
||||
ksDocument.setDescription("Uploaded file");
|
||||
ksDocument.setDescription(fileUploadDTO.getDescription());
|
||||
ksDocument.setIngestionStatus("NEW");
|
||||
|
||||
KSIngestionInfo ksIngestionInfo = new KSIngestionInfo();
|
||||
ksIngestionInfo.setType("MD_DOCUMENT"); //TODO: This should be dynamic
|
||||
Date now = new Date();
|
||||
ksDocument.setIngestionDate(now);
|
||||
|
||||
|
||||
KSIngestionInfo ksIngestionInfo = new KSIngestionInfo();
|
||||
ksIngestionInfo.setType(fileUploadDTO.getType()); // != null ? type : "MD_DOCUMENT"
|
||||
|
||||
HashMap<String, String> metadata = new HashMap<>();
|
||||
metadata.put("KsApplicationName", fileUploadDTO.getKsApplicationName());
|
||||
metadata.put("KsDoctype", fileUploadDTO.getKsDocType());
|
||||
metadata.put("KsDocSource", fileUploadDTO.getKsDocType());
|
||||
metadata.put("Source", file.getOriginalFilename());
|
||||
|
||||
HashMap<String, String> metadata = new HashMap<>();
|
||||
|
||||
metadata.put("KsApplicatioName","atf");
|
||||
metadata.put("KsDoctype","documentation");
|
||||
metadata.put("KsDoSource","wiki");
|
||||
metadata.put("Source",file.getOriginalFilename());
|
||||
|
||||
ksIngestionInfo.setMetadata(metadata);
|
||||
ksIngestionInfo.setDefaultChunkSize(1000);
|
||||
ksIngestionInfo.setMinChunkSize(200);
|
||||
ksIngestionInfo.setMaxNumberOfChunks(1000);
|
||||
ksIngestionInfo.setMinChunkSizeToEmbed(20);
|
||||
ksIngestionInfo.setDefaultChunkSize(fileUploadDTO.getDefaultChunkSize());
|
||||
ksIngestionInfo.setMinChunkSize(fileUploadDTO.getMinChunkSize());
|
||||
ksIngestionInfo.setMaxNumberOfChunks(fileUploadDTO.getMaxNumberOfChunks());
|
||||
ksIngestionInfo.setMinChunkSizeToEmbed(fileUploadDTO.getMinChunkSizeToEmbed());
|
||||
|
||||
ksIngestionInfoRepository.save(ksIngestionInfo);
|
||||
ksDocument.setIngestionInfo(ksIngestionInfo);
|
||||
ksDocumentREpository.save(ksDocument);
|
||||
|
||||
return "OK";
|
||||
}
|
||||
return "OK";
|
||||
}
|
||||
|
||||
@ExceptionHandler(StorageFileNotFoundException.class)
|
||||
public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) {
|
||||
return ResponseEntity.notFound().build();
|
||||
}
|
||||
}
|
||||
@ExceptionHandler(StorageFileNotFoundException.class)
|
||||
public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) {
|
||||
return ResponseEntity.notFound().build();
|
||||
}
|
||||
}
|
||||
@@ -3,16 +3,16 @@ package com.olympus.apollo.controllers;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import com.olympus.apollo.dto.IngestionOutput;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
|
||||
import com.olympus.apollo.models.KSIngestionInfo;
|
||||
import com.olympus.apollo.services.GitRepositoryIngestor;
|
||||
import com.olympus.apollo.services.KSIngestor;
|
||||
import org.springframework.web.bind.annotation.GetMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
|
||||
|
||||
@CrossOrigin(origins = "http://localhost:5173")
|
||||
@RestController
|
||||
public class TestController {
|
||||
|
||||
@@ -21,16 +21,21 @@ public class TestController {
|
||||
|
||||
@Autowired
|
||||
GitRepositoryIngestor gitRepositoryIngestor;
|
||||
|
||||
|
||||
@GetMapping("test/ingestion_loop")
|
||||
public String testIngestionLoop() {
|
||||
ksIngestor.ingestLoop();
|
||||
return "Ingestion Loop Completed";
|
||||
public IngestionOutput testIngestionLoop() {
|
||||
return ksIngestor.ingestLoop();
|
||||
}
|
||||
|
||||
@GetMapping("test/ingest_document/{id}")
|
||||
public IngestionOutput ingestDocumentById(@PathVariable String id) {
|
||||
return ksIngestor.ingestDocumentById(id);
|
||||
}
|
||||
|
||||
@GetMapping("test/query_vector")
|
||||
public List<String> testSimilaritySearch(@RequestParam String query) {
|
||||
return ksIngestor.testSimilaritySearch(query,"documentation");
|
||||
public List<String> testSimilaritySearch(@RequestParam String query, @RequestParam String type) {
|
||||
return ksIngestor.testSimilaritySearch(query,type);
|
||||
}
|
||||
|
||||
@GetMapping("test/delete")
|
||||
|
||||
20
src/main/java/com/olympus/apollo/dto/IngestionOutput.java
Normal file
20
src/main/java/com/olympus/apollo/dto/IngestionOutput.java
Normal file
@@ -0,0 +1,20 @@
|
||||
package com.olympus.apollo.dto;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Getter @Setter
|
||||
public class IngestionOutput {
|
||||
private String status;
|
||||
private String message;
|
||||
private List<String> ingestedDocumentId;
|
||||
|
||||
public IngestionOutput(){
|
||||
ingestedDocumentId=new ArrayList<String>();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
21
src/main/java/com/olympus/apollo/models/FileUploadDTO.java
Normal file
21
src/main/java/com/olympus/apollo/models/FileUploadDTO.java
Normal file
@@ -0,0 +1,21 @@
|
||||
package com.olympus.apollo.models;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
import java.util.Date;
|
||||
|
||||
@Getter @Setter
|
||||
public class FileUploadDTO {
|
||||
private String description;
|
||||
private String ingestionStatus;
|
||||
private String type;
|
||||
private String ksApplicationName;
|
||||
private String ksDocType;
|
||||
private String ksDocSource;
|
||||
private int defaultChunkSize;
|
||||
private int minChunkSize;
|
||||
private int maxNumberOfChunks;
|
||||
private int minChunkSizeToEmbed;
|
||||
private Date ingestionDate;
|
||||
}
|
||||
@@ -23,7 +23,7 @@ public class KSDocument {
|
||||
|
||||
|
||||
private String ingestionStatus;
|
||||
private String ingestionMessage;
|
||||
//private String ingestionMessage;
|
||||
private Date ingestionDate;
|
||||
|
||||
private KSIngestionInfo ingestionInfo;
|
||||
|
||||
@@ -4,7 +4,9 @@ import org.springframework.data.repository.CrudRepository;
|
||||
import org.springframework.stereotype.Repository;
|
||||
|
||||
import com.olympus.apollo.models.KSIngestionInfo;
|
||||
import org.springframework.web.bind.annotation.CrossOrigin;
|
||||
|
||||
@CrossOrigin
|
||||
@Repository
|
||||
public interface KSIngestionInfoRepository extends CrudRepository<KSIngestionInfo, String> {
|
||||
|
||||
|
||||
@@ -1,16 +1,13 @@
|
||||
package com.olympus.apollo.services;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
import org.codelibs.jhighlight.fastutil.Hash;
|
||||
import com.olympus.apollo.dto.IngestionOutput;
|
||||
import com.olympus.apollo.models.KSDocument;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
||||
import org.springframework.ai.transformer.KeywordMetadataEnricher;
|
||||
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
||||
import org.springframework.ai.vectorstore.SearchRequest;
|
||||
import org.springframework.ai.vectorstore.VectorStore;
|
||||
@@ -35,43 +32,107 @@ public class KSIngestor {
|
||||
|
||||
@Autowired
|
||||
private VectorStore vectorStore;
|
||||
|
||||
Logger logger = LoggerFactory.getLogger(KSIngestor.class);
|
||||
|
||||
public void deleteAll(String document_file_name) {
|
||||
List<Document> docToDelete = vectorStore.similaritySearch(SearchRequest.defaults().withQuery("*")
|
||||
.withSimilarityThreshold(0.0)
|
||||
.withFilterExpression("'source'=='3-automated-test-framework---atf.md'"));
|
||||
.withSimilarityThreshold(0.0)
|
||||
.withFilterExpression("'source'=='3-automated-test-framework---atf.md'"));
|
||||
|
||||
logger.info("Number of documents to delete: " + docToDelete.size());
|
||||
}
|
||||
|
||||
public void ingestLoop() {
|
||||
public IngestionOutput ingestLoop() {
|
||||
|
||||
|
||||
ksDocumentRepository.findAllByIngestionStatus("NEW").forEach(ksDocument -> {
|
||||
// ingest the document
|
||||
IngestionOutput ingestionLoopOutput = new IngestionOutput();
|
||||
try {
|
||||
ksDocumentRepository.findAllByIngestionStatus("NEW").forEach(ksDocument -> {
|
||||
logger.info("Processing document: " + ksDocument.getFilePath());
|
||||
// ingest the document
|
||||
ksDocument.setIngestionStatus("IN PROGRESS");
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
|
||||
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
|
||||
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
|
||||
|
||||
List<Document> docs = tikaDocumentReader.read();
|
||||
logger.info("Ingested document: " + ksDocument.getFilePath());
|
||||
logger.info("Number of documents: " + docs.size());
|
||||
|
||||
KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo();
|
||||
|
||||
|
||||
TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(),
|
||||
ingestionInfo.getMinChunkSize(),
|
||||
ingestionInfo.getMinChunkSizeToEmbed(),
|
||||
ingestionInfo.getMaxNumberOfChunks(),
|
||||
true);
|
||||
|
||||
|
||||
docs.forEach(doc -> {
|
||||
List<Document> splitDocs = splitter.split(doc);
|
||||
|
||||
logger.info("Number of documents: " + splitDocs.size());
|
||||
for (Document splitDoc : splitDocs) {
|
||||
logger.info("Split before put document metadata: " + splitDoc.getMetadata());
|
||||
splitDoc.getMetadata().putAll(getMetadata(ingestionInfo));
|
||||
logger.info("Split after put document metadata: " + splitDoc.getMetadata());
|
||||
}
|
||||
embedDocuments(splitDocs, ingestionInfo);
|
||||
});
|
||||
ksDocument.setIngestionStatus("INGESTED");//we have to set to DONE
|
||||
ksDocument.setIngestionDate(new Date());
|
||||
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
|
||||
ingestionLoopOutput.getIngestedDocumentId().add(ksDocument.getId());
|
||||
});
|
||||
ingestionLoopOutput.setStatus("OK");
|
||||
}catch (Exception e){
|
||||
ingestionLoopOutput.setStatus("ERROR");
|
||||
ingestionLoopOutput.setMessage(e.getMessage());
|
||||
}
|
||||
return ingestionLoopOutput;
|
||||
}
|
||||
|
||||
public IngestionOutput ingestDocumentById(String id) {
|
||||
IngestionOutput ingestionOutput= new IngestionOutput();
|
||||
Optional<KSDocument> optionalDocument = ksDocumentRepository.findById(id);
|
||||
if (optionalDocument.isPresent()) {
|
||||
KSDocument ksDocument = optionalDocument.get();
|
||||
if ("NEW".equals(ksDocument.getIngestionStatus())) {
|
||||
return ingestDocument(ksDocument);
|
||||
} else {
|
||||
ingestionOutput.setMessage("OOPS: Document is already Injected");
|
||||
return ingestionOutput;
|
||||
}
|
||||
} else {
|
||||
ingestionOutput.setMessage("OOPS: Document Not found");
|
||||
return ingestionOutput;
|
||||
}
|
||||
}
|
||||
|
||||
private IngestionOutput ingestDocument(KSDocument ksDocument) {
|
||||
IngestionOutput ingestionLoopOutput = new IngestionOutput();
|
||||
try {
|
||||
ksDocument.setIngestionStatus("IN PROGRESS");
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
|
||||
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
|
||||
Resource file = storageService.loadAsResource(ksDocument.getFilePath());
|
||||
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(file);
|
||||
|
||||
List<Document> docs = tikaDocumentReader.read();
|
||||
|
||||
List<Document> docs = tikaDocumentReader.read();
|
||||
logger.info("Ingested document: " + ksDocument.getFilePath());
|
||||
logger.info("Number of documents: " + docs.size());
|
||||
|
||||
KSIngestionInfo ingestionInfo = ksDocument.getIngestionInfo();
|
||||
|
||||
|
||||
TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(),
|
||||
ingestionInfo.getMinChunkSize(),
|
||||
ingestionInfo.getMinChunkSizeToEmbed(),
|
||||
ingestionInfo.getMaxNumberOfChunks(),
|
||||
true);
|
||||
|
||||
|
||||
ingestionInfo.getMinChunkSize(),
|
||||
ingestionInfo.getMinChunkSizeToEmbed(),
|
||||
ingestionInfo.getMaxNumberOfChunks(),
|
||||
true);
|
||||
|
||||
docs.forEach(doc -> {
|
||||
List<Document> splitDocs = splitter.split(doc);
|
||||
|
||||
@@ -81,40 +142,51 @@ public class KSIngestor {
|
||||
}
|
||||
embedDocuments(splitDocs, ingestionInfo);
|
||||
});
|
||||
ksDocument.setIngestionStatus("NEW");
|
||||
ksDocument.setIngestionStatus("INGESTED");
|
||||
ksDocument.setIngestionDate(new Date());
|
||||
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
|
||||
});
|
||||
|
||||
|
||||
|
||||
ksDocumentRepository.save(ksDocument);
|
||||
|
||||
ingestionLoopOutput.getIngestedDocumentId().add(ksDocument.getId());
|
||||
ingestionLoopOutput.setStatus("OK");
|
||||
ingestionLoopOutput.setMessage("OK");
|
||||
}catch (Exception e){
|
||||
ingestionLoopOutput.setStatus("ERROR");
|
||||
ingestionLoopOutput.setMessage(e.getMessage());
|
||||
}
|
||||
return ingestionLoopOutput;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) {
|
||||
|
||||
logger.info("Embedding documents");
|
||||
|
||||
vectorStore.add(docs);
|
||||
logger.info("Documents embedded");
|
||||
|
||||
logger.info("Embedding documents");
|
||||
|
||||
docs.forEach(doc -> logger.info("Document metadata: " + doc.getMetadata()));
|
||||
try {
|
||||
vectorStore.add(docs);
|
||||
logger.info("Documents embedded");
|
||||
} catch (Exception e) {
|
||||
logger.error("Error embedding documents: ", e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<String> testSimilaritySearch(String query,String filter_doc_type) {
|
||||
List<Document> docs = vectorStore.similaritySearch(
|
||||
SearchRequest.defaults()
|
||||
.withQuery(query)
|
||||
.withTopK(5).withSimilarityThreshold(0.8)
|
||||
.withFilterExpression("'ks_document_type'=='"+filter_doc_type+"'"));
|
||||
SearchRequest.defaults()
|
||||
.withQuery(query)
|
||||
.withTopK(5).withSimilarityThreshold(0.8)
|
||||
.withFilterExpression("'KsDoctype'=='"+filter_doc_type+"'"));
|
||||
|
||||
List<String> result = new ArrayList<String>();
|
||||
for (Document doc : docs) {
|
||||
result.add(doc.getContent());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
private HashMap<String, String> getMetadata(KSIngestionInfo ingestionInfo) {
|
||||
|
||||
return ingestionInfo.getMetadata();
|
||||
@@ -130,4 +202,4 @@ public class KSIngestor {
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
@@ -23,3 +23,7 @@ spring.ai.vectorstore.mongodb.initialize-schema=false
|
||||
# API key if needed, e.g. OpenAI
|
||||
spring.ai.openai.api-key=sk-proj-j3TFJ0h348DIzMrYYfyUT3BlbkFJjk4HMc8A2ux2Asg8Y7H1
|
||||
|
||||
#size filter
|
||||
spring.servlet.multipart.max-file-size=10MB
|
||||
spring.servlet.multipart.max-request-size=10MB
|
||||
|
||||
|
||||
Reference in New Issue
Block a user