From 962d135bdc8f03f21b1c4bee03d1ceacdbc4304c Mon Sep 17 00:00:00 2001
From: sumedh
Date: Fri, 4 Oct 2024 18:08:27 +0530
Subject: [PATCH] text ingestion process added

---
Review notes (text between "---" and the diffstat is ignored by git-am):
 - This copy of the patch had lost its line breaks, indentation and generic
   type arguments. The layout and type arguments below are reconstructed from
   usage, and blank context lines are inferred from the hunk line counts;
   verify them against the tree before applying. The trailing context of the
   last hunk is cut off in this copy. Index hashes are those of the original
   submission.
 - ingestText: a failed embedding is no longer swallowed, the record is marked
   ERROR instead of staying IN PROGRESS or being reported as INGESTED, and
   absent metadata maps no longer cause a NullPointerException.
 - The imports of KSGitInfo, KSDocument, java.util.Date (in the DTO) and
   org.codelibs.jhighlight.fastutil.Hash look unused in the visible hunks;
   they are kept here, confirm and drop them separately.

 .../apollo/controllers/KSFileController.java  |  82 ++++++++++++
 .../apollo/dto/ExternalFileIngestionDTO.java  |  27 ++++
 .../com/olympus/apollo/models/KSTexts.java    |  41 ++++++
 .../apollo/repository/KSTextsRepository.java  |  21 +++
 .../olympus/apollo/services/KSIngestor.java   | 119 +++++++++++++++++
 5 files changed, 290 insertions(+)
 create mode 100644 src/main/java/com/olympus/apollo/dto/ExternalFileIngestionDTO.java
 create mode 100644 src/main/java/com/olympus/apollo/models/KSTexts.java
 create mode 100644 src/main/java/com/olympus/apollo/repository/KSTextsRepository.java

diff --git a/src/main/java/com/olympus/apollo/controllers/KSFileController.java b/src/main/java/com/olympus/apollo/controllers/KSFileController.java
index a4b46bd..8e0704d 100644
--- a/src/main/java/com/olympus/apollo/controllers/KSFileController.java
+++ b/src/main/java/com/olympus/apollo/controllers/KSFileController.java
@@ -3,7 +3,14 @@ package com.olympus.apollo.controllers;
 import java.util.HashMap;
 import java.util.Date;
 import java.text.SimpleDateFormat;
+import java.util.List;
 
+import com.olympus.apollo.dto.ExternalFileIngestionDTO;
+import com.olympus.apollo.dto.IngestionOutput;
+import com.olympus.apollo.models.KSGitInfo;
+import com.olympus.apollo.models.KSTexts;
+import com.olympus.apollo.repository.KSTextsRepository;
+import com.olympus.apollo.services.KSIngestor;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.*;
@@ -26,6 +33,11 @@ public class KSFileController {
 
     @Autowired
     private KSIngestionInfoRepository ksIngestionInfoRepository;
+    @Autowired
+    private KSTextsRepository ksTextsRepository;
+    @Autowired
+    private KSIngestor ksIngestor;
+
 
     @PostMapping("/upload")
     public String handleFileUpload(
@@ -72,4 +84,74 @@ public class KSFileController {
     public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) {
         return ResponseEntity.notFound().build();
     }
+
+    /**
+     * Stores a raw text together with its chunking parameters and metadata so that it can be
+     * embedded later through {@code /ingest_texts/{id}}.
+     *
+     * @param externalFileIngestionDTO text, descriptive fields, chunking parameters and metadata
+     * @return the literal string {@code "OK"} once the record has been saved
+     */
+    @PostMapping("/externalingestion")
+    public String handleExternalIngestion(
+            @RequestBody ExternalFileIngestionDTO externalFileIngestionDTO
+    ) {
+        // One timestamp for both fields, so the date and its formatted form cannot disagree.
+        Date now = new Date();
+
+        KSTexts ksTexts = new KSTexts();
+        ksTexts.setTextToEmbed(externalFileIngestionDTO.getTextToEmbed());
+        ksTexts.setName(externalFileIngestionDTO.getName());
+        ksTexts.setDescription(externalFileIngestionDTO.getDescription());
+        ksTexts.setType(externalFileIngestionDTO.getType());
+        ksTexts.setIngestionStatus("NEW");
+        ksTexts.setIngestionDate(now);
+        ksTexts.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(now));
+
+        HashMap<String, String> metadata = new HashMap<>();
+        metadata.put("KsApplicationName", externalFileIngestionDTO.getKsApplicationName());
+        metadata.put("KsDoctype", externalFileIngestionDTO.getKsDocType());
+        metadata.put("KsDocSource", externalFileIngestionDTO.getKsDocSource());
+        metadata.put("KsFileSource", externalFileIngestionDTO.getName());
+
+        HashMap<String, Object> ingestionInfo = new HashMap<>();
+        ingestionInfo.put("type", externalFileIngestionDTO.getType());
+        ingestionInfo.put("metadata", metadata);
+        ksTexts.setIngestionInfo(ingestionInfo);
+
+        ksTexts.setDefaultChunkSize(externalFileIngestionDTO.getDefaultChunkSize());
+        ksTexts.setMinChunkSize(externalFileIngestionDTO.getMinChunkSize());
+        ksTexts.setMaxNumberOfChunks(externalFileIngestionDTO.getMaxNumberOfChunks());
+        ksTexts.setMinChunkSizeToEmbed(externalFileIngestionDTO.getMinChunkSizeToEmbed());
+        ksTexts.setAdditionalMetadata(externalFileIngestionDTO.getAdditionalMetaData());
+
+        ksTextsRepository.save(ksTexts);
+
+        return "OK";
+    }
+
+    /**
+     * Lists every stored text record.
+     *
+     * @return all {@link KSTexts} records returned by the repository
+     */
+    @GetMapping("/texts")
+    public List<KSTexts> listTextsInfo() {
+        // MongoRepository#findAll() already returns a List, so no cast is needed.
+        return ksTextsRepository.findAll();
+    }
+
+    /**
+     * Triggers ingestion of the stored text with the given id.
+     *
+     * <p>NOTE(review): this GET changes state; POST would be the conventional verb. The mapping
+     * is left unchanged because its callers are not visible in this patch.
+     *
+     * @param id identifier of the {@link KSTexts} record
+     * @return the outcome reported by {@link KSIngestor#ingestTextById(String)}
+     */
+    @GetMapping("/ingest_texts/{id}")
+    public IngestionOutput ingestDocumentById(@PathVariable String id) {
+        return ksIngestor.ingestTextById(id);
+    }
 }
\ No newline at end of file
diff --git a/src/main/java/com/olympus/apollo/dto/ExternalFileIngestionDTO.java b/src/main/java/com/olympus/apollo/dto/ExternalFileIngestionDTO.java
new file mode 100644
index 0000000..d5e85dd
--- /dev/null
+++ b/src/main/java/com/olympus/apollo/dto/ExternalFileIngestionDTO.java
@@ -0,0 +1,27 @@
+package com.olympus.apollo.dto;
+
+import lombok.Getter;
+import lombok.Setter;
+
+import java.util.Date;
+import java.util.HashMap;
+
+/**
+ * Request body for the external text ingestion endpoint: the text to embed plus the descriptive,
+ * chunking and metadata values that are copied onto the stored record.
+ */
+@Getter @Setter
+public class ExternalFileIngestionDTO {
+    private String textToEmbed;
+    private String name;
+    private String description;
+    private String type;
+    private String ksApplicationName;
+    private String ksDocType;
+    private String ksDocSource;
+    private int defaultChunkSize;
+    private int minChunkSize;
+    private int maxNumberOfChunks;
+    private int minChunkSizeToEmbed;
+    private HashMap<String, Object> additionalMetaData;
+}
diff --git a/src/main/java/com/olympus/apollo/models/KSTexts.java b/src/main/java/com/olympus/apollo/models/KSTexts.java
new file mode 100644
index 0000000..0e0813a
--- /dev/null
+++ b/src/main/java/com/olympus/apollo/models/KSTexts.java
@@ -0,0 +1,41 @@
+package com.olympus.apollo.models;
+
+import lombok.Getter;
+import lombok.Setter;
+import org.springframework.data.annotation.Id;
+import org.springframework.data.mongodb.core.mapping.Document;
+
+import java.util.Date;
+import java.util.HashMap;
+
+/**
+ * A raw text stored in the {@code ksinternal} collection together with its ingestion status,
+ * chunking parameters and the metadata attached to every embedded chunk.
+ */
+@Document(collection = "ksinternal")
+@Getter @Setter
+public class KSTexts {
+    private @Id String id;
+    private String textToEmbed;
+    private String name;
+    private String description;
+
+    // Written by this patch as "NEW", "IN PROGRESS", "INGESTED" or "ERROR".
+    private String ingestionStatus;
+    private Date ingestionDate;
+    // ingestionDate formatted as MM/dd/yy.
+    private String ingestionDateFormat;
+
+    private String type;
+    // NOTE(review): non-standard capitalisation kept on purpose; renaming would presumably
+    // change the persisted field name.
+    private HashMap<String, Object> IngestionInfo;
+    private HashMap<String, String> metadata;
+
+    // Arguments passed to TokenTextSplitter when the text is chunked.
+    private int minChunkSizeToEmbed;
+    private int maxNumberOfChunks;
+    private int minChunkSize;
+    private int defaultChunkSize;
+    private HashMap<String, Object> additionalMetadata;
+}
diff --git a/src/main/java/com/olympus/apollo/repository/KSTextsRepository.java b/src/main/java/com/olympus/apollo/repository/KSTextsRepository.java
new file mode 100644
index 0000000..d36e52b
--- /dev/null
+++ b/src/main/java/com/olympus/apollo/repository/KSTextsRepository.java
@@ -0,0 +1,21 @@
+package com.olympus.apollo.repository;
+
+import com.olympus.apollo.models.KSDocument;
+import com.olympus.apollo.models.KSTexts;
+import org.springframework.data.mongodb.repository.MongoRepository;
+import org.springframework.data.rest.core.annotation.RepositoryRestResource;
+import org.springframework.web.bind.annotation.CrossOrigin;
+
+/** Repository for {@link KSTexts} records, also exported under the {@code ksinternal} path. */
+@RepositoryRestResource(collectionResourceRel = "ksinternal", path = "ksinternal")
+@CrossOrigin
+public interface KSTextsRepository extends MongoRepository<KSTexts, String> {
+
+    /**
+     * Finds the texts whose ingestion status equals the given value.
+     *
+     * @param status ingestion status to match, for example {@code "NEW"}
+     * @return the matching texts
+     */
+    Iterable<KSTexts> findAllByIngestionStatus(String status);
+}
diff --git a/src/main/java/com/olympus/apollo/services/KSIngestor.java b/src/main/java/com/olympus/apollo/services/KSIngestor.java
index a692e4b..993889a 100644
--- a/src/main/java/com/olympus/apollo/services/KSIngestor.java
+++ b/src/main/java/com/olympus/apollo/services/KSIngestor.java
@@ -5,6 +5,9 @@ import java.text.SimpleDateFormat;
 
 import com.olympus.apollo.dto.IngestionOutput;
 import com.olympus.apollo.models.KSDocument;
+import com.olympus.apollo.models.KSTexts;
+import com.olympus.apollo.repository.KSTextsRepository;
+import org.codelibs.jhighlight.fastutil.Hash;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.ai.document.Document;
@@ -27,6 +30,8 @@ public class KSIngestor {
     @Autowired
     private KSDocumentRepository ksDocumentRepository;
     @Autowired
+    private KSTextsRepository ksTextsRepository;
+    @Autowired
     private KSIngestionInfoRepository ksIngestionInfoRepository;
     @Autowired
     private FileSystemStorageService storageService;
@@ -159,6 +164,120 @@ public class KSIngestor {
         return ingestionLoopOutput;
     }
 
+    /**
+     * Ingests the stored text with the given id, provided its status is still {@code "NEW"}.
+     *
+     * @param id identifier of the {@link KSTexts} record
+     * @return the ingestion outcome; only the message is set when the record is missing or its
+     *         status is not {@code "NEW"}
+     */
+    public IngestionOutput ingestTextById(String id) {
+        IngestionOutput ingestionOutput = new IngestionOutput();
+        Optional<KSTexts> optionalText = ksTextsRepository.findById(id);
+        if (!optionalText.isPresent()) {
+            ingestionOutput.setMessage("OOPS: TEXT Not found");
+            return ingestionOutput;
+        }
+
+        KSTexts ksTexts = optionalText.get();
+        if (!"NEW".equals(ksTexts.getIngestionStatus())) {
+            ingestionOutput.setMessage("OOPS: TEXT is already Injected");
+            return ingestionOutput;
+        }
+        return ingestText(ksTexts);
+    }
+
+    /**
+     * Splits the text into chunks, attaches the record's metadata to every chunk and adds the
+     * chunks to the vector store, tracking progress in the record's ingestion status.
+     *
+     * @param ksTexts record to ingest
+     * @return status {@code "OK"} with the record id on success, otherwise status {@code "ERROR"}
+     *         with the exception message
+     */
+    private IngestionOutput ingestText(KSTexts ksTexts) {
+        IngestionOutput ingestionLoopOutput = new IngestionOutput();
+        try {
+            ksTexts.setIngestionStatus("IN PROGRESS");
+            ksTextsRepository.save(ksTexts);
+
+            Document textDocument = new Document(ksTexts.getTextToEmbed());
+            logger.info("Ingested Text: {}", ksTexts.getName());
+
+            TokenTextSplitter splitter = new TokenTextSplitter(
+                    ksTexts.getDefaultChunkSize(),
+                    ksTexts.getMinChunkSize(),
+                    ksTexts.getMinChunkSizeToEmbed(),
+                    ksTexts.getMaxNumberOfChunks(),
+                    true);
+            List<Document> splitDocs = splitter.split(textDocument);
+            logger.info("Number of documents: {}", splitDocs.size());
+
+            HashMap<String, Object> chunkMetadata = collectChunkMetadata(ksTexts);
+            for (Document splitDoc : splitDocs) {
+                splitDoc.getMetadata().putAll(chunkMetadata);
+            }
+            // Throws on failure, so the record is never marked INGESTED without its embeddings.
+            embedtexts(splitDocs);
+
+            Date now = new Date();
+            ksTexts.setIngestionStatus("INGESTED");
+            ksTexts.setIngestionDate(now);
+            ksTexts.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(now));
+            ksTextsRepository.save(ksTexts);
+
+            ingestionLoopOutput.getIngestedDocumentId().add(ksTexts.getId());
+            ingestionLoopOutput.setStatus("OK");
+            ingestionLoopOutput.setMessage("OK");
+        } catch (Exception e) {
+            logger.error("Error ingesting text {}", ksTexts.getId(), e);
+            markIngestionFailed(ksTexts);
+            ingestionLoopOutput.setStatus("ERROR");
+            ingestionLoopOutput.setMessage(e.getMessage());
+        }
+        return ingestionLoopOutput;
+    }
+
+    /** Merges the record's base metadata with its additional metadata, skipping absent maps. */
+    private HashMap<String, Object> collectChunkMetadata(KSTexts ksTexts) {
+        HashMap<String, Object> merged = new HashMap<>();
+        HashMap<String, Object> ingestionInfo = ksTexts.getIngestionInfo();
+        if (ingestionInfo != null && ingestionInfo.get("metadata") != null) {
+            // The controller stores a String-keyed map under "metadata"; the cast is unchecked.
+            @SuppressWarnings("unchecked")
+            HashMap<String, Object> base = (HashMap<String, Object>) ingestionInfo.get("metadata");
+            merged.putAll(base);
+        }
+        if (ksTexts.getAdditionalMetadata() != null) {
+            merged.putAll(ksTexts.getAdditionalMetadata());
+        }
+        return merged;
+    }
+
+    /** Records the failure on the stored text so it is not left in the IN PROGRESS state. */
+    private void markIngestionFailed(KSTexts ksTexts) {
+        try {
+            ksTexts.setIngestionStatus("ERROR");
+            ksTextsRepository.save(ksTexts);
+        } catch (Exception saveError) {
+            logger.error("Could not persist ERROR status for text {}", ksTexts.getId(), saveError);
+        }
+    }
+
+    /**
+     * Adds the chunks to the vector store.
+     *
+     * <p>Failures propagate to the caller instead of being swallowed here.
+     *
+     * @param docs chunks to embed, with their metadata already attached
+     */
+    private void embedtexts(List<Document> docs) {
+        logger.info("Embedding texts");
+        docs.forEach(doc -> logger.info("text metadata: {}", doc.getMetadata()));
+        vectorStore.add(docs);
+        logger.info("Texts embedded");
+    }
+
     private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) {