text ingestion process added

This commit is contained in:
sumedh
2024-10-04 18:08:27 +05:30
parent 0b65bcf860
commit 962d135bdc
5 changed files with 218 additions and 0 deletions

View File

@@ -3,7 +3,14 @@ package com.olympus.apollo.controllers;
import java.util.HashMap; import java.util.HashMap;
import java.util.Date; import java.util.Date;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.List;
import com.olympus.apollo.dto.ExternalFileIngestionDTO;
import com.olympus.apollo.dto.IngestionOutput;
import com.olympus.apollo.models.KSGitInfo;
import com.olympus.apollo.models.KSTexts;
import com.olympus.apollo.repository.KSTextsRepository;
import com.olympus.apollo.services.KSIngestor;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*; import org.springframework.web.bind.annotation.*;
@@ -26,6 +33,11 @@ public class KSFileController {
@Autowired @Autowired
private KSIngestionInfoRepository ksIngestionInfoRepository; private KSIngestionInfoRepository ksIngestionInfoRepository;
@Autowired
private KSTextsRepository ksTextsRepository;
@Autowired
private KSIngestor ksIngestor;
@PostMapping("/upload") @PostMapping("/upload")
public String handleFileUpload( public String handleFileUpload(
@@ -72,4 +84,53 @@ public class KSFileController {
public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) { public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) {
return ResponseEntity.notFound().build(); return ResponseEntity.notFound().build();
} }
/**
 * Stores a raw text payload as a new {@link KSTexts} record with status {@code "NEW"}.
 * Nothing is embedded here; ingestion is triggered separately.
 *
 * @param externalFileIngestionDTO request body carrying the text, its descriptive
 *                                 fields and the chunking parameters
 * @return the literal string {@code "OK"} once the record has been saved
 */
@PostMapping("/externalingestion")
public String handleExternalIngestion(
        @RequestBody ExternalFileIngestionDTO externalFileIngestionDTO
) {
    // Take the timestamp once so the stored date and its formatted form always agree.
    Date now = new Date();

    HashMap<String, String> metadata = new HashMap<>();
    metadata.put("KsApplicationName", externalFileIngestionDTO.getKsApplicationName());
    metadata.put("KsDoctype", externalFileIngestionDTO.getKsDocType());
    metadata.put("KsDocSource", externalFileIngestionDTO.getKsDocSource());
    metadata.put("KsFileSource", externalFileIngestionDTO.getName());

    HashMap<String, Object> ingestionInfo = new HashMap<>();
    ingestionInfo.put("type", externalFileIngestionDTO.getType());
    ingestionInfo.put("metadata", metadata);

    // A request that omits additionalMetaData must not persist null: the ingestion step
    // merges this map into every chunk's metadata and would fail on a null map.
    HashMap additionalMetadata = externalFileIngestionDTO.getAdditionalMetaData();
    if (additionalMetadata == null) {
        additionalMetadata = new HashMap<>();
    }

    KSTexts ksTexts = new KSTexts();
    ksTexts.setTextToEmbed(externalFileIngestionDTO.getTextToEmbed());
    ksTexts.setName(externalFileIngestionDTO.getName());
    ksTexts.setDescription(externalFileIngestionDTO.getDescription());
    ksTexts.setType(externalFileIngestionDTO.getType());
    ksTexts.setIngestionStatus("NEW");
    ksTexts.setIngestionDate(now);
    ksTexts.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(now));
    ksTexts.setIngestionInfo(ingestionInfo);
    ksTexts.setDefaultChunkSize(externalFileIngestionDTO.getDefaultChunkSize());
    ksTexts.setMinChunkSize(externalFileIngestionDTO.getMinChunkSize());
    ksTexts.setMaxNumberOfChunks(externalFileIngestionDTO.getMaxNumberOfChunks());
    ksTexts.setMinChunkSizeToEmbed(externalFileIngestionDTO.getMinChunkSizeToEmbed());
    ksTexts.setAdditionalMetadata(additionalMetadata);

    ksTextsRepository.save(ksTexts);
    return "OK";
}
/**
 * Lists every stored {@link KSTexts} record.
 *
 * @return all records returned by the repository's {@code findAll()}
 */
@GetMapping("/texts")
public List<KSTexts> listTextsInfo() {
    return ksTextsRepository.findAll();
}
/**
 * Triggers ingestion of the stored text identified by {@code id}.
 *
 * @param id identifier of the text record, taken from the request path
 * @return the outcome reported by {@code KSIngestor.ingestTextById}
 */
@GetMapping("/ingest_texts/{id}")
public IngestionOutput ingestDocumentById(@PathVariable String id) {
    IngestionOutput outcome = ksIngestor.ingestTextById(id);
    return outcome;
}
} }

View File

@@ -0,0 +1,23 @@
package com.olympus.apollo.dto;
import lombok.Getter;
import lombok.Setter;
import java.util.Date;
import java.util.HashMap;
/**
 * Request body of the "/externalingestion" endpoint: a piece of raw text plus the
 * descriptive fields and chunking parameters that are copied onto a KSTexts record.
 * Accessors are generated by Lombok.
 */
@Getter @Setter
public class ExternalFileIngestionDTO {
// The text that will later be split and embedded.
private String textToEmbed;
// Name of the text; also stored as the "KsFileSource" metadata entry.
private String name;
private String description;
// Copied both to the record's type and to the "type" entry of its ingestion info.
private String type;
// Stored as the "KsApplicationName" metadata entry.
private String ksApplicationName;
// Stored as the "KsDoctype" metadata entry.
private String ksDocType;
// Stored as the "KsDocSource" metadata entry.
private String ksDocSource;
// The four chunking parameters below are later handed to the TokenTextSplitter.
// NOTE(review): as primitive ints they are 0 when absent from the request — confirm
// whether 0 is an acceptable value for the splitter.
private int defaultChunkSize;
private int minChunkSize;
private int maxNumberOfChunks;
private int minChunkSizeToEmbed;
// Extra caller-supplied entries merged into every chunk's metadata at ingestion time.
// Raw type: key/value types are not constrained here.
private HashMap additionalMetaData;
}

View File

@@ -0,0 +1,34 @@
package com.olympus.apollo.models;
import lombok.Getter;
import lombok.Setter;
import org.springframework.data.annotation.Id;
import org.springframework.data.mongodb.core.mapping.Document;
import java.util.Date;
import java.util.HashMap;
/**
 * Persistent record of a raw text submitted for embedding, stored in the
 * "ksinternal" MongoDB collection. Accessors are generated by Lombok.
 */
@Document(collection = "ksinternal")
@Getter @Setter
public class KSTexts {
private @Id String id;
// The text that is split into chunks and embedded.
private String textToEmbed;
private String name;
private String description;
// Values written by the visible code: "NEW", "IN PROGRESS", "INGESTED".
private String ingestionStatus;
//private String ingestionMessage;
// Set when the record is created and again when ingestion completes.
private Date ingestionDate;
// ingestionDate rendered with the pattern "MM/dd/yy".
private String ingestionDateFormat;
//private KSIngestionInfo ingestionInfo;
private String type;
// Holds a "type" entry and a "metadata" entry (a String-to-String map).
// NOTE(review): the capitalised field name presumably becomes the persisted field
// name, so renaming it would affect already-stored data — confirm before changing.
private HashMap IngestionInfo;
// NOTE(review): the controller's setMetadata call is commented out, so this field
// looks unused by the visible code — confirm.
private HashMap<String,String> metadata;
// Chunking parameters passed to the TokenTextSplitter at ingestion time.
private int minChunkSizeToEmbed;
private int maxNumberOfChunks;
private int minChunkSize;
private int defaultChunkSize;
// Extra entries merged into every chunk's metadata; may override "metadata" entries
// because it is applied after them.
private HashMap additionalMetadata;
}

View File

@@ -0,0 +1,15 @@
package com.olympus.apollo.repository;
import com.olympus.apollo.models.KSDocument;
import com.olympus.apollo.models.KSTexts;
import org.springframework.data.mongodb.repository.MongoRepository;
import org.springframework.data.rest.core.annotation.RepositoryRestResource;
import org.springframework.web.bind.annotation.CrossOrigin;
/**
 * Spring Data MongoDB repository for {@link KSTexts} records, also exported as a REST
 * resource under the "ksinternal" path.
 */
@RepositoryRestResource(collectionResourceRel = "ksinternal", path = "ksinternal")
@CrossOrigin
public interface KSTextsRepository extends MongoRepository<KSTexts, String> {

    /**
     * Finds all text records whose ingestion status equals the given value.
     *
     * <p>The element type is {@link KSTexts}, the repository's own domain type. It was
     * previously declared as {@code KSDocument}, which does not match what this
     * repository stores.
     *
     * @param status ingestion status to match, e.g. {@code "NEW"}
     * @return the matching text records
     */
    public Iterable<KSTexts> findAllByIngestionStatus(String status);
}

View File

@@ -5,6 +5,9 @@ import java.text.SimpleDateFormat;
import com.olympus.apollo.dto.IngestionOutput; import com.olympus.apollo.dto.IngestionOutput;
import com.olympus.apollo.models.KSDocument; import com.olympus.apollo.models.KSDocument;
import com.olympus.apollo.models.KSTexts;
import com.olympus.apollo.repository.KSTextsRepository;
import org.codelibs.jhighlight.fastutil.Hash;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document; import org.springframework.ai.document.Document;
@@ -27,6 +30,8 @@ public class KSIngestor {
@Autowired @Autowired
private KSDocumentRepository ksDocumentRepository; private KSDocumentRepository ksDocumentRepository;
@Autowired @Autowired
private KSTextsRepository ksTextsRepository;
@Autowired
private KSIngestionInfoRepository ksIngestionInfoRepository; private KSIngestionInfoRepository ksIngestionInfoRepository;
@Autowired @Autowired
private FileSystemStorageService storageService; private FileSystemStorageService storageService;
@@ -159,6 +164,86 @@ public class KSIngestor {
return ingestionLoopOutput; return ingestionLoopOutput;
} }
/**
 * Ingests the stored text with the given id, provided it exists and is still in
 * status {@code "NEW"}.
 *
 * @param id identifier of the text record
 * @return the ingestion result, or an output carrying only an explanatory message
 *         when the record is missing or is not in status {@code "NEW"}
 */
public IngestionOutput ingestTextById(String id) {
    Optional<KSTexts> lookup = ksTextsRepository.findById(id);

    if (!lookup.isPresent()) {
        IngestionOutput notFound = new IngestionOutput();
        notFound.setMessage("OOPS: TEXT Not found");
        return notFound;
    }

    KSTexts storedText = lookup.get();
    if (!"NEW".equals(storedText.getIngestionStatus())) {
        IngestionOutput alreadyHandled = new IngestionOutput();
        alreadyHandled.setMessage("OOPS: TEXT is already Injected");
        return alreadyHandled;
    }

    return ingestText(storedText);
}
/**
 * Splits the stored text into chunks, tags every chunk with the record's metadata,
 * embeds the chunks and tracks progress in the record's ingestion status.
 *
 * <p>The status moves from {@code "IN PROGRESS"} to {@code "INGESTED"} on success. On
 * failure it is set to {@code "ERROR"} so the record is not left looking as if an
 * ingestion were still running.
 *
 * @param ksTexts the text record to ingest
 * @return an output with status {@code "OK"} and the record id on success, or status
 *         {@code "ERROR"} and the failure message otherwise
 */
private IngestionOutput ingestText(KSTexts ksTexts) {
    IngestionOutput ingestionLoopOutput = new IngestionOutput();
    try {
        ksTexts.setIngestionStatus("IN PROGRESS");
        ksTextsRepository.save(ksTexts);

        Document myDoc = new Document(ksTexts.getTextToEmbed());
        List<Document> docs = Collections.singletonList(myDoc);
        logger.info("Ingested Text: " + ksTexts.getName());
        logger.info("Number of Text: " + docs.size());

        TokenTextSplitter splitter = new TokenTextSplitter(ksTexts.getDefaultChunkSize(),
                ksTexts.getMinChunkSize(),
                ksTexts.getMinChunkSizeToEmbed(),
                ksTexts.getMaxNumberOfChunks(),
                true);

        // Build the per-chunk metadata once. Either source may be absent (e.g. the
        // request carried no additional metadata), so both are merged null-safely.
        // Additional metadata is applied last and therefore wins on key clashes.
        HashMap<String, Object> chunkMetadata = new HashMap<>();
        Object baseMetadata = ksTexts.getIngestionInfo() == null
                ? null
                : ksTexts.getIngestionInfo().get("metadata");
        if (baseMetadata instanceof java.util.Map<?, ?>) {
            ((java.util.Map<?, ?>) baseMetadata).forEach(
                    (key, value) -> chunkMetadata.put(String.valueOf(key), value));
        }
        if (ksTexts.getAdditionalMetadata() != null) {
            ((java.util.Map<?, ?>) ksTexts.getAdditionalMetadata()).forEach(
                    (key, value) -> chunkMetadata.put(String.valueOf(key), value));
        }

        for (Document doc : docs) {
            List<Document> splitDocs = splitter.split(doc);
            logger.info("Number of documents: " + splitDocs.size());
            for (Document splitDoc : splitDocs) {
                splitDoc.getMetadata().putAll(chunkMetadata);
            }
            embedtexts(splitDocs);
        }

        // One timestamp so the stored date and its formatted form always agree.
        Date ingestedAt = new Date();
        ksTexts.setIngestionStatus("INGESTED");
        ksTexts.setIngestionDate(ingestedAt);
        ksTexts.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(ingestedAt));
        ksTextsRepository.save(ksTexts);

        ingestionLoopOutput.getIngestedDocumentId().add(ksTexts.getId());
        ingestionLoopOutput.setStatus("OK");
        ingestionLoopOutput.setMessage("OK");
    } catch (Exception e) {
        logger.error("Error ingesting text {}", ksTexts.getId(), e);
        ingestionLoopOutput.setStatus("ERROR");
        // Some exceptions carry no message; fall back to the exception's string form.
        ingestionLoopOutput.setMessage(e.getMessage() != null ? e.getMessage() : e.toString());
        try {
            ksTexts.setIngestionStatus("ERROR");
            ksTextsRepository.save(ksTexts);
        } catch (Exception saveFailure) {
            // The original failure is already reported; only log the secondary one.
            logger.error("Could not record failed ingestion for text {}", ksTexts.getId(), saveFailure);
        }
    }
    return ingestionLoopOutput;
}
/**
 * Adds the given text chunks to the vector store, logging each chunk's metadata first.
 *
 * <p>A vector-store failure is logged and rethrown so that the caller does not go on
 * to report the text as ingested when nothing was stored.
 *
 * @param docs chunks to embed
 * @throws IllegalStateException if the vector store fails to add the chunks
 */
private void embedtexts(List<Document> docs) {
    logger.info("Embedding texts");
    docs.forEach(doc -> logger.info("text metadata: {}", doc.getMetadata()));
    try {
        vectorStore.add(docs);
        logger.info("Texts embedded");
    } catch (Exception e) {
        logger.error("Error embedding Texts: ", e);
        throw new IllegalStateException("Error embedding Texts: " + e.getMessage(), e);
    }
}
private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) { private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) {