text ingestion process added
This commit is contained in:
@@ -3,7 +3,14 @@ package com.olympus.apollo.controllers;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.olympus.apollo.dto.ExternalFileIngestionDTO;
|
||||||
|
import com.olympus.apollo.dto.IngestionOutput;
|
||||||
|
import com.olympus.apollo.models.KSGitInfo;
|
||||||
|
import com.olympus.apollo.models.KSTexts;
|
||||||
|
import com.olympus.apollo.repository.KSTextsRepository;
|
||||||
|
import com.olympus.apollo.services.KSIngestor;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.web.bind.annotation.*;
|
import org.springframework.web.bind.annotation.*;
|
||||||
@@ -26,6 +33,11 @@ public class KSFileController {
|
|||||||
@Autowired
|
@Autowired
|
||||||
private KSIngestionInfoRepository ksIngestionInfoRepository;
|
private KSIngestionInfoRepository ksIngestionInfoRepository;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private KSTextsRepository ksTextsRepository;
|
||||||
|
@Autowired
|
||||||
|
private KSIngestor ksIngestor;
|
||||||
|
|
||||||
|
|
||||||
@PostMapping("/upload")
|
@PostMapping("/upload")
|
||||||
public String handleFileUpload(
|
public String handleFileUpload(
|
||||||
@@ -72,4 +84,53 @@ public class KSFileController {
|
|||||||
public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) {
|
public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) {
|
||||||
return ResponseEntity.notFound().build();
|
return ResponseEntity.notFound().build();
|
||||||
}
|
}
|
||||||
|
//ingestion of text process
|
||||||
|
@PostMapping("/externalingestion")
|
||||||
|
public String handleExternalIngestion(
|
||||||
|
@RequestBody ExternalFileIngestionDTO externalFileIngestionDTO
|
||||||
|
) {
|
||||||
|
|
||||||
|
KSTexts ksTexts = new KSTexts();
|
||||||
|
ksTexts.setTextToEmbed(externalFileIngestionDTO.getTextToEmbed());
|
||||||
|
ksTexts.setName(externalFileIngestionDTO.getName());
|
||||||
|
ksTexts.setDescription(externalFileIngestionDTO.getDescription());
|
||||||
|
ksTexts.setIngestionStatus("NEW");
|
||||||
|
ksTexts.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(new Date()));
|
||||||
|
|
||||||
|
Date now = new Date();
|
||||||
|
ksTexts.setIngestionDate(now);
|
||||||
|
|
||||||
|
|
||||||
|
HashMap IngestionInfo = new HashMap<>();
|
||||||
|
ksTexts.setType(externalFileIngestionDTO.getType());
|
||||||
|
HashMap<String, String> metadata = new HashMap<>();
|
||||||
|
metadata.put("KsApplicationName", externalFileIngestionDTO.getKsApplicationName());
|
||||||
|
metadata.put("KsDoctype", externalFileIngestionDTO.getKsDocType());
|
||||||
|
metadata.put("KsDocSource", externalFileIngestionDTO.getKsDocSource());
|
||||||
|
metadata.put("KsFileSource", externalFileIngestionDTO.getName());
|
||||||
|
IngestionInfo.put("type",externalFileIngestionDTO.getType());
|
||||||
|
IngestionInfo.put("metadata",metadata);
|
||||||
|
//ksTexts.setMetadata(metadata);
|
||||||
|
ksTexts.setIngestionInfo(IngestionInfo);
|
||||||
|
ksTexts.setDefaultChunkSize(externalFileIngestionDTO.getDefaultChunkSize());
|
||||||
|
ksTexts.setMinChunkSize(externalFileIngestionDTO.getMinChunkSize());
|
||||||
|
ksTexts.setMaxNumberOfChunks(externalFileIngestionDTO.getMaxNumberOfChunks());
|
||||||
|
ksTexts.setMinChunkSizeToEmbed(externalFileIngestionDTO.getMinChunkSizeToEmbed());
|
||||||
|
ksTexts.setAdditionalMetadata(externalFileIngestionDTO.getAdditionalMetaData());
|
||||||
|
|
||||||
|
ksTextsRepository.save(ksTexts);
|
||||||
|
|
||||||
|
return "OK";
|
||||||
|
}
|
||||||
|
|
||||||
|
@GetMapping("/texts")
|
||||||
|
public List<KSTexts> listTextsInfo() {
|
||||||
|
List<KSTexts> result = (List<KSTexts>) ksTextsRepository.findAll();
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@GetMapping("/ingest_texts/{id}")
|
||||||
|
public IngestionOutput ingestDocumentById(@PathVariable String id) {
|
||||||
|
return ksIngestor.ingestTextById(id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
package com.olympus.apollo.dto;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
@Getter @Setter
|
||||||
|
public class ExternalFileIngestionDTO {
|
||||||
|
private String textToEmbed;
|
||||||
|
private String name;
|
||||||
|
private String description;
|
||||||
|
private String type;
|
||||||
|
private String ksApplicationName;
|
||||||
|
private String ksDocType;
|
||||||
|
private String ksDocSource;
|
||||||
|
private int defaultChunkSize;
|
||||||
|
private int minChunkSize;
|
||||||
|
private int maxNumberOfChunks;
|
||||||
|
private int minChunkSizeToEmbed;
|
||||||
|
private HashMap additionalMetaData;
|
||||||
|
}
|
||||||
34
src/main/java/com/olympus/apollo/models/KSTexts.java
Normal file
34
src/main/java/com/olympus/apollo/models/KSTexts.java
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
package com.olympus.apollo.models;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
import org.springframework.data.annotation.Id;
|
||||||
|
import org.springframework.data.mongodb.core.mapping.Document;
|
||||||
|
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
@Document(collection = "ksinternal")
|
||||||
|
@Getter @Setter
|
||||||
|
public class KSTexts {
|
||||||
|
private @Id String id;
|
||||||
|
private String textToEmbed;
|
||||||
|
private String name;
|
||||||
|
private String description;
|
||||||
|
|
||||||
|
private String ingestionStatus;
|
||||||
|
//private String ingestionMessage;
|
||||||
|
private Date ingestionDate;
|
||||||
|
private String ingestionDateFormat;
|
||||||
|
|
||||||
|
//private KSIngestionInfo ingestionInfo;
|
||||||
|
private String type;
|
||||||
|
private HashMap IngestionInfo;
|
||||||
|
private HashMap<String,String> metadata;
|
||||||
|
|
||||||
|
private int minChunkSizeToEmbed;
|
||||||
|
private int maxNumberOfChunks;
|
||||||
|
private int minChunkSize;
|
||||||
|
private int defaultChunkSize;
|
||||||
|
private HashMap additionalMetadata;
|
||||||
|
}
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
package com.olympus.apollo.repository;
|
||||||
|
|
||||||
|
|
||||||
|
import com.olympus.apollo.models.KSDocument;
|
||||||
|
import com.olympus.apollo.models.KSTexts;
|
||||||
|
import org.springframework.data.mongodb.repository.MongoRepository;
|
||||||
|
import org.springframework.data.rest.core.annotation.RepositoryRestResource;
|
||||||
|
import org.springframework.web.bind.annotation.CrossOrigin;
|
||||||
|
|
||||||
|
@RepositoryRestResource(collectionResourceRel = "ksinternal", path = "ksinternal")
|
||||||
|
@CrossOrigin
|
||||||
|
public interface KSTextsRepository extends MongoRepository<KSTexts, String> {
|
||||||
|
|
||||||
|
public Iterable<KSDocument> findAllByIngestionStatus(String status);
|
||||||
|
}
|
||||||
@@ -5,6 +5,9 @@ import java.text.SimpleDateFormat;
|
|||||||
|
|
||||||
import com.olympus.apollo.dto.IngestionOutput;
|
import com.olympus.apollo.dto.IngestionOutput;
|
||||||
import com.olympus.apollo.models.KSDocument;
|
import com.olympus.apollo.models.KSDocument;
|
||||||
|
import com.olympus.apollo.models.KSTexts;
|
||||||
|
import com.olympus.apollo.repository.KSTextsRepository;
|
||||||
|
import org.codelibs.jhighlight.fastutil.Hash;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.ai.document.Document;
|
import org.springframework.ai.document.Document;
|
||||||
@@ -27,6 +30,8 @@ public class KSIngestor {
|
|||||||
@Autowired
|
@Autowired
|
||||||
private KSDocumentRepository ksDocumentRepository;
|
private KSDocumentRepository ksDocumentRepository;
|
||||||
@Autowired
|
@Autowired
|
||||||
|
private KSTextsRepository ksTextsRepository;
|
||||||
|
@Autowired
|
||||||
private KSIngestionInfoRepository ksIngestionInfoRepository;
|
private KSIngestionInfoRepository ksIngestionInfoRepository;
|
||||||
@Autowired
|
@Autowired
|
||||||
private FileSystemStorageService storageService;
|
private FileSystemStorageService storageService;
|
||||||
@@ -159,6 +164,86 @@ public class KSIngestor {
|
|||||||
return ingestionLoopOutput;
|
return ingestionLoopOutput;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public IngestionOutput ingestTextById(String id) {
|
||||||
|
IngestionOutput ingestionOutput= new IngestionOutput();
|
||||||
|
Optional<KSTexts> optionalDocument = ksTextsRepository.findById(id);
|
||||||
|
if (optionalDocument.isPresent()) {
|
||||||
|
KSTexts ksTexts = optionalDocument.get();
|
||||||
|
if ("NEW".equals(ksTexts.getIngestionStatus())) {
|
||||||
|
return ingestText(ksTexts);
|
||||||
|
} else {
|
||||||
|
ingestionOutput.setMessage("OOPS: TEXT is already Injected");
|
||||||
|
return ingestionOutput;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
ingestionOutput.setMessage("OOPS: TEXT Not found");
|
||||||
|
return ingestionOutput;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private IngestionOutput ingestText(KSTexts ksTexts) {
|
||||||
|
IngestionOutput ingestionLoopOutput = new IngestionOutput();
|
||||||
|
try {
|
||||||
|
ksTexts.setIngestionStatus("IN PROGRESS");
|
||||||
|
ksTextsRepository.save(ksTexts);
|
||||||
|
|
||||||
|
|
||||||
|
//TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(new Document(ksTexts.getTextToEmbed()));
|
||||||
|
Document myDoc = new Document(ksTexts.getTextToEmbed());
|
||||||
|
|
||||||
|
|
||||||
|
List<Document> docs = Collections.singletonList(myDoc);;//tikaDocumentReader.read();
|
||||||
|
logger.info("Ingested Text: " + ksTexts.getName());
|
||||||
|
logger.info("Number of Text: " + docs.size());
|
||||||
|
|
||||||
|
TokenTextSplitter splitter = new TokenTextSplitter(ksTexts.getDefaultChunkSize(),
|
||||||
|
ksTexts.getMinChunkSize(),
|
||||||
|
ksTexts.getMinChunkSizeToEmbed(),
|
||||||
|
ksTexts.getMaxNumberOfChunks(),
|
||||||
|
true);
|
||||||
|
|
||||||
|
docs.forEach(doc -> {
|
||||||
|
List<Document> splitDocs = splitter.split(doc);
|
||||||
|
|
||||||
|
logger.info("Number of documents: " + splitDocs.size());
|
||||||
|
HashMap meta=(HashMap) ksTexts.getIngestionInfo().get("metadata");
|
||||||
|
HashMap meta1= ksTexts.getAdditionalMetadata();
|
||||||
|
HashMap meta2 = new HashMap();
|
||||||
|
meta2.putAll(meta);
|
||||||
|
meta2.putAll(meta1);
|
||||||
|
for (Document splitDoc : splitDocs) {
|
||||||
|
splitDoc.getMetadata().putAll(meta2);
|
||||||
|
}
|
||||||
|
embedtexts(splitDocs);
|
||||||
|
});
|
||||||
|
ksTexts.setIngestionStatus("INGESTED");
|
||||||
|
ksTexts.setIngestionDate(new Date());
|
||||||
|
ksTexts.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(new Date()));
|
||||||
|
ksTextsRepository.save(ksTexts);
|
||||||
|
|
||||||
|
ingestionLoopOutput.getIngestedDocumentId().add(ksTexts.getId());
|
||||||
|
ingestionLoopOutput.setStatus("OK");
|
||||||
|
ingestionLoopOutput.setMessage("OK");
|
||||||
|
}catch (Exception e){
|
||||||
|
ingestionLoopOutput.setStatus("ERROR");
|
||||||
|
ingestionLoopOutput.setMessage(e.getMessage());
|
||||||
|
}
|
||||||
|
return ingestionLoopOutput;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void embedtexts(List<Document> docs) {
|
||||||
|
|
||||||
|
logger.info("Embedding texts");
|
||||||
|
|
||||||
|
docs.forEach(doc -> logger.info("text metadata: " + doc.getMetadata()));
|
||||||
|
try {
|
||||||
|
vectorStore.add(docs);
|
||||||
|
logger.info("Texts embedded");
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Error embedding Texts: ", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) {
|
private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) {
|
||||||
|
|||||||
Reference in New Issue
Block a user