text ingestion process added
This commit is contained in:
@@ -3,7 +3,14 @@ package com.olympus.apollo.controllers;
|
||||
import java.util.HashMap;
|
||||
import java.util.Date;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.List;
|
||||
|
||||
import com.olympus.apollo.dto.ExternalFileIngestionDTO;
|
||||
import com.olympus.apollo.dto.IngestionOutput;
|
||||
import com.olympus.apollo.models.KSGitInfo;
|
||||
import com.olympus.apollo.models.KSTexts;
|
||||
import com.olympus.apollo.repository.KSTextsRepository;
|
||||
import com.olympus.apollo.services.KSIngestor;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
@@ -26,6 +33,11 @@ public class KSFileController {
|
||||
@Autowired
|
||||
private KSIngestionInfoRepository ksIngestionInfoRepository;
|
||||
|
||||
@Autowired
|
||||
private KSTextsRepository ksTextsRepository;
|
||||
@Autowired
|
||||
private KSIngestor ksIngestor;
|
||||
|
||||
|
||||
@PostMapping("/upload")
|
||||
public String handleFileUpload(
|
||||
@@ -72,4 +84,53 @@ public class KSFileController {
|
||||
public ResponseEntity<?> handleStorageFileNotFound(StorageFileNotFoundException exc) {
|
||||
return ResponseEntity.notFound().build();
|
||||
}
|
||||
//ingestion of text process
|
||||
@PostMapping("/externalingestion")
|
||||
public String handleExternalIngestion(
|
||||
@RequestBody ExternalFileIngestionDTO externalFileIngestionDTO
|
||||
) {
|
||||
|
||||
KSTexts ksTexts = new KSTexts();
|
||||
ksTexts.setTextToEmbed(externalFileIngestionDTO.getTextToEmbed());
|
||||
ksTexts.setName(externalFileIngestionDTO.getName());
|
||||
ksTexts.setDescription(externalFileIngestionDTO.getDescription());
|
||||
ksTexts.setIngestionStatus("NEW");
|
||||
ksTexts.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(new Date()));
|
||||
|
||||
Date now = new Date();
|
||||
ksTexts.setIngestionDate(now);
|
||||
|
||||
|
||||
HashMap IngestionInfo = new HashMap<>();
|
||||
ksTexts.setType(externalFileIngestionDTO.getType());
|
||||
HashMap<String, String> metadata = new HashMap<>();
|
||||
metadata.put("KsApplicationName", externalFileIngestionDTO.getKsApplicationName());
|
||||
metadata.put("KsDoctype", externalFileIngestionDTO.getKsDocType());
|
||||
metadata.put("KsDocSource", externalFileIngestionDTO.getKsDocSource());
|
||||
metadata.put("KsFileSource", externalFileIngestionDTO.getName());
|
||||
IngestionInfo.put("type",externalFileIngestionDTO.getType());
|
||||
IngestionInfo.put("metadata",metadata);
|
||||
//ksTexts.setMetadata(metadata);
|
||||
ksTexts.setIngestionInfo(IngestionInfo);
|
||||
ksTexts.setDefaultChunkSize(externalFileIngestionDTO.getDefaultChunkSize());
|
||||
ksTexts.setMinChunkSize(externalFileIngestionDTO.getMinChunkSize());
|
||||
ksTexts.setMaxNumberOfChunks(externalFileIngestionDTO.getMaxNumberOfChunks());
|
||||
ksTexts.setMinChunkSizeToEmbed(externalFileIngestionDTO.getMinChunkSizeToEmbed());
|
||||
ksTexts.setAdditionalMetadata(externalFileIngestionDTO.getAdditionalMetaData());
|
||||
|
||||
ksTextsRepository.save(ksTexts);
|
||||
|
||||
return "OK";
|
||||
}
|
||||
|
||||
@GetMapping("/texts")
|
||||
public List<KSTexts> listTextsInfo() {
|
||||
List<KSTexts> result = (List<KSTexts>) ksTextsRepository.findAll();
|
||||
return result;
|
||||
}
|
||||
|
||||
@GetMapping("/ingest_texts/{id}")
|
||||
public IngestionOutput ingestDocumentById(@PathVariable String id) {
|
||||
return ksIngestor.ingestTextById(id);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
package com.olympus.apollo.dto;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
|
||||
@Getter @Setter
|
||||
public class ExternalFileIngestionDTO {
|
||||
private String textToEmbed;
|
||||
private String name;
|
||||
private String description;
|
||||
private String type;
|
||||
private String ksApplicationName;
|
||||
private String ksDocType;
|
||||
private String ksDocSource;
|
||||
private int defaultChunkSize;
|
||||
private int minChunkSize;
|
||||
private int maxNumberOfChunks;
|
||||
private int minChunkSizeToEmbed;
|
||||
private HashMap additionalMetaData;
|
||||
}
|
||||
34
src/main/java/com/olympus/apollo/models/KSTexts.java
Normal file
34
src/main/java/com/olympus/apollo/models/KSTexts.java
Normal file
@@ -0,0 +1,34 @@
|
||||
package com.olympus.apollo.models;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import org.springframework.data.annotation.Id;
|
||||
import org.springframework.data.mongodb.core.mapping.Document;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
|
||||
@Document(collection = "ksinternal")
|
||||
@Getter @Setter
|
||||
public class KSTexts {
|
||||
private @Id String id;
|
||||
private String textToEmbed;
|
||||
private String name;
|
||||
private String description;
|
||||
|
||||
private String ingestionStatus;
|
||||
//private String ingestionMessage;
|
||||
private Date ingestionDate;
|
||||
private String ingestionDateFormat;
|
||||
|
||||
//private KSIngestionInfo ingestionInfo;
|
||||
private String type;
|
||||
private HashMap IngestionInfo;
|
||||
private HashMap<String,String> metadata;
|
||||
|
||||
private int minChunkSizeToEmbed;
|
||||
private int maxNumberOfChunks;
|
||||
private int minChunkSize;
|
||||
private int defaultChunkSize;
|
||||
private HashMap additionalMetadata;
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
package com.olympus.apollo.repository;
|
||||
|
||||
|
||||
import com.olympus.apollo.models.KSDocument;
|
||||
import com.olympus.apollo.models.KSTexts;
|
||||
import org.springframework.data.mongodb.repository.MongoRepository;
|
||||
import org.springframework.data.rest.core.annotation.RepositoryRestResource;
|
||||
import org.springframework.web.bind.annotation.CrossOrigin;
|
||||
|
||||
@RepositoryRestResource(collectionResourceRel = "ksinternal", path = "ksinternal")
|
||||
@CrossOrigin
|
||||
public interface KSTextsRepository extends MongoRepository<KSTexts, String> {
|
||||
|
||||
public Iterable<KSDocument> findAllByIngestionStatus(String status);
|
||||
}
|
||||
@@ -5,6 +5,9 @@ import java.text.SimpleDateFormat;
|
||||
|
||||
import com.olympus.apollo.dto.IngestionOutput;
|
||||
import com.olympus.apollo.models.KSDocument;
|
||||
import com.olympus.apollo.models.KSTexts;
|
||||
import com.olympus.apollo.repository.KSTextsRepository;
|
||||
import org.codelibs.jhighlight.fastutil.Hash;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.ai.document.Document;
|
||||
@@ -27,6 +30,8 @@ public class KSIngestor {
|
||||
@Autowired
|
||||
private KSDocumentRepository ksDocumentRepository;
|
||||
@Autowired
|
||||
private KSTextsRepository ksTextsRepository;
|
||||
@Autowired
|
||||
private KSIngestionInfoRepository ksIngestionInfoRepository;
|
||||
@Autowired
|
||||
private FileSystemStorageService storageService;
|
||||
@@ -159,6 +164,86 @@ public class KSIngestor {
|
||||
return ingestionLoopOutput;
|
||||
}
|
||||
|
||||
public IngestionOutput ingestTextById(String id) {
|
||||
IngestionOutput ingestionOutput= new IngestionOutput();
|
||||
Optional<KSTexts> optionalDocument = ksTextsRepository.findById(id);
|
||||
if (optionalDocument.isPresent()) {
|
||||
KSTexts ksTexts = optionalDocument.get();
|
||||
if ("NEW".equals(ksTexts.getIngestionStatus())) {
|
||||
return ingestText(ksTexts);
|
||||
} else {
|
||||
ingestionOutput.setMessage("OOPS: TEXT is already Injected");
|
||||
return ingestionOutput;
|
||||
}
|
||||
} else {
|
||||
ingestionOutput.setMessage("OOPS: TEXT Not found");
|
||||
return ingestionOutput;
|
||||
}
|
||||
}
|
||||
|
||||
private IngestionOutput ingestText(KSTexts ksTexts) {
|
||||
IngestionOutput ingestionLoopOutput = new IngestionOutput();
|
||||
try {
|
||||
ksTexts.setIngestionStatus("IN PROGRESS");
|
||||
ksTextsRepository.save(ksTexts);
|
||||
|
||||
|
||||
//TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(new Document(ksTexts.getTextToEmbed()));
|
||||
Document myDoc = new Document(ksTexts.getTextToEmbed());
|
||||
|
||||
|
||||
List<Document> docs = Collections.singletonList(myDoc);;//tikaDocumentReader.read();
|
||||
logger.info("Ingested Text: " + ksTexts.getName());
|
||||
logger.info("Number of Text: " + docs.size());
|
||||
|
||||
TokenTextSplitter splitter = new TokenTextSplitter(ksTexts.getDefaultChunkSize(),
|
||||
ksTexts.getMinChunkSize(),
|
||||
ksTexts.getMinChunkSizeToEmbed(),
|
||||
ksTexts.getMaxNumberOfChunks(),
|
||||
true);
|
||||
|
||||
docs.forEach(doc -> {
|
||||
List<Document> splitDocs = splitter.split(doc);
|
||||
|
||||
logger.info("Number of documents: " + splitDocs.size());
|
||||
HashMap meta=(HashMap) ksTexts.getIngestionInfo().get("metadata");
|
||||
HashMap meta1= ksTexts.getAdditionalMetadata();
|
||||
HashMap meta2 = new HashMap();
|
||||
meta2.putAll(meta);
|
||||
meta2.putAll(meta1);
|
||||
for (Document splitDoc : splitDocs) {
|
||||
splitDoc.getMetadata().putAll(meta2);
|
||||
}
|
||||
embedtexts(splitDocs);
|
||||
});
|
||||
ksTexts.setIngestionStatus("INGESTED");
|
||||
ksTexts.setIngestionDate(new Date());
|
||||
ksTexts.setIngestionDateFormat(new SimpleDateFormat("MM/dd/yy").format(new Date()));
|
||||
ksTextsRepository.save(ksTexts);
|
||||
|
||||
ingestionLoopOutput.getIngestedDocumentId().add(ksTexts.getId());
|
||||
ingestionLoopOutput.setStatus("OK");
|
||||
ingestionLoopOutput.setMessage("OK");
|
||||
}catch (Exception e){
|
||||
ingestionLoopOutput.setStatus("ERROR");
|
||||
ingestionLoopOutput.setMessage(e.getMessage());
|
||||
}
|
||||
return ingestionLoopOutput;
|
||||
}
|
||||
|
||||
private void embedtexts(List<Document> docs) {
|
||||
|
||||
logger.info("Embedding texts");
|
||||
|
||||
docs.forEach(doc -> logger.info("text metadata: " + doc.getMetadata()));
|
||||
try {
|
||||
vectorStore.add(docs);
|
||||
logger.info("Texts embedded");
|
||||
} catch (Exception e) {
|
||||
logger.error("Error embedding Texts: ", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void embedDocuments(List<Document> docs, KSIngestionInfo ingestionInfo) {
|
||||
|
||||
Reference in New Issue
Block a user