Update file system directory

This commit is contained in:
2025-05-28 14:41:31 +02:00
parent 1a1ead8fce
commit 485c781e4c
3 changed files with 23 additions and 15 deletions

View File

@@ -28,8 +28,6 @@ import com.olympus.model.apollo.KSIngestionInfo;
@Service @Service
public class FileService { public class FileService {
private static final String UPLOAD_DIR = "C:\\mnt\\hermione_storage\\documents\\file_input_scenarios\\";
private Logger logger = LoggerFactory.getLogger(FileService.class); private Logger logger = LoggerFactory.getLogger(FileService.class);
@Value("${file.upload-dir}") @Value("${file.upload-dir}")

View File

@@ -22,43 +22,48 @@ public class EmbeddingDocTempSolver extends StepSolver {
Logger logger = (Logger) LoggerFactory.getLogger(BasicQueryRagSolver.class); Logger logger = (Logger) LoggerFactory.getLogger(EmbeddingDocTempSolver.class);
private void loadParameters(){ private void loadParameters(){
logger.info("Loading parameters"); logger.info("Loading parameters");
this.scenarioExecution.getExecSharedMap().put("scenario_execution_id", this.scenarioExecution.getId()); this.scenarioExecution.getExecSharedMap().put("scenario_execution_id", this.scenarioExecution.getId());
logger.info("Scenario Execution ID: "+this.scenarioExecution.getId()); logger.info("Scenario Execution ID: {}", this.scenarioExecution.getId());
AttributeParser attributeParser = new AttributeParser(this.scenarioExecution); AttributeParser attributeParser = new AttributeParser(this.scenarioExecution);
this.scenario_execution_id = attributeParser.parse((String) this.scenarioExecution.getId()); this.scenario_execution_id = attributeParser.parse((String) this.scenarioExecution.getId());
this.path_file = attributeParser.parse((String) this.step.getAttributes().get("path_file")); this.path_file = attributeParser.parse((String) this.step.getAttributes().get("path_file"));
logger.info("Parsed path_file: {}", this.path_file);
if(this.step.getAttributes().containsKey("default_chunk_size")){ if(this.step.getAttributes().containsKey("default_chunk_size")){
this.default_chunk_size = (int) this.step.getAttributes().get("default_chunk_size"); this.default_chunk_size = (int) this.step.getAttributes().get("default_chunk_size");
logger.info("Parsed default_chunk_size from attributes: {}", this.default_chunk_size);
}else{ }else{
this.default_chunk_size = 8000; this.default_chunk_size = 8000;
logger.info("default_chunk_size not found in attributes, using default: 8000");
} }
if(this.step.getAttributes().containsKey("min_chunk_size")){ if(this.step.getAttributes().containsKey("min_chunk_size")){
this.min_chunk_size = (int) this.step.getAttributes().get("min_chunk_size"); this.min_chunk_size = (int) this.step.getAttributes().get("min_chunk_size");
}else{ }else{
this.min_chunk_size = 50; this.min_chunk_size = 50;
logger.info("min_chunk_size not found in attributes, using default: 50");
} }
if(this.step.getAttributes().containsKey("min_chunk_length_to_embed")){ if(this.step.getAttributes().containsKey("min_chunk_length_to_embed")){
this.min_chunk_length_to_embed = (int) this.step.getAttributes().get("min_chunk_length_to_embed"); this.min_chunk_length_to_embed = (int) this.step.getAttributes().get("min_chunk_length_to_embed");
}else{ }else{
this.min_chunk_length_to_embed = 50; this.min_chunk_length_to_embed = 50;
logger.info("min_chunk_length_to_embed not found in attributes, using default: 50");
} }
if(this.step.getAttributes().containsKey("max_num_chunks")){ if(this.step.getAttributes().containsKey("max_num_chunks")){
this.max_num_chunks = (int) this.step.getAttributes().get("max_num_chunks"); this.max_num_chunks = (int) this.step.getAttributes().get("max_num_chunks");
}else{ }else{
this.max_num_chunks = 1000; this.max_num_chunks = 1000;
logger.info("max_num_chunks not found in attributes, using default: 1000");
} }
} }
@Override @Override
@@ -67,45 +72,54 @@ public class EmbeddingDocTempSolver extends StepSolver {
try{ try{
logger.info("Solving step: " + this.step.getName()); logger.info("Solving step: " + this.step.getName());
this.scenarioExecution.setCurrentStepId(this.step.getStepId()); this.scenarioExecution.setCurrentStepId(this.step.getStepId());
logger.info("Loading parameters for step: {}", this.step.getName());
loadParameters(); loadParameters();
logger.info("Embedding documents"); logger.info("Embedding documents");
File file = new File(this.path_file); File file = new File(this.path_file);
logger.info("Reading file from path: {}", this.path_file);
Tika tika = new Tika(); Tika tika = new Tika();
tika.setMaxStringLength(-1); tika.setMaxStringLength(-1);
String text = tika.parseToString(file); String text = tika.parseToString(file);
logger.info("File read successfully. Length: {} characters", text.length());
Document myDoc = new Document(text); Document myDoc = new Document(text);
List<Document> docs = Collections.singletonList(myDoc); List<Document> docs = Collections.singletonList(myDoc);
logger.info("Initializing TokenTextSplitter with default_chunk_size={}, min_chunk_size={}, min_chunk_length_to_embed={}, max_num_chunks={}",
this.default_chunk_size, this.min_chunk_size, this.min_chunk_length_to_embed, this.max_num_chunks);
TokenTextSplitter splitter = new TokenTextSplitter(this.default_chunk_size, TokenTextSplitter splitter = new TokenTextSplitter(this.default_chunk_size,
this.min_chunk_size, this.min_chunk_size,
this.min_chunk_length_to_embed, this.min_chunk_length_to_embed,
this.max_num_chunks, this.max_num_chunks,
true); true);
logger.info("Splitting and embedding documents");
docs.forEach(doc -> { docs.forEach(doc -> {
List<Document> splitDocs = splitter.split(doc); List<Document> splitDocs = splitter.split(doc);
Integer docIndex = 0; Integer docIndex = 0;
logger.info("Number of documents: " + splitDocs.size()); logger.info("Number of split documents: {}", splitDocs.size());
for (Document splitDoc : splitDocs) { for (Document splitDoc : splitDocs) {
splitDoc.getMetadata().put("KsDocumentId", this.scenario_execution_id); splitDoc.getMetadata().put("KsDocumentId", this.scenario_execution_id);
splitDoc.getMetadata().put("KsDocumentIndex",docIndex.toString()); splitDoc.getMetadata().put("KsDocumentIndex",docIndex.toString());
logger.info("DOC INDEX: ", docIndex); splitDoc.getMetadata().put("KsDoctype", "temp");
logger.info("Adding split document with index {} to vector store", docIndex);
docIndex++; docIndex++;
} }
logger.info("Adding {} split documents to vector store", splitDocs.size());
vectorStore.add(splitDocs); vectorStore.add(splitDocs);
}); });
logger.info("All documents embedded and added to vector store successfully");
}catch (Exception e){ }catch (Exception e){
logger.error("Error while solvingStep: "+e.getMessage()); logger.error("Error while solvingStep: "+e.getMessage(), e);
e.printStackTrace(); e.printStackTrace();
} }
logger.info("Setting next step id: {}", this.step.getNextStepId());
this.scenarioExecution.setNextStepId(this.step.getNextStepId()); this.scenarioExecution.setNextStepId(this.step.getNextStepId());
logger.info("Returning scenario execution for step: {}", this.step.getName());
return this.scenarioExecution; return this.scenarioExecution;
} }

View File

@@ -56,10 +56,6 @@ eureka.instance.preferIpAddress: true
hermione.fe.url = http://localhost:5173 hermione.fe.url = http://localhost:5173
java-parser-module.url: http://java-parser-module-service.olympus.svc.cluster.local:8080
java-re-module.url: http://java-re-module-service.olympus.svc.cluster.local:8080
jsp-parser-module.url: http://jsp-parser-module-service.olympus.svc.cluster.local:8080
spring.ai.vectorstore.chroma.client.host=http://108.142.74.161 spring.ai.vectorstore.chroma.client.host=http://108.142.74.161
spring.ai.vectorstore.chroma.client.port=8000 spring.ai.vectorstore.chroma.client.port=8000
spring.ai.vectorstore.chroma.client.key-token=tKAJfN1Yv5lP7pKorJHGfHMQhNEcM9uu spring.ai.vectorstore.chroma.client.key-token=tKAJfN1Yv5lP7pKorJHGfHMQhNEcM9uu
@@ -70,7 +66,7 @@ spring.servlet.multipart.max-file-size=10MB
spring.servlet.multipart.max-request-size=10MB spring.servlet.multipart.max-request-size=10MB
file.upload-dir=/mnt/hermione_storage/documents/file_input_scenarios/ file.upload-dir=/mnt/hermione_storage/documents/file_input_scenarios/
# file.upload-dir=C:\\mnt\\hermione_storage\\documents\\file_input_scenarios\\
generic-file-parser-module.url=http://generic-file-parser-module-service.olympus.svc.cluster.local:8080 generic-file-parser-module.url=http://generic-file-parser-module-service.olympus.svc.cluster.local:8080