From 1fdd4f52ecb7edd7e57440ac4b4e5d4b31fdd891 Mon Sep 17 00:00:00 2001 From: "andrea.terzani" Date: Wed, 31 Jul 2024 17:30:08 +0200 Subject: [PATCH] feat: Update getDocument method in KsDocumentController Refactor the getDocument method in KsDocumentController to return a single KSDocument object instead of a list. This improves the efficiency and readability of the code. The method now uses the findById method of ksDocumentRepository to retrieve the document with the specified id. --- pom.xml | 12 ++ .../FeApi/KsDocumentController.java | 4 +- .../apollo/controllers/TestController.java | 35 ++++- .../services/GitRepositoryIngestor.java | 121 ++++++++++++++++++ 4 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 src/main/java/com/olympus/apollo/services/GitRepositoryIngestor.java diff --git a/pom.xml b/pom.xml index 6717668..1c685a1 100644 --- a/pom.xml +++ b/pom.xml @@ -90,6 +90,14 @@ lombok 1.18.34 + + + + org.eclipse.jgit + org.eclipse.jgit + 6.8.0.202311291450-r + + @@ -133,6 +141,10 @@ false + + jgit-repository + https://repo.eclipse.org/content/groups/releases/ + diff --git a/src/main/java/com/olympus/apollo/controllers/FeApi/KsDocumentController.java b/src/main/java/com/olympus/apollo/controllers/FeApi/KsDocumentController.java index 3a46eba..ccb94c7 100644 --- a/src/main/java/com/olympus/apollo/controllers/FeApi/KsDocumentController.java +++ b/src/main/java/com/olympus/apollo/controllers/FeApi/KsDocumentController.java @@ -30,9 +30,9 @@ public class KsDocumentController { return result; } @GetMapping("/{id}") - public List getDocument(@RequestParam String id) { + public KSDocument getDocument(@RequestParam String id) { - List result = (List) ksDocumentREpository.findAll(); + KSDocument result = ksDocumentREpository.findById(id).get(); return result; } diff --git a/src/main/java/com/olympus/apollo/controllers/TestController.java b/src/main/java/com/olympus/apollo/controllers/TestController.java index 1c2aed2..f5159f1 100644 --- a/src/main/java/com/olympus/apollo/controllers/TestController.java +++ b/src/main/java/com/olympus/apollo/controllers/TestController.java @@ -1,10 +1,13 @@ package com.olympus.apollo.controllers; +import java.util.HashMap; import java.util.List; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.web.bind.annotation.RestController; +import com.olympus.apollo.models.KSIngestionInfo; +import com.olympus.apollo.services.GitRepositoryIngestor; import com.olympus.apollo.services.KSIngestor; import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.RequestParam; @@ -16,7 +19,9 @@ public class TestController { @Autowired KSIngestor ksIngestor; - + @Autowired + GitRepositoryIngestor gitRepositoryIngestor; + @GetMapping("test/ingestion_loop") public String testIngestionLoop() { ksIngestor.ingestLoop(); @@ -34,5 +39,33 @@ public class TestController { return "Deleted"; } + @GetMapping("test/ingest_repo") + public String ingestRepo() { + try { + + KSIngestionInfo ksIngestionInfo = new KSIngestionInfo(); + + + HashMap metadata = new HashMap<>(); + + metadata.put("KsApplicatioName","doo"); + metadata.put("KsDoctype","sourcecode"); + metadata.put("KsDoSource","GIT"); + ksIngestionInfo.setMetadata(metadata); + ksIngestionInfo.setDefaultChunkSize(6000); + ksIngestionInfo.setMinChunkSize(200); + ksIngestionInfo.setMaxNumberOfChunks(10000); + ksIngestionInfo.setMinChunkSizeToEmbed(100); + + + String repoPath = "C:\\Users\\andrea.terzani\\dev\\DOO2_CLOUD"; + gitRepositoryIngestor.ingestGitRepository(repoPath, ksIngestionInfo); + + + return "Ingested"; + } catch (Exception e) { + return "Error"; + } + } } diff --git a/src/main/java/com/olympus/apollo/services/GitRepositoryIngestor.java b/src/main/java/com/olympus/apollo/services/GitRepositoryIngestor.java new file mode 100644 index 0000000..36920f8 --- /dev/null +++ b/src/main/java/com/olympus/apollo/services/GitRepositoryIngestor.java @@ -0,0 +1,121 @@ +package com.olympus.apollo.services; + +import org.eclipse.jgit.api.Git; +import org.eclipse.jgit.lib.Repository; +import org.eclipse.jgit.revwalk.RevCommit; +import org.eclipse.jgit.treewalk.TreeWalk; +import org.springframework.ai.document.Document; +import org.springframework.ai.transformer.splitter.TokenTextSplitter; +import org.springframework.ai.vectorstore.VectorStore; +import org.springframework.stereotype.Service; + +import com.olympus.apollo.models.KSIngestionInfo; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +@Service +public class GitRepositoryIngestor { + + private final VectorStore vectorStore; + + public GitRepositoryIngestor( VectorStore vectorStore) { + this.vectorStore = vectorStore; + } + + public void ingestGitRepository(String repoPath,KSIngestionInfo ingestionInfo) throws Exception { + try (Git git = Git.open(new File(repoPath))) { + Repository repository = git.getRepository(); + RevCommit latestCommit = git.log().setMaxCount(1).call().iterator().next(); + + try (TreeWalk treeWalk = new TreeWalk(repository)) { + treeWalk.addTree(latestCommit.getTree()); + treeWalk.setRecursive(true); + + List documents = new ArrayList<>(); + + while (treeWalk.next()) { + String filePath = treeWalk.getPathString(); + String fileName = treeWalk.getNameString(); + + if (isRelevantFile(fileName)) { + byte[] fileContent = repository.open(treeWalk.getObjectId(0)).getBytes(); + String fileContentStr = new String(fileContent, StandardCharsets.UTF_8); + + Map metadata = extractMetadata(fileName, fileContentStr); + metadata.put("filePath", filePath); + metadata.put("fileName", fileName); + + Document doc = new Document(fileContentStr); + doc.getMetadata().putAll(metadata); + + doc.getMetadata().putAll(ingestionInfo.getMetadata()); + documents.add(doc); + } + } + + + TokenTextSplitter splitter = new TokenTextSplitter(ingestionInfo.getDefaultChunkSize(), + ingestionInfo.getMinChunkSize(), + ingestionInfo.getMinChunkSizeToEmbed(), + ingestionInfo.getMaxNumberOfChunks(), + false); + + List splitDocuments = splitter.split(documents); + + vectorStore.add(splitDocuments); + } + } + } + + private boolean isRelevantFile(String fileName) { + // Add more relevant file extensions as needed + return fileName.endsWith(".java"); + } + + private Map extractMetadata(String fileName, String fileContent) { + Map metadata = new HashMap<>(); + + if (fileName.endsWith(".java")) { + metadata.putAll(extractJavaMetadata(fileContent)); + } else if (fileName.endsWith(".py")) { + metadata.putAll(extractPythonMetadata(fileContent)); + } else if (fileName.endsWith(".js")) { + metadata.putAll(extractJavaScriptMetadata(fileContent)); + } + + return metadata; + } + + private Map extractJavaMetadata(String fileContent) { + Map metadata = new HashMap<>(); + // Simple regex to find class names (this is a basic implementation and might miss some cases) + Pattern classPattern = Pattern.compile("class\\s+(\\w+)"); + Matcher classMatcher = classPattern.matcher(fileContent); + List classNames = new ArrayList<>(); + while (classMatcher.find()) { + classNames.add(classMatcher.group(1)); + } + metadata.put("classNames", String.join(",", classNames)); + return metadata; + } + + private Map extractPythonMetadata(String fileContent) { + // Implement Python-specific metadata extraction + // This is a placeholder and should be implemented based on your needs + return new HashMap<>(); + } + + private Map extractJavaScriptMetadata(String fileContent) { + // Implement JavaScript-specific metadata extraction + // This is a placeholder and should be implemented based on your needs + return new HashMap<>(); + } +}