Commit 283e483 · 1 Parent(s): f1eedd1
github-actions[bot] committed
Auto-sync from demo at Fri Nov 7 10:46:57 UTC 2025

Files changed:
- app.py +5 -0
- graphgen/bases/__init__.py +1 -0
- graphgen/bases/base_extractor.py +22 -0
- graphgen/bases/base_storage.py +3 -0
- graphgen/configs/aggregated_config.yaml +5 -2
- graphgen/configs/atomic_config.yaml +3 -0
- graphgen/configs/cot_config.yaml +5 -2
- graphgen/configs/multi_hop_config.yaml +3 -0
- graphgen/configs/schema_guided_config.yaml +15 -0
- graphgen/configs/vqa_config.yaml +5 -2
- graphgen/graphgen.py +46 -6
- graphgen/models/extractor/__init__.py +1 -0
- graphgen/models/extractor/key_information_extractor.py +1 -0
- graphgen/models/extractor/schema_guided_extractor.py +101 -0
- graphgen/models/reader/txt_reader.py +1 -5
- graphgen/models/storage/json_storage.py +3 -0
- graphgen/operators/__init__.py +1 -0
- graphgen/operators/extract/__init__.py +1 -0
- graphgen/operators/extract/extract_info.py +47 -0
- graphgen/operators/split/split_chunks.py +7 -5
- graphgen/templates/__init__.py +1 -0
- graphgen/templates/extraction/__init__.py +1 -0
- graphgen/templates/extraction/schema_guided_extraction.py +70 -0
- graphgen/templates/extraction/schemas/legal_contract.json +58 -0
- graphgen/utils/__init__.py +6 -1
- graphgen/utils/hash.py +5 -0
- webui/app.py +5 -0
app.py
CHANGED

@@ -103,6 +103,11 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
             "name": "read",
             "params": {
                 "input_file": params.upload_file,
+            },
+        },
+        {
+            "name": "chunk",
+            "params": {
                 "chunk_size": params.chunk_size,
                 "chunk_overlap": params.chunk_overlap,
             },
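Net effect: the webui no longer folds the chunking parameters into the read stage; it now emits a separate chunk stage. A minimal sketch of the pipeline list this code builds (literal values are placeholders, not from this commit):

    # Shape of the config after this change (placeholder values).
    pipeline = [
        {"name": "read", "params": {"input_file": "upload.jsonl"}},
        {"name": "chunk", "params": {"chunk_size": 1024, "chunk_overlap": 100}},
    ]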
graphgen/bases/__init__.py
CHANGED

@@ -1,3 +1,4 @@
+from .base_extractor import BaseExtractor
 from .base_generator import BaseGenerator
 from .base_kg_builder import BaseKGBuilder
 from .base_llm_wrapper import BaseLLMWrapper
graphgen/bases/base_extractor.py
ADDED

@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
+
+
+class BaseExtractor(ABC):
+    """
+    Extract information from given text.
+
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper):
+        self.llm_client = llm_client
+
+    @abstractmethod
+    async def extract(self, chunk: dict) -> Any:
+        """Extract information from the given text"""
+
+    @abstractmethod
+    def build_prompt(self, text: str) -> str:
+        """Build prompt for LLM based on the given text"""
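A minimal sketch of a concrete subclass, assuming only the interface above plus an LLM wrapper exposing the async `generate_answer` call used elsewhere in this commit:

    # Hypothetical subclass for illustration; not part of this commit.
    class KeywordExtractor(BaseExtractor):
        def build_prompt(self, text: str) -> str:
            return f"List the key terms in the following text, comma-separated:\n{text}"

        async def extract(self, chunk: dict) -> list:
            # The chunk carries its text under the "text" key, matching
            # SchemaGuidedExtractor below.
            response = await self.llm_client.generate_answer(
                self.build_prompt(chunk.get("text", ""))
            )
            return [term.strip() for term in response.split(",") if term.strip()]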
graphgen/bases/base_storage.py
CHANGED

@@ -45,6 +45,9 @@ class BaseKVStorage(Generic[T], StorageNameSpace):
     ) -> list[Union[T, None]]:
         raise NotImplementedError
 
+    async def get_all(self) -> dict[str, T]:
+        raise NotImplementedError
+
     async def filter_keys(self, data: list[str]) -> set[str]:
         """return un-exist keys"""
         raise NotImplementedError
graphgen/configs/aggregated_config.yaml
CHANGED

@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      …
-      …
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg
graphgen/configs/atomic_config.yaml
CHANGED

@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
       chunk_size: 1024 # chunk size for text splitting
       chunk_overlap: 100 # chunk overlap for text splitting
 
graphgen/configs/cot_config.yaml
CHANGED

@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      …
-      …
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg
graphgen/configs/multi_hop_config.yaml
CHANGED

@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
       chunk_size: 1024 # chunk size for text splitting
       chunk_overlap: 100 # chunk overlap for text splitting
 
graphgen/configs/schema_guided_config.yaml
ADDED

@@ -0,0 +1,15 @@
+pipeline:
+  - name: read
+    params:
+      input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
+      chunk_size: 20480
+      chunk_overlap: 2000
+      separators: []
+
+  - name: extract
+    params:
+      method: schema_guided # extraction method, support: schema_guided
+      schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
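The extract stage's params map directly onto the extractor construction in `extract_info` below; a sketch of that wiring, assuming the schema file ships with the repo:

    import json

    extract_config = {
        "method": "schema_guided",
        "schema_file": "graphgen/templates/extraction/schemas/legal_contract.json",
    }
    with open(extract_config["schema_file"], "r", encoding="utf-8") as f:
        schema = json.load(f)
    # extractor = SchemaGuidedExtractor(llm_client, schema)  # llm_client is supplied by the pipeline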
graphgen/configs/vqa_config.yaml
CHANGED

@@ -2,8 +2,11 @@ pipeline:
  - name: read
    params:
      input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      …
-      …
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg
graphgen/graphgen.py
CHANGED

@@ -18,6 +18,7 @@ from graphgen.models import (
 from graphgen.operators import (
     build_kg,
     chunk_documents,
+    extract_info,
     generate_qas,
     init_llm,
     judge_statement,

@@ -70,6 +71,7 @@ class GraphGen:
         self.search_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="search"
         )
+
         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="rephrase"
         )

@@ -80,6 +82,10 @@ class GraphGen:
             os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
             namespace="qa",
         )
+        self.extract_storage: JsonKVStorage = JsonKVStorage(
+            os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
+            namespace="extraction",
+        )
 
         # webui
         self.progress_bar: gr.Progress = progress_bar

@@ -103,16 +109,30 @@ class GraphGen:
         _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
 
+        if len(new_docs) == 0:
+            logger.warning("All documents are already in the storage")
+            return
+
+        await self.full_docs_storage.upsert(new_docs)
+        await self.full_docs_storage.index_done_callback()
+
+    @op("chunk", deps=["read"])
+    @async_to_sync_method
+    async def chunk(self, chunk_config: Dict):
+        """
+        chunk documents into smaller pieces from full_docs_storage if not already present
+        """
+
+        new_docs = await self.meta_storage.get_new_data(self.full_docs_storage)
         if len(new_docs) == 0:
             logger.warning("All documents are already in the storage")
             return
 
         inserting_chunks = await chunk_documents(
             new_docs,
-            read_config["chunk_size"],
-            read_config["chunk_overlap"],
             self.tokenizer_instance,
             self.progress_bar,
+            **chunk_config,
         )
 
         _add_chunk_keys = await self.chunks_storage.filter_keys(

@@ -126,12 +146,12 @@ class GraphGen:
             logger.warning("All chunks are already in the storage")
             return
 
-        await self.full_docs_storage.upsert(new_docs)
-        await self.full_docs_storage.index_done_callback()
         await self.chunks_storage.upsert(inserting_chunks)
         await self.chunks_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.full_docs_storage)
+        await self.meta_storage.index_done_callback()
 
-    @op("build_kg", deps=["…"])
+    @op("build_kg", deps=["chunk"])
     @async_to_sync_method
     async def build_kg(self):
         """

@@ -161,7 +181,7 @@ class GraphGen:
 
         return _add_entities_and_relations
 
-    @op("search", deps=["…"])
+    @op("search", deps=["chunk"])
     @async_to_sync_method
     async def search(self, search_config: Dict):
         logger.info(

@@ -248,6 +268,26 @@ class GraphGen:
         await self.partition_storage.upsert(batches)
         return batches
 
+    @op("extract", deps=["chunk"])
+    @async_to_sync_method
+    async def extract(self, extract_config: Dict):
+        logger.info("Extracting information from given chunks...")
+
+        results = await extract_info(
+            self.synthesizer_llm_client,
+            self.chunks_storage,
+            extract_config,
+            progress_bar=self.progress_bar,
+        )
+        if not results:
+            logger.warning("No information extracted")
+            return
+
+        await self.extract_storage.upsert(results)
+        await self.extract_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.chunks_storage)
+        await self.meta_storage.index_done_callback()
+
     @op("generate", deps=["partition"])
     @async_to_sync_method
     async def generate(self, generate_config: Dict):
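Taken together, the `@op(name, deps=[...])` decorators now describe the pipeline graph read → chunk → {build_kg, search, extract}, with generate still depending on partition. The `op` decorator itself is defined outside this diff; a hedged sketch of the registry pattern its usage suggests:

    # Hypothetical sketch of an op registry; the real @op lives elsewhere in GraphGen.
    OPS: dict = {}

    def op(name: str, deps=None):
        def wrapper(fn):
            OPS[name] = {"fn": fn, "deps": list(deps or [])}
            return fn
        return wrapper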
graphgen/models/extractor/__init__.py
ADDED

@@ -0,0 +1 @@
+from .schema_guided_extractor import SchemaGuidedExtractor
graphgen/models/extractor/key_information_extractor.py
ADDED

@@ -0,0 +1 @@
+# TODO: text2json
graphgen/models/extractor/schema_guided_extractor.py
ADDED

@@ -0,0 +1,101 @@
+import json
+from typing import Dict, List
+
+from graphgen.bases import BaseExtractor, BaseLLMWrapper
+from graphgen.templates import SCHEMA_GUIDED_EXTRACTION_PROMPT
+from graphgen.utils import compute_dict_hash, detect_main_language, logger
+
+
+class SchemaGuidedExtractor(BaseExtractor):
+    """
+    Use a JSON/YAML schema or Pydantic model to guide the LLM to extract structured information from text.
+
+    Usage example:
+        schema = {
+            "type": "legal contract",
+            "description": "A legal contract for leasing property.",
+            "properties": {
+                "end_date": {"type": "string", "description": "The end date of the lease."},
+                "leased_space": {"type": "string", "description": "Description of the space that is being leased."},
+                "lessee": {"type": "string", "description": "The lessee's name (and possibly address)."},
+                "lessor": {"type": "string", "description": "The lessor's name (and possibly address)."},
+                "signing_date": {"type": "string", "description": "The date the contract was signed."},
+                "start_date": {"type": "string", "description": "The start date of the lease."},
+                "term_of_payment": {"type": "string", "description": "Description of the payment terms."},
+                "designated_use": {"type": "string",
+                                   "description": "Description of the designated use of the property being leased."},
+                "extension_period": {"type": "string",
+                                     "description": "Description of the extension options for the lease."},
+                "expiration_date_of_lease": {"type": "string", "description": "The expiration date of the lease."}
+            },
+            "required": ["lessee", "lessor", "start_date", "end_date"]
+        }
+        extractor = SchemaGuidedExtractor(llm_client, schema)
+        result = extractor.extract(text)
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper, schema: dict):
+        super().__init__(llm_client)
+        self.schema = schema
+        self.required_keys = self.schema.get("required")
+        if not self.required_keys:
+            # If no required keys are specified, use all keys from the schema as default
+            self.required_keys = list(self.schema.get("properties", {}).keys())
+
+    def build_prompt(self, text: str) -> str:
+        schema_explanation = ""
+        for field, details in self.schema.get("properties", {}).items():
+            description = details.get("description", "No description provided.")
+            schema_explanation += f'- "{field}": {description}\n'
+
+        lang = detect_main_language(text)
+
+        prompt = SCHEMA_GUIDED_EXTRACTION_PROMPT[lang].format(
+            field=self.schema.get("name", "the document"),
+            schema_explanation=schema_explanation,
+            examples="",
+            text=text,
+        )
+        return prompt
+
+    async def extract(self, chunk: dict) -> dict:
+        text = chunk.get("text", "")
+        prompt = self.build_prompt(text)
+        response = await self.llm_client.generate_answer(prompt)
+        try:
+            extracted_info = json.loads(response)
+            # Ensure all required keys are present
+            for key in self.required_keys:
+                if key not in extracted_info:
+                    extracted_info[key] = ""
+            if any(extracted_info[key] == "" for key in self.required_keys):
+                logger.debug("Missing required keys in extraction: %s", extracted_info)
+                return {}
+            main_keys_info = {key: extracted_info[key] for key in self.required_keys}
+            logger.debug("Extracted info: %s", extracted_info)
+            return {compute_dict_hash(main_keys_info, prefix="extract"): extracted_info}
+        except json.JSONDecodeError:
+            logger.error("Failed to parse extraction response: %s", response)
+            return {}
+
+    async def merge_extractions(
+        self, extraction_list: List[Dict[str, dict]]
+    ) -> Dict[str, dict]:
+        """
+        Merge multiple extraction results based on their hashes.
+        :param extraction_list: List of extraction results, each is a dict with hash as key and record as value.
+        :return: Merged extraction results.
+        """
+        merged: Dict[str, dict] = {}
+        for ext in extraction_list:
+            for h, rec in ext.items():
+                if h not in merged:
+                    merged[h] = rec.copy()
+                else:
+                    for k, v in rec.items():
+                        if k not in merged[h] or merged[h][k] == v:
+                            merged[h][k] = v
+                        else:
+                            merged[h][k] = f"{merged[h][k]}<SEP>{v}"
+        return merged
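A small illustration of the merge rule: records sharing a hash are combined field by field, identical values are kept once, and conflicting values are joined with `<SEP>` (inputs hypothetical):

    a = {"extract-abc": {"lessee": "Acme Corp", "start_date": "2024-01-01"}}
    b = {"extract-abc": {"lessee": "Acme Corporation", "start_date": "2024-01-01"}}
    # await extractor.merge_extractions([a, b]) ->
    # {"extract-abc": {"lessee": "Acme Corp<SEP>Acme Corporation",
    #                  "start_date": "2024-01-01"}}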
graphgen/models/reader/txt_reader.py
CHANGED

@@ -5,10 +5,6 @@ from graphgen.bases.base_reader import BaseReader
 
 class TXTReader(BaseReader):
     def read(self, file_path: str) -> List[Dict[str, Any]]:
-        docs = []
         with open(file_path, "r", encoding="utf-8") as f:
-            for line in f:
-                line = line.strip()
-                if line:
-                    docs.append({self.text_column: line})
+            docs = [{"type": "text", self.text_column: f.read()}]
         return self.filter(docs)
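The reader no longer splits on newlines; splitting is now the chunk stage's job, so a text file comes back as a single typed document (the `text_column` default is defined in BaseReader, not shown in this diff):

    # reader = TXTReader()
    # reader.read("demo.txt")
    # -> [{"type": "text", <text_column>: "<entire file contents>"}]  # before filtering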
graphgen/models/storage/json_storage.py
CHANGED

@@ -39,6 +39,9 @@ class JsonKVStorage(BaseKVStorage):
             for id in ids
         ]
 
+    async def get_all(self) -> dict[str, str]:
+        return self._data
+
     async def filter_keys(self, data: list[str]) -> set[str]:
         return {s for s in data if s not in self._data}
 
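`get_all` simply exposes the in-memory dict; alongside the existing `filter_keys`, the contract looks like this (sketch, paths and values hypothetical):

    # storage = JsonKVStorage(working_dir, namespace="chunks")
    # await storage.upsert({"k1": {"text": "..."}})
    # await storage.get_all()                   # -> {"k1": {"text": "..."}}
    # await storage.filter_keys(["k1", "k2"])   # -> {"k2"}  (keys not yet stored)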
graphgen/operators/__init__.py
CHANGED

@@ -1,4 +1,5 @@
 from .build_kg import build_kg
+from .extract import extract_info
 from .generate import generate_qas
 from .init import init_llm
 from .judge import judge_statement
graphgen/operators/extract/__init__.py
ADDED

@@ -0,0 +1 @@
+from .extract_info import extract_info
graphgen/operators/extract/extract_info.py
ADDED

@@ -0,0 +1,47 @@
+import json
+
+import gradio as gr
+
+from graphgen.bases import BaseKVStorage, BaseLLMWrapper
+from graphgen.models.extractor import SchemaGuidedExtractor
+from graphgen.utils import logger, run_concurrent
+
+
+async def extract_info(
+    llm_client: BaseLLMWrapper,
+    chunk_storage: BaseKVStorage,
+    extract_config: dict,
+    progress_bar: gr.Progress = None,
+):
+    """
+    Extract information from chunks
+    :param llm_client: LLM client
+    :param chunk_storage: storage for chunks
+    :param extract_config: extraction config
+    :param progress_bar: optional progress bar
+    :return: extracted information
+    """
+
+    method = extract_config.get("method")
+    if method == "schema_guided":
+        schema_file = extract_config.get("schema_file")
+        with open(schema_file, "r", encoding="utf-8") as f:
+            schema = json.load(f)
+        extractor = SchemaGuidedExtractor(llm_client, schema)
+    else:
+        raise ValueError(f"Unsupported extraction method: {method}")
+
+    chunks = await chunk_storage.get_all()
+    chunks = [{k: v} for k, v in chunks.items()]
+    logger.info("Start extracting information from %d chunks", len(chunks))
+
+    results = await run_concurrent(
+        extractor.extract,
+        chunks,
+        desc="Extracting information",
+        unit="chunk",
+        progress_bar=progress_bar,
+    )
+
+    results = await extractor.merge_extractions(results)
+    return results
graphgen/operators/split/split_chunks.py
CHANGED

@@ -31,16 +31,18 @@ def split_chunks(text: str, language: str = "en", **kwargs) -> list:
             f"Unsupported language: {language}. "
             f"Supported languages are: {list(_MAPPING.keys())}"
         )
-    splitter = _get_splitter(language, …)
+    frozen_kwargs = frozenset(
+        (k, tuple(v) if isinstance(v, list) else v) for k, v in kwargs.items()
+    )
+    splitter = _get_splitter(language, frozen_kwargs)
     return splitter.split_text(text)
 
 
 async def chunk_documents(
     new_docs: dict,
-    chunk_size: int = 1024,
-    chunk_overlap: int = 100,
     tokenizer_instance: Tokenizer = None,
     progress_bar=None,
+    **kwargs,
 ) -> dict:
     inserting_chunks = {}
     cur_index = 1

@@ -51,11 +53,11 @@ async def chunk_documents(
         doc_type = doc.get("type")
         if doc_type == "text":
             doc_language = detect_main_language(doc["content"])
+
             text_chunks = split_chunks(
                 doc["content"],
                 language=doc_language,
-                chunk_size=chunk_size,
-                chunk_overlap=chunk_overlap,
+                **kwargs,
             )
 
         chunks = {
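Freezing `kwargs` into a `frozenset` (with list values coerced to tuples) makes the arguments hashable, which points at `_get_splitter` caching splitter instances per (language, options) pair. A sketch of that pattern, under the assumption that the real `_get_splitter` (not shown in this diff) is memoized:

    from functools import lru_cache

    # Hypothetical memoized factory; shows why its arguments must be hashable.
    @lru_cache(maxsize=None)
    def _get_splitter(language: str, frozen_kwargs: frozenset):
        kwargs = dict(frozen_kwargs)  # lists were already coerced to tuples by the caller
        return {"language": language, **kwargs}  # stand-in object for a real splitter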
graphgen/templates/__init__.py
CHANGED

@@ -1,5 +1,6 @@
 from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT
 from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT
+from .extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT
 from .generation import (
     AGGREGATED_GENERATION_PROMPT,
     ATOMIC_GENERATION_PROMPT,
graphgen/templates/extraction/__init__.py
ADDED

@@ -0,0 +1 @@
+from .schema_guided_extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT
graphgen/templates/extraction/schema_guided_extraction.py
ADDED

@@ -0,0 +1,70 @@
+TEMPLATE_EN = """You are an expert at extracting information from text based on a given schema.
+Extract relevant information about {field} from a given contract document according to the provided schema.
+
+Instructions:
+1. Carefully read the entire document provided at the end of this prompt.
+2. Extract the relevant information.
+3. Present your findings in JSON format as specified below.
+
+Important Notes:
+- Extract only relevant information.
+- Consider the context of the entire document when determining relevance.
+- Do not be verbose, only respond with the correct format and information.
+- Some docs may have multiple relevant excerpts -- include all that apply.
+- Some questions may have no relevant excerpts -- just return "".
+- Do not include additional JSON keys beyond the ones listed here.
+- Do not include the same key multiple times in the JSON.
+- Use English for your response.
+
+Expected JSON keys and explanation of what they are:
+{schema_explanation}
+
+Expected format:
+{{
+    "key1": "value1",
+    "key2": "value2",
+    ...
+}}
+
+{examples}
+
+Document to extract from:
+{text}
+"""
+
+TEMPLATE_ZH = """你是一个擅长根据给定的模式从文本中提取信息的专家。
+根据提供的模式，从合同文件中提取与{field}相关的信息。
+操作说明:
+1. 仔细阅读本提示末尾提供的整份文件。
+2. 提取相关信息。
+3. 按照下面指定的JSON格式呈现你的发现。
+
+重要注意事项:
+- 仅提取相关信息。
+- 在确定相关性时，考虑整份文件的上下文。
+- 不要冗长，只需以正确的格式和信息进行回应。
+- 有些文件可能有多个相关摘录——请包含所有适用的内容。
+- 有些问题可能没有相关摘录——只需返回""。
+- 不要在JSON中包含除列出的键之外的其他键。
+- 不要多次包含同一个键。
+- 使用中文回答。
+
+预期的JSON键及其说明:
+{schema_explanation}
+
+预期格式:
+{{
+    "key1": "value1",
+    "key2": "value2",
+    ...
+}}
+
+{examples}
+要提取的文件:
+{text}
+"""
+
+SCHEMA_GUIDED_EXTRACTION_PROMPT = {
+    "en": TEMPLATE_EN,
+    "zh": TEMPLATE_ZH,
+}
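One detail worth noting: the doubled braces in the expected-format block exist so that `str.format` (used by `build_prompt` above) emits literal JSON braces rather than treating them as placeholders:

    template = 'Expected format:\n{{\n  "key1": "value1"\n}}\nText: {text}'
    print(template.format(text="..."))
    # Expected format:
    # {
    #   "key1": "value1"
    # }
    # Text: ...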
graphgen/templates/extraction/schemas/legal_contract.json
ADDED

@@ -0,0 +1,58 @@
+{
+  "type": "object",
+  "name": "legal contract",
+  "description": "A legal contract for leasing property.",
+  "properties": {
+    "leased_space": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Description of the space that is being leased."
+    },
+    "lessee": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The lessee's name (and possibly address)."
+    },
+    "lessor": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The lessor's name (and possibly address)."
+    },
+    "signing_date": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The date the contract was signed."
+    },
+    "start_date": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The start date of the lease."
+    },
+    "end_date": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The end date of the lease."
+    },
+    "term_of_payment": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Description of the payment terms."
+    },
+    "designated_use": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Designated use of the property being leased."
+    },
+    "extension_period": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Description of the extension options for the lease."
+    },
+    "expiration_date_of_lease": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The expiration date of the lease."
+    }
+  },
+  "required": ["lessee", "lessor", "start_date", "end_date"]
+}
graphgen/utils/__init__.py
CHANGED

@@ -9,7 +9,12 @@ from .format import (
     split_string_by_multi_markers,
     write_json,
 )
-from .hash import compute_args_hash, compute_content_hash, compute_mm_hash
+from .hash import (
+    compute_args_hash,
+    compute_content_hash,
+    compute_dict_hash,
+    compute_mm_hash,
+)
 from .help_nltk import NLTKHelper
 from .log import logger, parse_log, set_logger
 from .loop import create_event_loop
graphgen/utils/hash.py
CHANGED

@@ -21,3 +21,8 @@ def compute_mm_hash(item, prefix: str = ""):
     else:
         content = str(item)
     return prefix + md5(content.encode()).hexdigest()
+
+
+def compute_dict_hash(d: dict, prefix: str = ""):
+    items = tuple(sorted(d.items()))
+    return prefix + md5(str(items).encode()).hexdigest()
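Because `compute_dict_hash` sorts the items before hashing, the digest does not depend on key insertion order; a self-contained check:

    from hashlib import md5

    def compute_dict_hash(d: dict, prefix: str = ""):
        items = tuple(sorted(d.items()))
        return prefix + md5(str(items).encode()).hexdigest()

    # Same keys and values, different insertion order: same digest.
    assert compute_dict_hash({"b": 2, "a": 1}) == compute_dict_hash({"a": 1, "b": 2})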
webui/app.py
CHANGED

@@ -103,6 +103,11 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
             "name": "read",
             "params": {
                 "input_file": params.upload_file,
+            },
+        },
+        {
+            "name": "chunk",
+            "params": {
                 "chunk_size": params.chunk_size,
                 "chunk_overlap": params.chunk_overlap,
             },