github-actions[bot] committed
Commit 283e483 · 1 Parent(s): f1eedd1

Auto-sync from demo at Fri Nov 7 10:46:57 UTC 2025

app.py CHANGED
@@ -103,6 +103,11 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
             "name": "read",
             "params": {
                 "input_file": params.upload_file,
+            },
+        },
+        {
+            "name": "chunk",
+            "params": {
                 "chunk_size": params.chunk_size,
                 "chunk_overlap": params.chunk_overlap,
             },
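
For reference, the pipeline list that run_graphgen assembles now carries chunking as its own step. A rough, standalone sketch of the two steps touched here (literal placeholder values stand in for the WebuiParams fields):

pipeline = [
    {"name": "read", "params": {"input_file": "uploaded_file.jsonl"}},  # params.upload_file
    {
        "name": "chunk",
        "params": {"chunk_size": 1024, "chunk_overlap": 100},  # params.chunk_size / params.chunk_overlap
    },
    # ... downstream steps (build_kg, partition, generate, ...) unchanged
]
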
graphgen/bases/__init__.py CHANGED
@@ -1,3 +1,4 @@
+from .base_extractor import BaseExtractor
 from .base_generator import BaseGenerator
 from .base_kg_builder import BaseKGBuilder
 from .base_llm_wrapper import BaseLLMWrapper
graphgen/bases/base_extractor.py ADDED
@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
+
+
+class BaseExtractor(ABC):
+    """
+    Extract information from given text.
+
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper):
+        self.llm_client = llm_client
+
+    @abstractmethod
+    async def extract(self, chunk: dict) -> Any:
+        """Extract information from the given text"""
+
+    @abstractmethod
+    def build_prompt(self, text: str) -> str:
+        """Build prompt for LLM based on the given text"""
graphgen/bases/base_storage.py CHANGED
@@ -45,6 +45,9 @@ class BaseKVStorage(Generic[T], StorageNameSpace):
     ) -> list[Union[T, None]]:
         raise NotImplementedError
 
+    async def get_all(self) -> dict[str, T]:
+        raise NotImplementedError
+
     async def filter_keys(self, data: list[str]) -> set[str]:
         """return un-exist keys"""
         raise NotImplementedError
graphgen/configs/aggregated_config.yaml CHANGED
@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg
 
graphgen/configs/atomic_config.yaml CHANGED
@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
      input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting
 
graphgen/configs/cot_config.yaml CHANGED
@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg
 
graphgen/configs/multi_hop_config.yaml CHANGED
@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
      input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting
 
graphgen/configs/schema_guided_config.yaml ADDED
@@ -0,0 +1,15 @@
+pipeline:
+  - name: read
+    params:
+      input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
+      chunk_size: 20480
+      chunk_overlap: 2000
+      separators: []
+
+  - name: extract
+    params:
+      method: schema_guided # extraction method, support: schema_guided
+      schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
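
A quick way to sanity-check the new config from a Python shell (not part of the commit; assumes PyYAML is installed and paths are repo-relative):

import yaml

with open("graphgen/configs/schema_guided_config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print([step["name"] for step in cfg["pipeline"]])  # ['read', 'chunk', 'extract']
print(cfg["pipeline"][2]["params"]["schema_file"])  # path to legal_contract.json
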
graphgen/configs/vqa_config.yaml CHANGED
@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg
 
graphgen/graphgen.py CHANGED
@@ -18,6 +18,7 @@ from graphgen.models import (
 from graphgen.operators import (
     build_kg,
     chunk_documents,
+    extract_info,
     generate_qas,
     init_llm,
     judge_statement,
@@ -70,6 +71,7 @@ class GraphGen:
         self.search_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="search"
         )
+
         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="rephrase"
         )
@@ -80,6 +82,10 @@
             os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
             namespace="qa",
         )
+        self.extract_storage: JsonKVStorage = JsonKVStorage(
+            os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
+            namespace="extraction",
+        )
 
         # webui
         self.progress_bar: gr.Progress = progress_bar
@@ -103,16 +109,30 @@ class GraphGen:
         _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
 
+        if len(new_docs) == 0:
+            logger.warning("All documents are already in the storage")
+            return
+
+        await self.full_docs_storage.upsert(new_docs)
+        await self.full_docs_storage.index_done_callback()
+
+    @op("chunk", deps=["read"])
+    @async_to_sync_method
+    async def chunk(self, chunk_config: Dict):
+        """
+        chunk documents into smaller pieces from full_docs_storage if not already present
+        """
+
+        new_docs = await self.meta_storage.get_new_data(self.full_docs_storage)
         if len(new_docs) == 0:
             logger.warning("All documents are already in the storage")
             return
 
         inserting_chunks = await chunk_documents(
             new_docs,
-            read_config["chunk_size"],
-            read_config["chunk_overlap"],
             self.tokenizer_instance,
             self.progress_bar,
+            **chunk_config,
         )
 
         _add_chunk_keys = await self.chunks_storage.filter_keys(
@@ -126,12 +146,12 @@ class GraphGen:
             logger.warning("All chunks are already in the storage")
             return
 
-        await self.full_docs_storage.upsert(new_docs)
-        await self.full_docs_storage.index_done_callback()
         await self.chunks_storage.upsert(inserting_chunks)
         await self.chunks_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.full_docs_storage)
+        await self.meta_storage.index_done_callback()
 
-    @op("build_kg", deps=["read"])
+    @op("build_kg", deps=["chunk"])
     @async_to_sync_method
     async def build_kg(self):
         """
@@ -161,7 +181,7 @@ class GraphGen:
 
         return _add_entities_and_relations
 
-    @op("search", deps=["read"])
+    @op("search", deps=["chunk"])
     @async_to_sync_method
     async def search(self, search_config: Dict):
         logger.info(
@@ -248,6 +268,26 @@ class GraphGen:
         await self.partition_storage.upsert(batches)
         return batches
 
+    @op("extract", deps=["chunk"])
+    @async_to_sync_method
+    async def extract(self, extract_config: Dict):
+        logger.info("Extracting information from given chunks...")
+
+        results = await extract_info(
+            self.synthesizer_llm_client,
+            self.chunks_storage,
+            extract_config,
+            progress_bar=self.progress_bar,
+        )
+        if not results:
+            logger.warning("No information extracted")
+            return
+
+        await self.extract_storage.upsert(results)
+        await self.extract_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.chunks_storage)
+        await self.meta_storage.index_done_callback()
+
     @op("generate", deps=["partition"])
     @async_to_sync_method
     async def generate(self, generate_config: Dict):
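
Taken together, the decorator changes rewire the step graph so that "chunk" sits between "read" and every downstream consumer of chunks. A sketch of the resulting dependency edges, using only the names visible in the @op decorators (the empty deps for "read" is an assumption, since its decorator is outside this diff):

op_deps = {
    "read": [],                 # assumed: not shown in this hunk
    "chunk": ["read"],          # new op
    "build_kg": ["chunk"],      # was deps=["read"]
    "search": ["chunk"],        # was deps=["read"]
    "extract": ["chunk"],       # new op
    "generate": ["partition"],  # unchanged
}
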
graphgen/models/extractor/__init__.py ADDED
@@ -0,0 +1 @@
+from .schema_guided_extractor import SchemaGuidedExtractor
graphgen/models/extractor/key_information_extractor.py ADDED
@@ -0,0 +1 @@
+# TODO: text2json
graphgen/models/extractor/schema_guided_extractor.py ADDED
@@ -0,0 +1,101 @@
+import json
+from typing import Dict, List
+
+from graphgen.bases import BaseExtractor, BaseLLMWrapper
+from graphgen.templates import SCHEMA_GUIDED_EXTRACTION_PROMPT
+from graphgen.utils import compute_dict_hash, detect_main_language, logger
+
+
+class SchemaGuidedExtractor(BaseExtractor):
+    """
+    Use JSON/YAML Schema or Pydantic Model to guide the LLM to extract structured information from text.
+
+    Usage example:
+        schema = {
+            "type": "legal contract",
+            "description": "A legal contract for leasing property.",
+            "properties": {
+                "end_date": {"type": "string", "description": "The end date of the lease."},
+                "leased_space": {"type": "string", "description": "Description of the space that is being leased."},
+                "lessee": {"type": "string", "description": "The lessee's name (and possibly address)."},
+                "lessor": {"type": "string", "description": "The lessor's name (and possibly address)."},
+                "signing_date": {"type": "string", "description": "The date the contract was signed."},
+                "start_date": {"type": "string", "description": "The start date of the lease."},
+                "term_of_payment": {"type": "string", "description": "Description of the payment terms."},
+                "designated_use": {"type": "string",
+                                   "description": "Description of the designated use of the property being leased."},
+                "extension_period": {"type": "string",
+                                     "description": "Description of the extension options for the lease."},
+                "expiration_date_of_lease": {"type": "string", "description": "The expiration data of the lease."}
+            },
+            "required": ["lessee", "lessor", "start_date", "end_date"]
+        }
+        extractor = SchemaGuidedExtractor(llm_client, schema)
+        result = extractor.extract(text)
+
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper, schema: dict):
+        super().__init__(llm_client)
+        self.schema = schema
+        self.required_keys = self.schema.get("required")
+        if not self.required_keys:
+            # If no required keys are specified, use all keys from the schema as default
+            self.required_keys = list(self.schema.get("properties", {}).keys())
+
+    def build_prompt(self, text: str) -> str:
+        schema_explanation = ""
+        for field, details in self.schema.get("properties", {}).items():
+            description = details.get("description", "No description provided.")
+            schema_explanation += f'- "{field}": {description}\n'
+
+        lang = detect_main_language(text)
+
+        prompt = SCHEMA_GUIDED_EXTRACTION_PROMPT[lang].format(
+            field=self.schema.get("name", "the document"),
+            schema_explanation=schema_explanation,
+            examples="",
+            text=text,
+        )
+        return prompt
+
+    async def extract(self, chunk: dict) -> dict:
+        text = chunk.get("text", "")
+        prompt = self.build_prompt(text)
+        response = await self.llm_client.generate_answer(prompt)
+        try:
+            extracted_info = json.loads(response)
+            # Ensure all required keys are present
+            for key in self.required_keys:
+                if key not in extracted_info:
+                    extracted_info[key] = ""
+            if any(extracted_info[key] == "" for key in self.required_keys):
+                logger.debug("Missing required keys in extraction: %s", extracted_info)
+                return {}
+            main_keys_info = {key: extracted_info[key] for key in self.required_keys}
+            logger.debug("Extracted info: %s", extracted_info)
+            return {compute_dict_hash(main_keys_info, prefix="extract"): extracted_info}
+        except json.JSONDecodeError:
+            logger.error("Failed to parse extraction response: %s", response)
+            return {}
+
+    async def merge_extractions(
+        self, extraction_list: List[Dict[str, dict]]
+    ) -> Dict[str, dict]:
+        """
+        Merge multiple extraction results based on their hashes.
+        :param extraction_list: List of extraction results, each is a dict with hash as key and record as value.
+        :return: Merged extraction results.
+        """
+        merged: Dict[str, dict] = {}
+        for ext in extraction_list:
+            for h, rec in ext.items():
+                if h not in merged:
+                    merged[h] = rec.copy()
+                else:
+                    for k, v in rec.items():
+                        if k not in merged[h] or merged[h][k] == v:
+                            merged[h][k] = v
+                        else:
+                            merged[h][k] = f"{merged[h][k]}<SEP>{v}"
+        return merged
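
The merge semantics are easiest to see with two toy per-chunk results that share a hash: agreeing values are kept once, conflicting values are joined with "<SEP>". A standalone sketch (the hash key and field values are made up; no LLM is needed because merge_extractions only touches its argument):

import asyncio

from graphgen.models.extractor import SchemaGuidedExtractor

extractor = SchemaGuidedExtractor(llm_client=None, schema={})  # llm_client unused for merging
ext_a = {"extract-abc123": {"lessee": "ACME GmbH", "start_date": "2024-01-01"}}
ext_b = {"extract-abc123": {"lessee": "ACME GmbH", "start_date": "2024-02-01"}}

merged = asyncio.run(extractor.merge_extractions([ext_a, ext_b]))
print(merged)
# {'extract-abc123': {'lessee': 'ACME GmbH', 'start_date': '2024-01-01<SEP>2024-02-01'}}
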
graphgen/models/reader/txt_reader.py CHANGED
@@ -5,10 +5,6 @@ from graphgen.bases.base_reader import BaseReader
 
 class TXTReader(BaseReader):
     def read(self, file_path: str) -> List[Dict[str, Any]]:
-        docs = []
         with open(file_path, "r", encoding="utf-8") as f:
-            for line in f:
-                line = line.strip()
-                if line:
-                    docs.append({self.text_column: line})
+            docs = [{"type": "text", self.text_column: f.read()}]
         return self.filter(docs)
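
The behavioral change: instead of one document per non-empty stripped line, the reader now returns the whole file as a single document tagged "type": "text". A standalone before/after sketch (the "content" field name is an assumption standing in for BaseReader.text_column):

def read_old(path):
    docs = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                docs.append({"content": line})
    return docs


def read_new(path):
    with open(path, "r", encoding="utf-8") as f:
        return [{"type": "text", "content": f.read()}]
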
graphgen/models/storage/json_storage.py CHANGED
@@ -39,6 +39,9 @@ class JsonKVStorage(BaseKVStorage):
             for id in ids
         ]
 
+    async def get_all(self) -> dict[str, str]:
+        return self._data
+
     async def filter_keys(self, data: list[str]) -> set[str]:
         return {s for s in data if s not in self._data}
 
graphgen/operators/__init__.py CHANGED
@@ -1,4 +1,5 @@
 from .build_kg import build_kg
+from .extract import extract_info
 from .generate import generate_qas
 from .init import init_llm
 from .judge import judge_statement
graphgen/operators/extract/__init__.py ADDED
@@ -0,0 +1 @@
+from .extract_info import extract_info
graphgen/operators/extract/extract_info.py ADDED
@@ -0,0 +1,47 @@
+import json
+
+import gradio as gr
+
+from graphgen.bases import BaseKVStorage, BaseLLMWrapper
+from graphgen.models.extractor import SchemaGuidedExtractor
+from graphgen.utils import logger, run_concurrent
+
+
+async def extract_info(
+    llm_client: BaseLLMWrapper,
+    chunk_storage: BaseKVStorage,
+    extract_config: dict,
+    progress_bar: gr.Progress = None,
+):
+    """
+    Extract information from chunks
+    :param llm_client: LLM client
+    :param chunk_storage: storage for chunks
+    :param extract_config
+    :param progress_bar
+    :return: extracted information
+    """
+
+    method = extract_config.get("method")
+    if method == "schema_guided":
+        schema_file = extract_config.get("schema_file")
+        with open(schema_file, "r", encoding="utf-8") as f:
+            schema = json.load(f)
+        extractor = SchemaGuidedExtractor(llm_client, schema)
+    else:
+        raise ValueError(f"Unsupported extraction method: {method}")
+
+    chunks = await chunk_storage.get_all()
+    chunks = [{k: v} for k, v in chunks.items()]
+    logger.info("Start extracting information from %d chunks", len(chunks))
+
+    results = await run_concurrent(
+        extractor.extract,
+        chunks,
+        desc="Extracting information",
+        unit="chunk",
+        progress_bar=progress_bar,
+    )
+
+    results = await extractor.merge_extractions(results)
+    return results
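
The extract step's params from the pipeline config arrive here as a plain dict; for the schema_guided path the function only needs the two keys below, then loads the schema, builds a SchemaGuidedExtractor, pulls every chunk via chunk_storage.get_all(), and fans extraction out with run_concurrent:

extract_config = {
    "method": "schema_guided",
    "schema_file": "graphgen/templates/extraction/schemas/legal_contract.json",
}
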
graphgen/operators/split/split_chunks.py CHANGED
@@ -31,16 +31,18 @@ def split_chunks(text: str, language: str = "en", **kwargs) -> list:
31
  f"Unsupported language: {language}. "
32
  f"Supported languages are: {list(_MAPPING.keys())}"
33
  )
34
- splitter = _get_splitter(language, frozenset(kwargs.items()))
 
 
 
35
  return splitter.split_text(text)
36
 
37
 
38
  async def chunk_documents(
39
  new_docs: dict,
40
- chunk_size: int = 1024,
41
- chunk_overlap: int = 100,
42
  tokenizer_instance: Tokenizer = None,
43
  progress_bar=None,
 
44
  ) -> dict:
45
  inserting_chunks = {}
46
  cur_index = 1
@@ -51,11 +53,11 @@ async def chunk_documents(
51
  doc_type = doc.get("type")
52
  if doc_type == "text":
53
  doc_language = detect_main_language(doc["content"])
 
54
  text_chunks = split_chunks(
55
  doc["content"],
56
  language=doc_language,
57
- chunk_size=chunk_size,
58
- chunk_overlap=chunk_overlap,
59
  )
60
 
61
  chunks = {
 
31
  f"Unsupported language: {language}. "
32
  f"Supported languages are: {list(_MAPPING.keys())}"
33
  )
34
+ frozen_kwargs = frozenset(
35
+ (k, tuple(v) if isinstance(v, list) else v) for k, v in kwargs.items()
36
+ )
37
+ splitter = _get_splitter(language, frozen_kwargs)
38
  return splitter.split_text(text)
39
 
40
 
41
  async def chunk_documents(
42
  new_docs: dict,
 
 
43
  tokenizer_instance: Tokenizer = None,
44
  progress_bar=None,
45
+ **kwargs,
46
  ) -> dict:
47
  inserting_chunks = {}
48
  cur_index = 1
 
53
  doc_type = doc.get("type")
54
  if doc_type == "text":
55
  doc_language = detect_main_language(doc["content"])
56
+
57
  text_chunks = split_chunks(
58
  doc["content"],
59
  language=doc_language,
60
+ **kwargs,
 
61
  )
62
 
63
  chunks = {
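
The tuple conversion exists because list-valued kwargs, such as separators: [] from the new chunk step, are not hashable and so cannot be placed in the frozenset used as the splitter cache key. A standalone check:

kwargs = {"chunk_size": 20480, "chunk_overlap": 2000, "separators": []}

try:
    frozenset(kwargs.items())  # old behavior
except TypeError as e:
    print(e)  # unhashable type: 'list'

frozen_kwargs = frozenset(
    (k, tuple(v) if isinstance(v, list) else v) for k, v in kwargs.items()
)
print(frozen_kwargs)  # lists become tuples, so the key is hashable
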
graphgen/templates/__init__.py CHANGED
@@ -1,5 +1,6 @@
 from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT
 from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT
+from .extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT
 from .generation import (
     AGGREGATED_GENERATION_PROMPT,
     ATOMIC_GENERATION_PROMPT,
graphgen/templates/extraction/__init__.py ADDED
@@ -0,0 +1 @@
+from .schema_guided_extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT
graphgen/templates/extraction/schema_guided_extraction.py ADDED
@@ -0,0 +1,70 @@
+TEMPLATE_EN = """You are an expert at extracting information from text based on a given schema.
+Extract relevant information about {field} from a given contract document according to the provided schema.
+
+Instructions:
+1. Carefully read the entire document provided at the end of this prompt.
+2. Extract the relevant information.
+3. Present your findings in JSON format as specified below.
+
+Important Notes:
+- Extract only relevant information.
+- Consider the context of the entire document when determining relevance.
+- Do not be verbose, only respond with the correct format and information.
+- Some docs may have multiple relevant excerpts -- include all that apply.
+- Some questions may have no relevant excerpts -- just return "".
+- Do not include additional JSON keys beyond the ones listed here.
+- Do not include the same key multiple times in the JSON.
+- Use English for your response.
+
+Expected JSON keys and explanation of what they are:
+{schema_explanation}
+
+Expected format:
+{{
+    "key1": "value1",
+    "key2": "value2",
+    ...
+}}
+
+{examples}
+
+Document to extract from:
+{text}
+"""
+
+TEMPLATE_ZH = """你是一个擅长根据给定的模式从文本中提取信息的专家。
+根据提供的模式,从合同文件中提取与{field}相关的信息。
+操作说明:
+1. 仔细阅读本提示末尾提供的整份文件。
+2. 提取相关信息。
+3. 按照下面指定的JSON格式呈现你的发现。
+
+重要注意事项:
+- 仅提取相关信息。
+- 在确定相关性时,考虑整份文件的上下文。
+- 不要冗长,只需以正确的格式和信息进行回应。
+- 有些文件可能有多个相关摘录——请包含所有适用的内容。
+- 有些问题可能没有相关摘录——只需返回""。
+- 不要在JSON中包含除列出的键之外的其他键。
+- 不要多次包含同一个键。
+- 使用中文回答。
+
+预期的JSON键及其说明:
+{schema_explanation}
+
+预期格式:
+{{
+    "key1": "value1",
+    "key2": "value2",
+    ...
+}}
+
+{examples}
+要提取的文件:
+{text}
+"""
+
+SCHEMA_GUIDED_EXTRACTION_PROMPT = {
+    "en": TEMPLATE_EN,
+    "zh": TEMPLATE_ZH,
+}
graphgen/templates/extraction/schemas/legal_contract.json ADDED
@@ -0,0 +1,58 @@
+{
+  "type": "object",
+  "name": "legal contract",
+  "description": "A legal contract for leasing property.",
+  "properties": {
+    "leased_space": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Description of the space that is being leased."
+    },
+    "lessee": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The lessee's name (and possibly address)."
+    },
+    "lessor": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The lessor's name (and possibly address)."
+    },
+    "signing_date": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The date the contract was signed."
+    },
+    "start_date": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The start date of the lease."
+    },
+    "end_date": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The end date of the lease."
+    },
+    "term_of_payment": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Description of the payment terms."
+    },
+    "designated_use": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Designated use of the property being leased."
+    },
+    "extension_period": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Description of the extension options for the lease."
+    },
+    "expiration_date_of_lease": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The expiration date of the lease."
+    }
+  },
+  "required": ["lessee", "lessor", "start_date", "end_date"]
+}
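
A small check that ties the schema to the extractor's required-key gate (the sample record is made up; SchemaGuidedExtractor applies roughly the same test before hashing a record):

import json

with open("graphgen/templates/extraction/schemas/legal_contract.json", encoding="utf-8") as f:
    schema = json.load(f)

sample = {
    "lessee": ["ACME GmbH"],
    "lessor": ["Jane Doe"],
    "start_date": ["2024-01-01"],
    "end_date": ["2025-01-01"],
}
missing = [k for k in schema["required"] if not sample.get(k)]
print(missing or "all required keys present")
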
graphgen/utils/__init__.py CHANGED
@@ -9,7 +9,12 @@ from .format import (
     split_string_by_multi_markers,
     write_json,
 )
-from .hash import compute_args_hash, compute_content_hash, compute_mm_hash
+from .hash import (
+    compute_args_hash,
+    compute_content_hash,
+    compute_dict_hash,
+    compute_mm_hash,
+)
 from .help_nltk import NLTKHelper
 from .log import logger, parse_log, set_logger
 from .loop import create_event_loop
graphgen/utils/hash.py CHANGED
@@ -21,3 +21,8 @@ def compute_mm_hash(item, prefix: str = ""):
21
  else:
22
  content = str(item)
23
  return prefix + md5(content.encode()).hexdigest()
 
 
 
 
 
 
21
  else:
22
  content = str(item)
23
  return prefix + md5(content.encode()).hexdigest()
24
+
25
+
26
+ def compute_dict_hash(d: dict, prefix: str = ""):
27
+ items = tuple(sorted(d.items()))
28
+ return prefix + md5(str(items).encode()).hexdigest()
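
The helper's useful property is that sorting the items makes the hash independent of insertion order, which is what lets SchemaGuidedExtractor deduplicate records by their required keys. Standalone check (function body copied from the diff):

from hashlib import md5


def compute_dict_hash(d: dict, prefix: str = "") -> str:
    items = tuple(sorted(d.items()))
    return prefix + md5(str(items).encode()).hexdigest()


a = {"lessee": "ACME GmbH", "lessor": "Jane Doe"}
b = {"lessor": "Jane Doe", "lessee": "ACME GmbH"}
assert compute_dict_hash(a, prefix="extract") == compute_dict_hash(b, prefix="extract")
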
webui/app.py CHANGED
@@ -103,6 +103,11 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
             "name": "read",
             "params": {
                 "input_file": params.upload_file,
+            },
+        },
+        {
+            "name": "chunk",
+            "params": {
                 "chunk_size": params.chunk_size,
                 "chunk_overlap": params.chunk_overlap,
             },