github-actions[bot] committed
Commit 283e483 · 1 Parent(s): f1eedd1

Auto-sync from demo at Fri Nov 7 10:46:57 UTC 2025

app.py CHANGED
@@ -103,6 +103,11 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
             "name": "read",
             "params": {
                 "input_file": params.upload_file,
+            },
+        },
+        {
+            "name": "chunk",
+            "params": {
                 "chunk_size": params.chunk_size,
                 "chunk_overlap": params.chunk_overlap,
             },
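
For reference, the pipeline list that run_graphgen assembles now carries chunking as its own step. A rough, standalone sketch of the two steps touched here (literal placeholder values stand in for the WebuiParams fields):

pipeline = [
    {"name": "read", "params": {"input_file": "uploaded_file.jsonl"}},  # params.upload_file
    {
        "name": "chunk",
        "params": {"chunk_size": 1024, "chunk_overlap": 100},  # params.chunk_size / params.chunk_overlap
    },
    # ... downstream steps (build_kg, partition, generate, ...) unchanged
]
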
graphgen/bases/__init__.py CHANGED
@@ -1,3 +1,4 @@
+from .base_extractor import BaseExtractor
 from .base_generator import BaseGenerator
 from .base_kg_builder import BaseKGBuilder
 from .base_llm_wrapper import BaseLLMWrapper
graphgen/bases/base_extractor.py ADDED
@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
+
+
+class BaseExtractor(ABC):
+    """
+    Extract information from given text.
+
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper):
+        self.llm_client = llm_client
+
+    @abstractmethod
+    async def extract(self, chunk: dict) -> Any:
+        """Extract information from the given text"""
+
+    @abstractmethod
+    def build_prompt(self, text: str) -> str:
+        """Build prompt for LLM based on the given text"""
graphgen/bases/base_storage.py CHANGED
@@ -45,6 +45,9 @@ class BaseKVStorage(Generic[T], StorageNameSpace):
     ) -> list[Union[T, None]]:
         raise NotImplementedError
 
+    async def get_all(self) -> dict[str, T]:
+        raise NotImplementedError
+
     async def filter_keys(self, data: list[str]) -> set[str]:
         """return un-exist keys"""
         raise NotImplementedError
graphgen/configs/aggregated_config.yaml CHANGED
@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg
 
graphgen/configs/atomic_config.yaml CHANGED
@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
      input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting
 
graphgen/configs/cot_config.yaml CHANGED
@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg
 
graphgen/configs/multi_hop_config.yaml CHANGED
@@ -2,6 +2,9 @@ pipeline:
   - name: read
     params:
      input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting
 
graphgen/configs/schema_guided_config.yaml ADDED
@@ -0,0 +1,15 @@
+pipeline:
+  - name: read
+    params:
+      input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
+      chunk_size: 20480
+      chunk_overlap: 2000
+      separators: []
+
+  - name: extract
+    params:
+      method: schema_guided # extraction method, support: schema_guided
+      schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
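
A quick way to sanity-check the new config from a Python shell (not part of the commit; assumes PyYAML is installed and paths are repo-relative):

import yaml

with open("graphgen/configs/schema_guided_config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print([step["name"] for step in cfg["pipeline"]])  # ['read', 'chunk', 'extract']
print(cfg["pipeline"][2]["params"]["schema_file"])  # path to legal_contract.json
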
graphgen/configs/vqa_config.yaml CHANGED
@@ -2,8 +2,11 @@ pipeline:
   - name: read
     params:
       input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-      chunk_size: 1024 # chunk size for text splitting
-      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
 
   - name: build_kg
 
graphgen/graphgen.py CHANGED
@@ -18,6 +18,7 @@ from graphgen.models import (
 from graphgen.operators import (
     build_kg,
     chunk_documents,
+    extract_info,
     generate_qas,
     init_llm,
     judge_statement,
@@ -70,6 +71,7 @@ class GraphGen:
         self.search_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="search"
         )
+
         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="rephrase"
         )
@@ -80,6 +82,10 @@
             os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
             namespace="qa",
         )
+        self.extract_storage: JsonKVStorage = JsonKVStorage(
+            os.path.join(self.working_dir, "data", "graphgen", f"{self.unique_id}"),
+            namespace="extraction",
+        )
 
         # webui
         self.progress_bar: gr.Progress = progress_bar
@@ -103,16 +109,30 @@ class GraphGen:
         _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
 
+        if len(new_docs) == 0:
+            logger.warning("All documents are already in the storage")
+            return
+
+        await self.full_docs_storage.upsert(new_docs)
+        await self.full_docs_storage.index_done_callback()
+
+    @op("chunk", deps=["read"])
+    @async_to_sync_method
+    async def chunk(self, chunk_config: Dict):
+        """
+        chunk documents into smaller pieces from full_docs_storage if not already present
+        """
+
+        new_docs = await self.meta_storage.get_new_data(self.full_docs_storage)
         if len(new_docs) == 0:
             logger.warning("All documents are already in the storage")
             return
 
         inserting_chunks = await chunk_documents(
             new_docs,
-            read_config["chunk_size"],
-            read_config["chunk_overlap"],
             self.tokenizer_instance,
             self.progress_bar,
+            **chunk_config,
         )
 
         _add_chunk_keys = await self.chunks_storage.filter_keys(
@@ -126,12 +146,12 @@ class GraphGen:
             logger.warning("All chunks are already in the storage")
             return
 
-        await self.full_docs_storage.upsert(new_docs)
-        await self.full_docs_storage.index_done_callback()
         await self.chunks_storage.upsert(inserting_chunks)
         await self.chunks_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.full_docs_storage)
+        await self.meta_storage.index_done_callback()
 
-    @op("build_kg", deps=["read"])
+    @op("build_kg", deps=["chunk"])
     @async_to_sync_method
     async def build_kg(self):
         """
@@ -161,7 +181,7 @@ class GraphGen:
 
         return _add_entities_and_relations
 
-    @op("search", deps=["read"])
+    @op("search", deps=["chunk"])
     @async_to_sync_method
     async def search(self, search_config: Dict):
         logger.info(
@@ -248,6 +268,26 @@ class GraphGen:
         await self.partition_storage.upsert(batches)
         return batches
 
+    @op("extract", deps=["chunk"])
+    @async_to_sync_method
+    async def extract(self, extract_config: Dict):
+        logger.info("Extracting information from given chunks...")
+
+        results = await extract_info(
+            self.synthesizer_llm_client,
+            self.chunks_storage,
+            extract_config,
+            progress_bar=self.progress_bar,
+        )
+        if not results:
+            logger.warning("No information extracted")
+            return
+
+        await self.extract_storage.upsert(results)
+        await self.extract_storage.index_done_callback()
+        await self.meta_storage.mark_done(self.chunks_storage)
+        await self.meta_storage.index_done_callback()
+
     @op("generate", deps=["partition"])
     @async_to_sync_method
     async def generate(self, generate_config: Dict):
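
Taken together, the decorator changes rewire the step graph so that "chunk" sits between "read" and every downstream consumer of chunks. A sketch of the resulting dependency edges, using only the names visible in the @op decorators (the empty deps for "read" is an assumption, since its decorator is outside this diff):

op_deps = {
    "read": [],                 # assumed: not shown in this hunk
    "chunk": ["read"],          # new op
    "build_kg": ["chunk"],      # was deps=["read"]
    "search": ["chunk"],        # was deps=["read"]
    "extract": ["chunk"],       # new op
    "generate": ["partition"],  # unchanged
}
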
graphgen/models/extractor/__init__.py ADDED
@@ -0,0 +1 @@
+from .schema_guided_extractor import SchemaGuidedExtractor
graphgen/models/extractor/key_information_extractor.py ADDED
@@ -0,0 +1 @@
+# TODO: text2json
graphgen/models/extractor/schema_guided_extractor.py ADDED
@@ -0,0 +1,101 @@
+import json
+from typing import Dict, List
+
+from graphgen.bases import BaseExtractor, BaseLLMWrapper
+from graphgen.templates import SCHEMA_GUIDED_EXTRACTION_PROMPT
+from graphgen.utils import compute_dict_hash, detect_main_language, logger
+
+
+class SchemaGuidedExtractor(BaseExtractor):
+    """
+    Use JSON/YAML Schema or Pydantic Model to guide the LLM to extract structured information from text.
+
+    Usage example:
+        schema = {
+            "type": "legal contract",
+            "description": "A legal contract for leasing property.",
+            "properties": {
+                "end_date": {"type": "string", "description": "The end date of the lease."},
+                "leased_space": {"type": "string", "description": "Description of the space that is being leased."},
+                "lessee": {"type": "string", "description": "The lessee's name (and possibly address)."},
+                "lessor": {"type": "string", "description": "The lessor's name (and possibly address)."},
+                "signing_date": {"type": "string", "description": "The date the contract was signed."},
+                "start_date": {"type": "string", "description": "The start date of the lease."},
+                "term_of_payment": {"type": "string", "description": "Description of the payment terms."},
+                "designated_use": {"type": "string",
+                                   "description": "Description of the designated use of the property being leased."},
+                "extension_period": {"type": "string",
+                                     "description": "Description of the extension options for the lease."},
+                "expiration_date_of_lease": {"type": "string", "description": "The expiration data of the lease."}
+            },
+            "required": ["lessee", "lessor", "start_date", "end_date"]
+        }
+        extractor = SchemaGuidedExtractor(llm_client, schema)
+        result = extractor.extract(text)
+
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper, schema: dict):
+        super().__init__(llm_client)
+        self.schema = schema
+        self.required_keys = self.schema.get("required")
+        if not self.required_keys:
+            # If no required keys are specified, use all keys from the schema as default
+            self.required_keys = list(self.schema.get("properties", {}).keys())
+
+    def build_prompt(self, text: str) -> str:
+        schema_explanation = ""
+        for field, details in self.schema.get("properties", {}).items():
+            description = details.get("description", "No description provided.")
+            schema_explanation += f'- "{field}": {description}\n'
+
+        lang = detect_main_language(text)
+
+        prompt = SCHEMA_GUIDED_EXTRACTION_PROMPT[lang].format(
+            field=self.schema.get("name", "the document"),
+            schema_explanation=schema_explanation,
+            examples="",
+            text=text,
+        )
+        return prompt
+
+    async def extract(self, chunk: dict) -> dict:
+        text = chunk.get("text", "")
+        prompt = self.build_prompt(text)
+        response = await self.llm_client.generate_answer(prompt)
+        try:
+            extracted_info = json.loads(response)
+            # Ensure all required keys are present
+            for key in self.required_keys:
+                if key not in extracted_info:
+                    extracted_info[key] = ""
+            if any(extracted_info[key] == "" for key in self.required_keys):
+                logger.debug("Missing required keys in extraction: %s", extracted_info)
+                return {}
+            main_keys_info = {key: extracted_info[key] for key in self.required_keys}
+            logger.debug("Extracted info: %s", extracted_info)
+            return {compute_dict_hash(main_keys_info, prefix="extract"): extracted_info}
+        except json.JSONDecodeError:
+            logger.error("Failed to parse extraction response: %s", response)
+            return {}
+
+    async def merge_extractions(
+        self, extraction_list: List[Dict[str, dict]]
+    ) -> Dict[str, dict]:
+        """
+        Merge multiple extraction results based on their hashes.
+        :param extraction_list: List of extraction results, each is a dict with hash as key and record as value.
+        :return: Merged extraction results.
+        """
+        merged: Dict[str, dict] = {}
+        for ext in extraction_list:
+            for h, rec in ext.items():
+                if h not in merged:
+                    merged[h] = rec.copy()
+                else:
+                    for k, v in rec.items():
+                        if k not in merged[h] or merged[h][k] == v:
+                            merged[h][k] = v
+                        else:
+                            merged[h][k] = f"{merged[h][k]}<SEP>{v}"
+        return merged
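
The merge semantics are easiest to see with two toy per-chunk results that share a hash: agreeing values are kept once, conflicting values are joined with "<SEP>". A standalone sketch (the hash key and field values are made up; no LLM is needed because merge_extractions only touches its argument):

import asyncio

from graphgen.models.extractor import SchemaGuidedExtractor

extractor = SchemaGuidedExtractor(llm_client=None, schema={})  # llm_client unused for merging
ext_a = {"extract-abc123": {"lessee": "ACME GmbH", "start_date": "2024-01-01"}}
ext_b = {"extract-abc123": {"lessee": "ACME GmbH", "start_date": "2024-02-01"}}

merged = asyncio.run(extractor.merge_extractions([ext_a, ext_b]))
print(merged)
# {'extract-abc123': {'lessee': 'ACME GmbH', 'start_date': '2024-01-01<SEP>2024-02-01'}}
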
graphgen/models/reader/txt_reader.py CHANGED
@@ -5,10 +5,6 @@ from graphgen.bases.base_reader import BaseReader
 
 class TXTReader(BaseReader):
     def read(self, file_path: str) -> List[Dict[str, Any]]:
-        docs = []
         with open(file_path, "r", encoding="utf-8") as f:
-            for line in f:
-                line = line.strip()
-                if line:
-                    docs.append({self.text_column: line})
+            docs = [{"type": "text", self.text_column: f.read()}]
         return self.filter(docs)
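
The behavioral change: instead of one document per non-empty stripped line, the reader now returns the whole file as a single document tagged "type": "text". A standalone before/after sketch (the "content" field name is an assumption standing in for BaseReader.text_column):

def read_old(path):
    docs = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                docs.append({"content": line})
    return docs


def read_new(path):
    with open(path, "r", encoding="utf-8") as f:
        return [{"type": "text", "content": f.read()}]
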
graphgen/models/storage/json_storage.py CHANGED
@@ -39,6 +39,9 @@ class JsonKVStorage(BaseKVStorage):
             for id in ids
         ]
 
+    async def get_all(self) -> dict[str, str]:
+        return self._data
+
     async def filter_keys(self, data: list[str]) -> set[str]:
         return {s for s in data if s not in self._data}
 
graphgen/operators/__init__.py CHANGED
@@ -1,4 +1,5 @@
 from .build_kg import build_kg
+from .extract import extract_info
 from .generate import generate_qas
 from .init import init_llm
 from .judge import judge_statement
graphgen/operators/extract/__init__.py ADDED
@@ -0,0 +1 @@
+from .extract_info import extract_info
graphgen/operators/extract/extract_info.py ADDED
@@ -0,0 +1,47 @@
+import json
+
+import gradio as gr
+
+from graphgen.bases import BaseKVStorage, BaseLLMWrapper
+from graphgen.models.extractor import SchemaGuidedExtractor
+from graphgen.utils import logger, run_concurrent
+
+
+async def extract_info(
+    llm_client: BaseLLMWrapper,
+    chunk_storage: BaseKVStorage,
+    extract_config: dict,
+    progress_bar: gr.Progress = None,
+):
+    """
+    Extract information from chunks
+    :param llm_client: LLM client
+    :param chunk_storage: storage for chunks
+    :param extract_config
+    :param progress_bar
+    :return: extracted information
+    """
+
+    method = extract_config.get("method")
+    if method == "schema_guided":
+        schema_file = extract_config.get("schema_file")
+        with open(schema_file, "r", encoding="utf-8") as f:
+            schema = json.load(f)
+        extractor = SchemaGuidedExtractor(llm_client, schema)
+    else:
+        raise ValueError(f"Unsupported extraction method: {method}")
+
+    chunks = await chunk_storage.get_all()
+    chunks = [{k: v} for k, v in chunks.items()]
+    logger.info("Start extracting information from %d chunks", len(chunks))
+
+    results = await run_concurrent(
+        extractor.extract,
+        chunks,
+        desc="Extracting information",
+        unit="chunk",
+        progress_bar=progress_bar,
+    )
+
+    results = await extractor.merge_extractions(results)
+    return results
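
The extract step's params from the pipeline config arrive here as a plain dict; for the schema_guided path the function only needs the two keys below, then loads the schema, builds a SchemaGuidedExtractor, pulls every chunk via chunk_storage.get_all(), and fans extraction out with run_concurrent:

extract_config = {
    "method": "schema_guided",
    "schema_file": "graphgen/templates/extraction/schemas/legal_contract.json",
}
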
graphgen/operators/split/split_chunks.py CHANGED
@@ -31,16 +31,18 @@ def split_chunks(text: str, language: str = "en", **kwargs) -> list:
31
  f"Unsupported language: {language}. "
32
  f"Supported languages are: {list(_MAPPING.keys())}"
33
  )
34
- splitter = _get_splitter(language, frozenset(kwargs.items()))
 
 
 
35
  return splitter.split_text(text)
36
 
37
 
38
  async def chunk_documents(
39
  new_docs: dict,
40
- chunk_size: int = 1024,
41
- chunk_overlap: int = 100,
42
  tokenizer_instance: Tokenizer = None,
43
  progress_bar=None,
 
44
  ) -> dict:
45
  inserting_chunks = {}
46
  cur_index = 1
@@ -51,11 +53,11 @@ async def chunk_documents(
51
  doc_type = doc.get("type")
52
  if doc_type == "text":
53
  doc_language = detect_main_language(doc["content"])
 
54
  text_chunks = split_chunks(
55
  doc["content"],
56
  language=doc_language,
57
- chunk_size=chunk_size,
58
- chunk_overlap=chunk_overlap,
59
  )
60
 
61
  chunks = {
 
31
  f"Unsupported language: {language}. "
32
  f"Supported languages are: {list(_MAPPING.keys())}"
33
  )
34
+ frozen_kwargs = frozenset(
35
+ (k, tuple(v) if isinstance(v, list) else v) for k, v in kwargs.items()
36
+ )
37
+ splitter = _get_splitter(language, frozen_kwargs)
38
  return splitter.split_text(text)
39
 
40
 
41
  async def chunk_documents(
42
  new_docs: dict,
 
 
43
  tokenizer_instance: Tokenizer = None,
44
  progress_bar=None,
45
+ **kwargs,
46
  ) -> dict:
47
  inserting_chunks = {}
48
  cur_index = 1
 
53
  doc_type = doc.get("type")
54
  if doc_type == "text":
55
  doc_language = detect_main_language(doc["content"])
56
+
57
  text_chunks = split_chunks(
58
  doc["content"],
59
  language=doc_language,
60
+ **kwargs,
 
61
  )
62
 
63
  chunks = {
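
The tuple conversion exists because list-valued kwargs, such as separators: [] from the new chunk step, are not hashable and so cannot be placed in the frozenset used as the splitter cache key. A standalone check:

kwargs = {"chunk_size": 20480, "chunk_overlap": 2000, "separators": []}

try:
    frozenset(kwargs.items())  # old behavior
except TypeError as e:
    print(e)  # unhashable type: 'list'

frozen_kwargs = frozenset(
    (k, tuple(v) if isinstance(v, list) else v) for k, v in kwargs.items()
)
print(frozen_kwargs)  # lists become tuples, so the key is hashable
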
graphgen/templates/__init__.py CHANGED
@@ -1,5 +1,6 @@
 from .coreference_resolution import COREFERENCE_RESOLUTION_PROMPT
 from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT
+from .extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT
 from .generation import (
     AGGREGATED_GENERATION_PROMPT,
     ATOMIC_GENERATION_PROMPT,
graphgen/templates/extraction/__init__.py ADDED
@@ -0,0 +1 @@
+from .schema_guided_extraction import SCHEMA_GUIDED_EXTRACTION_PROMPT
graphgen/templates/extraction/schema_guided_extraction.py ADDED
@@ -0,0 +1,70 @@
+TEMPLATE_EN = """You are an expert at extracting information from text based on a given schema.
+Extract relevant information about {field} from a given contract document according to the provided schema.
+
+Instructions:
+1. Carefully read the entire document provided at the end of this prompt.
+2. Extract the relevant information.
+3. Present your findings in JSON format as specified below.
+
+Important Notes:
+- Extract only relevant information.
+- Consider the context of the entire document when determining relevance.
+- Do not be verbose, only respond with the correct format and information.
+- Some docs may have multiple relevant excerpts -- include all that apply.
+- Some questions may have no relevant excerpts -- just return "".
+- Do not include additional JSON keys beyond the ones listed here.
+- Do not include the same key multiple times in the JSON.
+- Use English for your response.
+
+Expected JSON keys and explanation of what they are:
+{schema_explanation}
+
+Expected format:
+{{
+    "key1": "value1",
+    "key2": "value2",
+    ...
+}}
+
+{examples}
+
+Document to extract from:
+{text}
+"""
+
+TEMPLATE_ZH = """你是一个擅长根据给定的模式从文本中提取信息的专家。
+根据提供的模式,从合同文件中提取与{field}相关的信息。
+操作说明:
+1. 仔细阅读本提示末尾提供的整份文件。
+2. 提取相关信息。
+3. 按照下面指定的JSON格式呈现你的发现。
+
+重要注意事项:
+- 仅提取相关信息。
+- 在确定相关性时,考虑整份文件的上下文。
+- 不要冗长,只需以正确的格式和信息进行回应。
+- 有些文件可能有多个相关摘录——请包含所有适用的内容。
+- 有些问题可能没有相关摘录——只需返回""。
+- 不要在JSON中包含除列出的键之外的其他键。
+- 不要多次包含同一个键。
+- 使用中文回答。
+
+预期的JSON键及其说明:
+{schema_explanation}
+
+预期格式:
+{{
+    "key1": "value1",
+    "key2": "value2",
+    ...
+}}
+
+{examples}
+要提取的文件:
+{text}
+"""
+
+SCHEMA_GUIDED_EXTRACTION_PROMPT = {
+    "en": TEMPLATE_EN,
+    "zh": TEMPLATE_ZH,
+}
graphgen/templates/extraction/schemas/legal_contract.json ADDED
@@ -0,0 +1,58 @@
+{
+  "type": "object",
+  "name": "legal contract",
+  "description": "A legal contract for leasing property.",
+  "properties": {
+    "leased_space": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Description of the space that is being leased."
+    },
+    "lessee": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The lessee's name (and possibly address)."
+    },
+    "lessor": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The lessor's name (and possibly address)."
+    },
+    "signing_date": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The date the contract was signed."
+    },
+    "start_date": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The start date of the lease."
+    },
+    "end_date": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The end date of the lease."
+    },
+    "term_of_payment": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Description of the payment terms."
+    },
+    "designated_use": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Designated use of the property being leased."
+    },
+    "extension_period": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "Description of the extension options for the lease."
+    },
+    "expiration_date_of_lease": {
+      "type": "array",
+      "items": {"type": "string"},
+      "description": "The expiration date of the lease."
+    }
+  },
+  "required": ["lessee", "lessor", "start_date", "end_date"]
+}
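
A small check that ties the schema to the extractor's required-key gate (the sample record is made up; SchemaGuidedExtractor applies roughly the same test before hashing a record):

import json

with open("graphgen/templates/extraction/schemas/legal_contract.json", encoding="utf-8") as f:
    schema = json.load(f)

sample = {
    "lessee": ["ACME GmbH"],
    "lessor": ["Jane Doe"],
    "start_date": ["2024-01-01"],
    "end_date": ["2025-01-01"],
}
missing = [k for k in schema["required"] if not sample.get(k)]
print(missing or "all required keys present")
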
graphgen/utils/__init__.py CHANGED
@@ -9,7 +9,12 @@ from .format import (
     split_string_by_multi_markers,
     write_json,
 )
-from .hash import compute_args_hash, compute_content_hash, compute_mm_hash
+from .hash import (
+    compute_args_hash,
+    compute_content_hash,
+    compute_dict_hash,
+    compute_mm_hash,
+)
 from .help_nltk import NLTKHelper
 from .log import logger, parse_log, set_logger
 from .loop import create_event_loop
graphgen/utils/hash.py CHANGED
@@ -21,3 +21,8 @@ def compute_mm_hash(item, prefix: str = ""):
21
  else:
22
  content = str(item)
23
  return prefix + md5(content.encode()).hexdigest()
 
 
 
 
 
 
21
  else:
22
  content = str(item)
23
  return prefix + md5(content.encode()).hexdigest()
24
+
25
+
26
+ def compute_dict_hash(d: dict, prefix: str = ""):
27
+ items = tuple(sorted(d.items()))
28
+ return prefix + md5(str(items).encode()).hexdigest()
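
The helper's useful property is that sorting the items makes the hash independent of insertion order, which is what lets SchemaGuidedExtractor deduplicate records by their required keys. Standalone check (function body copied from the diff):

from hashlib import md5


def compute_dict_hash(d: dict, prefix: str = "") -> str:
    items = tuple(sorted(d.items()))
    return prefix + md5(str(items).encode()).hexdigest()


a = {"lessee": "ACME GmbH", "lessor": "Jane Doe"}
b = {"lessor": "Jane Doe", "lessee": "ACME GmbH"}
assert compute_dict_hash(a, prefix="extract") == compute_dict_hash(b, prefix="extract")
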
webui/app.py CHANGED
@@ -103,6 +103,11 @@ def run_graphgen(params: WebuiParams, progress=gr.Progress()):
             "name": "read",
             "params": {
                 "input_file": params.upload_file,
+            },
+        },
+        {
+            "name": "chunk",
+            "params": {
                 "chunk_size": params.chunk_size,
                 "chunk_overlap": params.chunk_overlap,
             },