| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347 |
- import os
- import json
- import re
- from typing import List, Dict, Any
- from openai import OpenAI
- import config.config
- from tools.documents_extractor import DocumentReader
- from application_extractor.ocr_PP_StructureV3 import LayoutParserClient_application
- from application_extractor.rectify_OCR_result import RectifyClient_application
- from transcript_extractor.rectify_transcript import RectifyClient_transcript
- from evidence_extractor.ocr_paddle_ocr_vl import LayoutParserClient_evidence
- from law_rag.run import law_rag_run
- from backend.embedding import compute_embedding, cosine_similarity
- from backend.db import fetch_similar_cases, store_case_record, init_case_db
- from backend.text_utils import list_files_by_ext, parse_json_from_text, extract_evidence_text_from_ocr
def call_deepseek_json(system_prompt: str, user_content: str, temperature: float = 0.0) -> Any:
    """Send one system/user exchange to the ``deepseek-chat`` model and parse the reply.

    A new ``OpenAI`` client pointed at ``https://api.deepseek.com`` is created
    on every call, authenticated with ``config.config.DEEPSEEK_API``.

    Args:
        system_prompt: Text sent as the ``system`` message.
        user_content: Text sent as the ``user`` message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        Whatever ``parse_json_from_text`` extracts from the stripped reply, or
        the stripped reply text itself when that helper returns ``None``. A
        reply without any text content yields ``""``.
    """
    client = OpenAI(api_key=config.config.DEEPSEEK_API, base_url="https://api.deepseek.com")
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        temperature=temperature,
        stream=False,
    )
    # The message content may be absent (None); treat that as an empty reply
    # instead of failing on ``None.strip()``. Callers already handle a
    # non-dict return value.
    content = (response.choices[0].message.content or "").strip()
    parsed = parse_json_from_text(content)
    return parsed if parsed is not None else content
def extract_application_text(application_dir: str) -> str:
    """Extract the rectified application text from a directory.

    Image files take precedence: when any are found they are parsed with
    ``LayoutParserClient_application`` and document files in the same
    directory are not read. Otherwise the document files are read with
    ``DocumentReader`` and joined with blank lines. In both cases the raw
    text is passed through ``RectifyClient_application.extract_legal_document``.

    Args:
        application_dir: Directory searched via ``list_files_by_ext``.

    Returns:
        The rectified text, or ``""`` when no supported file is found.
    """
    images = list_files_by_ext(application_dir, [".png", ".jpg", ".jpeg", ".bmp", ".gif"])
    documents = list_files_by_ext(application_dir, [".pdf", ".doc", ".docx"])

    if images:
        raw_text = LayoutParserClient_application().parse(images)
    elif documents:
        reader = DocumentReader()
        raw_text = "\n\n".join(reader.process_input(path) for path in documents)
    else:
        # Nothing to extract; do not instantiate the rectify client.
        return ""
    return RectifyClient_application().extract_legal_document(raw_text)
def extract_transcript_text(transcript_dir: str) -> str:
    """Read every supported file in a transcript directory and clean the text.

    Args:
        transcript_dir: Directory searched via ``list_files_by_ext`` for
            document and image files.

    Returns:
        The output of ``RectifyClient_transcript.clean_text`` when it is
        truthy, otherwise the uncleaned concatenation of the file contents;
        ``""`` when the directory yields no supported file.
    """
    supported = [".pdf", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".bmp", ".gif"]
    paths = list_files_by_ext(transcript_dir, supported)
    if paths:
        reader = DocumentReader()
        raw_text = "\n\n".join(map(reader.process_input, paths))
        cleaned = RectifyClient_transcript().clean_text(raw_text)
        # Fall back to the raw text if cleaning produced nothing usable.
        return cleaned if cleaned else raw_text
    return ""
def build_case_profile(application_text: str, transcript_text: str) -> Dict[str, Any]:
    """Ask the model to build a structured case profile.

    Args:
        application_text: Text of the arbitration application.
        transcript_text: Text of the hearing transcript.

    Returns:
        The model's reply when it is a dict containing the key
        ``"case_profile"``; otherwise a skeleton with the same top-level key
        and empty ``parties``, ``claims``, ``background``, ``timeline``,
        ``key_facts`` and ``disputed_facts`` fields.
    """
    system_prompt = """
    你是劳动仲裁案件分析专家。请根据申请书与庭审笔录,构建案件画像。
    仅输出JSON,不要包含解释或Markdown。
    输出格式:
    {
        "case_profile": {
            "parties": "当事人信息与关系",
            "claims": ["仲裁请求1", "仲裁请求2"],
            "background": "事实与理由摘要",
            "timeline": ["关键时间点1", "关键时间点2"],
            "key_facts": ["关键事实1", "关键事实2"],
            "disputed_facts": ["争议事实1", "争议事实2"]
        }
    }
    """
    user_content = "申请书:\n{}\n\n庭审笔录:\n{}".format(application_text, transcript_text)
    reply = call_deepseek_json(system_prompt, user_content)
    if not isinstance(reply, dict) or "case_profile" not in reply:
        # The reply was unusable; hand back an empty profile of the same shape.
        return {
            "case_profile": {
                "parties": "",
                "claims": [],
                "background": "",
                "timeline": [],
                "key_facts": [],
                "disputed_facts": [],
            }
        }
    return reply
def extract_dispute_points(application_text: str, transcript_text: str) -> List[str]:
    """Extract the dispute points of a case, model first, regex second.

    The model is asked for a ``dispute_points`` list. When it yields at least
    one non-blank string, those strings are returned. Otherwise the
    transcript (or, if that is empty, the application) is scanned for a
    ``争议焦点为`` line, whose remainder is split on sentence punctuation and
    on ``1、`` / ``2.`` style numbering.

    Args:
        application_text: Text of the arbitration application.
        transcript_text: Text of the hearing transcript.

    Returns:
        A list of non-blank dispute point strings; empty when neither the
        model nor the regex fallback finds any.
    """
    system_prompt = """
    你是劳动争议分析专家。请从申请书和庭审笔录中提取本案争议焦点。
    如果庭审笔录中有“争议焦点”或“争议焦点为”,优先提取其内容。
    仅输出JSON,不要包含解释或Markdown。
    输出格式:
    {
        "dispute_points": ["争议焦点1", "争议焦点2"]
    }
    """
    user_content = f"申请书:\n{application_text}\n\n庭审笔录:\n{transcript_text}"
    result = call_deepseek_json(system_prompt, user_content)
    if isinstance(result, dict):
        points = result.get("dispute_points", [])
        if isinstance(points, list):
            usable = [p for p in points if isinstance(p, str) and p.strip()]
            # An empty answer should not block the regex fallback below.
            if usable:
                return usable

    # Guard against both texts being None/empty so ``re`` always gets a str.
    text = transcript_text or application_text or ""
    # Accept both ASCII and full-width colons after the marker so the colon
    # never ends up inside the captured text.
    match = re.search(r"争议焦点为[::]?(.*)", text)
    if match:
        # Split on ASCII/full-width semicolons, the ideographic full stop,
        # and enumeration markers such as "1、" or "2.".
        parts = re.split(r"[;;。]\s*|\d+[、..]", match.group(1))
        return [p.strip() for p in parts if p.strip()]
    return []
def list_evidence_categories(evidence_dir: str) -> List[str]:
    """List the evidence categories available under a directory.

    Each immediate sub-directory counts as one category. If there are no
    sub-directories but ``list_files_by_ext`` finds supported files, the
    single pseudo-category ``"未分类"`` is returned.

    Args:
        evidence_dir: Root directory of the evidence material.

    Returns:
        Sorted sub-directory names, ``["未分类"]``, or ``[]`` when the path is
        empty, missing, not a directory, or holds nothing usable.
    """
    # ``isdir`` (rather than ``exists``) so that a path pointing at a regular
    # file is rejected here instead of making ``os.listdir`` raise.
    if not evidence_dir or not os.path.isdir(evidence_dir):
        return []
    subdirs = sorted(
        name
        for name in os.listdir(evidence_dir)
        if os.path.isdir(os.path.join(evidence_dir, name))
    )
    if subdirs:
        return subdirs
    files = list_files_by_ext(
        evidence_dir,
        [".pdf", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".bmp", ".gif"],
    )
    return ["未分类"] if files else []
def select_relevant_categories(dispute_points: List[str], categories: List[str]) -> Dict[str, List[str]]:
    """Ask the model which evidence categories matter for each dispute point.

    Args:
        dispute_points: Dispute points to be matched.
        categories: Known evidence category names; anything the model returns
            that is not in this list is discarded.

    Returns:
        A mapping from dispute point to its non-empty list of valid
        categories. Empty when either input is empty or the reply is
        unusable. If the model repeats a dispute point, the last entry wins.
    """
    if not dispute_points or not categories:
        return {}
    system_prompt = """
    你是证据分析专家。请根据争议焦点选择可能相关的证据类别。
    仅输出JSON,不要包含解释或Markdown。
    输出格式:
    {
        "mapping": [
            {"dispute_point": "争议焦点1", "categories": ["证据类别A", "证据类别B"]},
            {"dispute_point": "争议焦点2", "categories": ["证据类别C"]}
        ]
    }
    """
    user_content = json.dumps(
        {"dispute_points": dispute_points, "evidence_categories": categories},
        ensure_ascii=False,
    )
    result = call_deepseek_json(system_prompt, user_content)
    mapping: Dict[str, List[str]] = {}
    if not isinstance(result, dict):
        return mapping
    items = result.get("mapping", [])
    if not isinstance(items, list):
        return mapping
    for item in items:
        # The model output is untrusted: skip entries that are not objects
        # instead of crashing on ``item.get``.
        if not isinstance(item, dict):
            continue
        point = item.get("dispute_point")
        cats = item.get("categories", [])
        if isinstance(point, str) and isinstance(cats, list):
            valid = [c for c in cats if c in categories]
            if valid:
                mapping[point] = valid
    return mapping
def limit_files(files: List[str], limit: int = 10) -> List[str]:
    """Cap a file list at ``limit`` entries, keeping both ends.

    When the list is longer than ``limit``, the first ``limit // 2`` entries
    and the last ``limit - limit // 2`` entries are kept, in order.

    Args:
        files: File paths to trim.
        limit: Maximum number of entries to keep.

    Returns:
        ``files`` itself when it already fits, a new trimmed list otherwise,
        or ``[]`` when ``limit`` is zero or negative.
    """
    # Without this guard ``limit == 0`` gives ``files[-0:]``, which is the
    # whole list, and negative limits produce overlapping slices.
    if limit <= 0:
        return []
    if len(files) <= limit:
        return files
    head = limit // 2
    tail = limit - head  # always >= 1 here, so ``-tail`` is a real offset
    return files[:head] + files[-tail:]
def ocr_evidence_files(files: List[str]) -> List[Dict[str, Any]]:
    """Turn evidence files into truncated text records.

    All image files are parsed together by ``LayoutParserClient_evidence``
    and produce a single record; each document file is read by
    ``DocumentReader`` and produces its own record. Files with any other
    extension are ignored.

    Args:
        files: Paths of the evidence files to process.

    Returns:
        A list of records. The image record (if any) comes first and carries
        ``files``, ``text`` (at most 2000 characters) and ``lines`` (at most
        200 entries); document records carry ``files`` and ``text``.
    """
    image_suffixes = [".png", ".jpg", ".jpeg", ".bmp", ".gif"]
    document_suffixes = [".pdf", ".doc", ".docx"]

    def suffix_of(path: str) -> str:
        # Extension comparison is case-insensitive.
        return os.path.splitext(path)[1].lower()

    image_paths = [path for path in files if suffix_of(path) in image_suffixes]
    document_paths = [path for path in files if suffix_of(path) in document_suffixes]

    records: List[Dict[str, Any]] = []
    if image_paths:
        parsed = LayoutParserClient_evidence().parse(image_paths)
        extracted = extract_evidence_text_from_ocr(parsed)
        image_record = {
            "files": image_paths,
            "text": extracted["text"][:2000],
            "lines": extracted["lines"][:200],
        }
        records.append(image_record)
    if document_paths:
        reader = DocumentReader()
        for path in document_paths:
            body = reader.process_input(path)
            records.append({"files": [path], "text": body[:2000]})
    return records
def retrieve_laws(dispute_points: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """Look up legal provisions for every dispute point.

    Args:
        dispute_points: Dispute points, each passed on its own to
            ``law_rag_run``.

    Returns:
        A mapping from dispute point to whatever ``law_rag_run`` returned for
        it. Points are queried in order; a repeated point is queried again
        and keeps the last result.
    """
    return {point: law_rag_run(point) for point in dispute_points}
def final_judgement(
    case_profile: Dict[str, Any],
    dispute_points: List[str],
    law_results: Dict[str, Any],
    evidence_results: Dict[str, Any],
    similar_cases: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """Ask the model for the final decision on a case.

    All five inputs are serialised into one JSON document (non-ASCII
    characters kept as-is) and sent as the user message.

    Args:
        case_profile: Structured case profile.
        dispute_points: Dispute points of the case.
        law_results: Legal provisions retrieved per dispute point.
        evidence_results: Evidence text records per category.
        similar_cases: Previously stored cases judged similar.

    Returns:
        The model's reply when it is a dict; otherwise a skeleton with empty
        ``final_decision``, ``reasoning`` and ``dispute_point_findings``.
    """
    system_prompt = """
    你是劳动争议案件裁决分析专家。基于案件画像、争议焦点、证据摘要、相关法律条文与相似案例,给出最终判断。
    仅输出JSON,不要包含解释或Markdown。
    输出格式:
    {
        "final_decision": "最终判断结论",
        "reasoning": "综合理由",
        "dispute_point_findings": [
            {
                "dispute_point": "争议焦点1",
                "finding": "对此争议的判断",
                "evidence_used": ["证据类别A", "证据类别B"],
                "law_applied": ["法律条文ID1", "法律条文ID2"]
            }
        ]
    }
    """
    payload = {
        "case_profile": case_profile,
        "dispute_points": dispute_points,
        "law_results": law_results,
        "evidence_results": evidence_results,
        "similar_cases": similar_cases,
    }
    reply = call_deepseek_json(system_prompt, json.dumps(payload, ensure_ascii=False))
    if not isinstance(reply, dict):
        # Unusable reply: return an empty judgement of the expected shape.
        return {"final_decision": "", "reasoning": "", "dispute_point_findings": []}
    return reply
def build_case_summary_text(case_profile: Dict[str, Any], dispute_points: List[str]) -> str:
    """Flatten a case profile and its dispute points into one summary string.

    The summary has up to four lines, in this order: parties, claims (space
    separated), background, dispute points (space separated). Missing or
    empty sections are omitted.

    Args:
        case_profile: Dict holding the profile under the ``"case_profile"``
            key. Any other shape (including a non-dict value under that key)
            is treated as an empty profile.
        dispute_points: Dispute points appended as the last line.

    Returns:
        The newline-joined summary; ``""`` when nothing is available.
    """
    profile = case_profile.get("case_profile") if isinstance(case_profile, dict) else None
    # The profile originates from model output; a null or non-object value
    # must not crash the ``.get`` lookups below.
    if not isinstance(profile, dict):
        profile = {}

    parts = []
    parties = profile.get("parties")
    if parties:
        parts.append(str(parties))
    claims = profile.get("claims", [])
    if isinstance(claims, list) and claims:
        parts.append(" ".join(str(c) for c in claims))
    background = profile.get("background")
    if background:
        parts.append(str(background))
    if dispute_points:
        # ``str()`` mirrors the handling of claims and tolerates non-string
        # entries.
        parts.append(" ".join(str(p) for p in dispute_points))
    return "\n".join(p for p in parts if p)
def compute_similar_cases(embedding: List[float]) -> List[Dict[str, Any]]:
    """Return the three stored cases most similar to an embedding.

    Up to 50 candidates are fetched with ``fetch_similar_cases`` and
    re-scored with ``cosine_similarity``; candidates scoring zero or below
    are dropped.

    Args:
        embedding: Embedding vector of the current case summary.

    Returns:
        At most three records, highest similarity first, each with
        ``case_id``, ``summary_text``, ``final_judgement`` (the decoded
        ``final_judgement_json`` field, or ``{}`` if it cannot be decoded)
        and ``similarity``.
    """
    candidates = fetch_similar_cases(embedding, top_k=50)
    ranked = []
    for record in candidates:
        similarity = cosine_similarity(embedding, record.get("embedding", []))
        if similarity <= 0:
            continue
        # Named differently from the module-level ``final_judgement``
        # function to avoid shadowing it.
        try:
            stored_judgement = json.loads(record.get("final_judgement_json") or "{}")
        except Exception:
            stored_judgement = {}
        entry = {
            "case_id": record.get("case_id"),
            "summary_text": record.get("summary_text"),
            "final_judgement": stored_judgement,
            "similarity": similarity,
        }
        ranked.append(entry)
    best_first = sorted(ranked, key=lambda entry: entry["similarity"], reverse=True)
    return best_first[:3]
def process_case_text_with_evidence(case_id: str, application_text: str, transcript_text: str, evidence_dir: str) -> Dict[str, Any]:
    """Run the full analysis pipeline on already-extracted case texts.

    Steps, in order: build the case profile, extract dispute points, pick
    the evidence categories to read, extract text from up to 10 files per
    category, retrieve laws, embed the case summary, look up similar cases,
    and request the final judgement.

    Args:
        case_id: Case identifier. Accepted for the caller's convenience; it
            is not used inside this function.
        application_text: Text of the arbitration application.
        transcript_text: Text of the hearing transcript.
        evidence_dir: Root directory of the evidence material.

    Returns:
        A dict with ``case_profile``, ``dispute_points``,
        ``evidence_results``, ``law_results``, ``summary_text``,
        ``embedding``, ``similar_cases`` and ``final_judgement``.
    """
    case_profile = build_case_profile(application_text, transcript_text)
    dispute_points = extract_dispute_points(application_text, transcript_text)
    categories = list_evidence_categories(evidence_dir)
    mapping = select_relevant_categories(dispute_points, categories)

    selected = {cat for cats in mapping.values() for cat in cats}
    if categories and not selected:
        # Nothing was matched: prefer the evidence list folder when there is
        # one, otherwise read every category.
        selected = {"证据清单"} if "证据清单" in categories else set(categories)

    evidence_exts = [".pdf", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".bmp", ".gif"]
    evidence_results = {}
    for category in sorted(selected):
        # "未分类" stands for files lying directly in the evidence root.
        if category == "未分类":
            folder = evidence_dir
        else:
            folder = os.path.join(evidence_dir, category)
        files = limit_files(list_files_by_ext(folder, evidence_exts), 10)
        evidence_results[category] = ocr_evidence_files(files) if files else []

    law_results = retrieve_laws(dispute_points)
    summary_text = build_case_summary_text(case_profile, dispute_points)
    embedding = compute_embedding(summary_text)
    similar_cases = compute_similar_cases(embedding)
    judgement = final_judgement(case_profile, dispute_points, law_results, evidence_results, similar_cases)

    return {
        "case_profile": case_profile,
        "dispute_points": dispute_points,
        "evidence_results": evidence_results,
        "law_results": law_results,
        "summary_text": summary_text,
        "embedding": embedding,
        "similar_cases": similar_cases,
        "final_judgement": judgement,
    }
def process_case_dir(case_dir: str) -> Dict[str, Any]:
    """Process one case directory end to end and persist the result.

    The directory is expected to contain the sub-directories ``申请书``,
    ``庭审笔录`` and ``证据``. Their texts are extracted, analysed with
    ``process_case_text_with_evidence`` and saved with ``store_case_record``.

    Args:
        case_dir: Path of the case directory; its final path component is
            used as the case id.

    Returns:
        A dict with ``case_dir``, the two extracted texts, and the
        ``case_profile``, ``dispute_points``, ``law_results``,
        ``evidence_results``, ``final_judgement`` and ``similar_cases`` of
        the analysis.
    """
    init_case_db()
    # ``normpath`` drops trailing separators of either flavour; stripping
    # only ``os.sep`` left a trailing "/" in place on Windows and produced
    # an empty case id.
    case_id = os.path.basename(os.path.normpath(case_dir))
    application_dir = os.path.join(case_dir, "申请书")
    transcript_dir = os.path.join(case_dir, "庭审笔录")
    evidence_dir = os.path.join(case_dir, "证据")

    application_text = extract_application_text(application_dir)
    transcript_text = extract_transcript_text(transcript_dir)
    result = process_case_text_with_evidence(case_id, application_text, transcript_text, evidence_dir)

    store_case_record(
        case_id,
        result["summary_text"],
        result["case_profile"],
        result["dispute_points"],
        result["law_results"],
        result["evidence_results"],
        result["final_judgement"],
        result["embedding"],
    )
    return {
        "case_dir": case_dir,
        "application_text": application_text,
        "transcript_text": transcript_text,
        "case_profile": result["case_profile"],
        "dispute_points": result["dispute_points"],
        "law_results": result["law_results"],
        "evidence_results": result["evidence_results"],
        "final_judgement": result["final_judgement"],
        "similar_cases": result["similar_cases"],
    }
|