| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- """
- Bid field extraction demo - demonstrates structured information extraction
- from bidding announcements using RAG
- """
- import sys
- import os
- import json
- sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
- from openai import OpenAI
- from bdirag.document_processor import Document
- from bdirag.embedding_models import SentenceTransformerEmbedding
- from bdirag.vector_stores import FAISSStore
- from bdirag.rag_methods import BidFieldExtractionRAG
- from examples.sample_data import SAMPLE_BIDDING_DOCS
def main():
    """Run the bid field extraction demo end to end.

    Steps, mirrored by the progress messages printed to stdout:

    1. Load the sentence-transformer embedding model on CPU.
    2. Wrap every entry of ``SAMPLE_BIDDING_DOCS`` in a ``Document`` and
       index the documents through ``BidFieldExtractionRAG``.
    3. Issue one extraction query per sample document and print the answer,
       the total latency and the number of retrieved chunks.

    The LLM connection is configured from environment variables:
    ``OPENAI_API_KEY`` (default ``"your-api-key-here"``),
    ``OPENAI_BASE_URL`` (default ``"https://api.openai.com/v1"``) and
    ``LLM_MODEL`` (default ``"gpt-4o"``).

    A failure while querying one document is reported and the loop moves on
    to the next document instead of aborting the whole demo.
    """
    separator = "=" * 60

    print(separator)
    print("BidiRAG - Bid Field Extraction Demo")
    print(separator)

    # Configuration
    LLM_API_KEY = os.getenv("OPENAI_API_KEY", "your-api-key-here")
    LLM_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
    LLM_MODEL = os.getenv("LLM_MODEL", "gpt-4o")

    # Load embedding model
    print("\n[1/3] Loading embedding model...")
    embedding_model = SentenceTransformerEmbedding(
        model_name="BAAI/bge-large-zh-v1.5",
        device="cpu",
    )

    # Create vector store and index
    print("\n[2/3] Indexing bidding documents...")
    vector_store = FAISSStore(embedding_model=embedding_model)
    documents = [
        Document(
            page_content=doc["content"],
            # The title doubles as the source identifier for each sample.
            metadata={"title": doc["title"], "source": doc["title"]},
        )
        for doc in SAMPLE_BIDDING_DOCS
    ]

    llm_client = OpenAI(api_key=LLM_API_KEY, base_url=LLM_BASE_URL)
    rag = BidFieldExtractionRAG(
        embedding_model=embedding_model,
        vector_store=vector_store,
        llm_client=llm_client,
        llm_model=LLM_MODEL,
    )
    rag.index_documents(documents)
    print(f"  Indexed {len(documents)} documents")

    # Extract fields for each bidding document
    print("\n[3/3] Extracting fields from bidding documents...")
    for doc in SAMPLE_BIDDING_DOCS:
        print(f"\n{separator}")
        print(f"Document: {doc['title']}")
        print(separator)

        query = f"Extract all information from {doc['title']}"
        try:
            result = rag.query(query, k=10)
            print("\nExtracted JSON:")
            print(result.answer)
            print(f"\nLatency: {result.latency_total:.3f}s")
            print(f"Retrieved {len(result.retrieved_docs)} document chunks")
        except Exception as e:
            # Demo-level boundary: report the failure and keep processing
            # the remaining documents.
            print(f"ERROR: {e}")

    print("\n\nExtraction complete!")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|