import os
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup
from typing import List, Dict
from sentence_transformers import SentenceTransformer

# Silence SSL certificate warnings (verify_certs=False below)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


load_dotenv()


def _require_env(name: str) -> str:
    """Return the stripped value of a required environment variable.

    Raises RuntimeError with a clear message when the variable is unset,
    instead of the confusing AttributeError that `os.getenv(...).strip()`
    would produce on None.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value.strip()


# Elasticsearch connection settings (required).
ES_HOST = _require_env("ES_HOST")
ES_USER = _require_env("ES_USER")
ES_PASS = _require_env("ES_PASS")
# Index names (optional, with defaults).
INDEX_RAG = os.getenv("INDEX_RAG", "documentos_rag")
INDEX_ORIGEM = os.getenv("INDEX_ORIGEM", "decisoes")
# NOTE(review): certificate verification is disabled (and the warning is
# silenced above) — acceptable for development, should be True in production.
VERIFY_CERTS = False

EMBED_MODEL = 'all-MiniLM-L6-v2'
# EMBED_MODEL = 'paraphrase-multilingual-MiniLM-L12-v2'  # multilingual alternative
embedder = SentenceTransformer(EMBED_MODEL)

MAX_VECTOR_RESULTS = 100  # Number of initial vector matches to consider
MAX_FINAL_RESULTS = 5   # Final number of documents to return

es = Elasticsearch(
    hosts=[ES_HOST],
    basic_auth=(ES_USER, ES_PASS),
    verify_certs=VERIFY_CERTS,
)


def extract_html_text(html):
    """Convert an HTML fragment into clean plain text.

    Removes <style>, <script>, <input> and <img> elements entirely, then
    returns the remaining text as non-empty, whitespace-stripped lines
    joined by single newlines.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Drop elements whose content must never leak into the extracted text.
    for unwanted in soup(['style', 'script', 'input', 'img']):
        unwanted.decompose()

    # Flatten to text, then collapse blank lines and edge whitespace.
    raw_text = soup.get_text(separator='\n', strip=True)
    stripped_lines = (line.strip() for line in raw_text.split('\n'))
    return '\n'.join(line for line in stripped_lines if line)



def retrieve_semantic_docs(query: str) -> List[Dict]:
    """Retrieve the documents most semantically relevant to *query*.

    Pipeline:
      1. Embed the query and rank paragraphs in INDEX_RAG by cosine
         similarity (script_score over the 'embedding' dense-vector field).
      2. Resolve each candidate paragraph to its full source document in
         INDEX_ORIGEM by matching the paragraph text against 'conteudoHtml'.
      3. De-duplicate by the decision's unique number and return at most
         MAX_FINAL_RESULTS documents.

    Returns:
        A list of result dicts on success; on failure, a single-element
        list of the form [{"error": "..."}] (callers depend on this shape).
    """
    try:
        # 1. Embed the query and run the vector search on the RAG index.
        embedding = embedder.encode(query).tolist()
        vector_res = es.search(
            index=INDEX_RAG,
            body={
                "size": MAX_VECTOR_RESULTS,
                "query": {
                    "script_score": {
                        "query": {"match_all": {}},
                        "script": {
                            # +1.0 shifts scores to be non-negative, as
                            # Elasticsearch requires for script_score.
                            "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                            "params": {"query_vector": embedding}
                        }
                    }
                },
                "_source": ["conteudo"]
            }
        )

        # 2. Collect paragraph texts from the vector hits.
        paragraphs = [
            hit['_source']['conteudo']
            for hit in vector_res['hits']['hits']
            if 'conteudo' in hit['_source']
        ]
        if not paragraphs:
            return [{"error": "No content found in vector search"}]

        # 3. Resolve paragraphs to source documents, de-duplicating as we
        # go so we can stop early instead of issuing up to
        # MAX_VECTOR_RESULTS sequential searches. (The previous
        # `paragraphs[:MAX_VECTOR_RESULTS + 1]` slice was a no-op, since
        # the vector search already caps hits at MAX_VECTOR_RESULTS.)
        unique_docs: Dict[str, Dict] = {}
        for position, paragraph in enumerate(paragraphs):
            if len(unique_docs) >= MAX_FINAL_RESULTS:
                break  # enough unique documents; skip remaining lookups

            try:
                full_res = es.search(
                    index=INDEX_ORIGEM,
                    body={
                        "size": 1,
                        "query": {
                            "match": {
                                "conteudoHtml": paragraph
                            }
                        }
                    }
                )
            except Exception:
                continue  # best-effort: skip paragraphs whose lookup fails

            hits = full_res['hits']['hits']
            if not hits:
                continue

            doc = hits[0]['_source']
            numero_unico = doc.get("decisao", {}).get("numeroUnico", "N/A")
            # Documents lacking a unique number must not all collapse onto
            # the shared "N/A" key (which silently dropped results before);
            # give each one its own synthetic de-duplication key instead.
            key = numero_unico if numero_unico != "N/A" else f"_sem_numero_{position}"
            unique_docs[key] = {
                "conteudo_html": extract_html_text(doc.get("conteudoHtml", "[Sem conteúdo]")),
                "texto_publicacao": doc.get("textoPublicacao", "N/A"),
                "numero_publicacao": doc.get("numeroPublicacao", "N/A"),
                "numero_unico": numero_unico,
                "data_publicacao": doc.get("dataPublicacao", "N/A")
            }

        return list(unique_docs.values())[:MAX_FINAL_RESULTS]

    except Exception as e:
        # Top-level guard: callers expect an error dict, not an exception.
        return [{"error": f"Search error: {str(e)}"}]

