o
    mh                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlZeejj e  e d Ze d Ze d	 Ze d
dZe ddZdZdZeeZdZdZeegeefedZdd Zdedee	 fddZdS )    N)load_dotenv)Elasticsearch)BeautifulSoup)ListDict)SentenceTransformerES_HOSTES_USERES_PASS	INDEX_RAGdocumentos_ragINDEX_ORIGEMdecisoesFzall-MiniLM-L6-v2d      )hosts
basic_authverify_certsc                 C   sT   t | d}|g dD ]}|  q|jddd}dd |dD }d|}|S )Nzhtml.parser)stylescriptinputimg
T)	separatorstripc                 S   s   g | ]
}|  r|  qS  )r   ).0liner   r   $/var/www/observatorio/data_search.py
<listcomp>.   s    z%extract_html_text.<locals>.<listcomp>)r   	decomposeget_textsplitjoin)htmlsouptagtextlinescleaned_textr   r   r   extract_html_text#   s   


r*   queryreturnc                 C   s  zt |  }tjttddi idd|iddidgdd	}g }|d
 d
 D ]}d|d v r9||d d  q(|sBddigW S g }|dtd  D ]Y}zEtjtddd|iidd	}|d
 d
 r|d
 d
 d d }|t	|
dd|
dd|
dd|
di 
dd|
ddd W qL ty }	 zW Y d}	~	qLd}	~	ww i }
|D ]}d|v r||
|d < qt|
 dt W S  ty }	 zddt|	 igW  Y d}	~	S d}	~	ww )z(Simplified but effective search functionscript_score	match_allz8cosineSimilarity(params.query_vector, 'embedding') + 1.0query_vector)sourceparams)r+   r   conteudo)sizer+   _source)indexbodyhitsr4   errorz!No content found in vector searchN   matchconteudoHtml)r3   r+   r   u   [Sem conteúdo]textoPublicacaozN/AnumeroPublicacaodecisaonumeroUnicodataPublicacao)conteudo_htmltexto_publicacaonumero_publicacaonumero_unicodata_publicacaorD   zSearch error: )embedderencodetolistessearchr   MAX_VECTOR_RESULTSappendr   r*   get	ExceptionlistvaluesMAX_FINAL_RESULTSstr)r+   	embedding
vector_res
paragraphshitmatched_docs	paragraphfull_resdoceunique_docsr   r   r   retrieve_semantic_docs5   st   	


 r]   ) osdotenvr   elasticsearchr   bs4r   typingr   r   sentence_transformersr   urllib3disable_warnings
exceptionsInsecureRequestWarninggetenvr   r   r	   r
   r   r   VERIFY_CERTSEMBED_MODELrF   rK   rQ   rI   r*   rR   r]   r   r   r   r   <module>   s4    