"""Beta utility functions to assist in common eval workflows.

These functions may change in the future.
"""

import collections
import datetime
import itertools
import uuid
from collections.abc import Sequence
from typing import Optional, TypeVar

import langsmith.run_trees as rt
import langsmith.schemas as ls_schemas
from langsmith import evaluation as ls_eval
from langsmith._internal._beta_decorator import warn_beta
from langsmith.client import Client


def _convert_ids(run_dict: dict, id_map: dict) -> dict:
    """Convert the IDs in the run dictionary using the provided ID map.

    Parameters:
    - run_dict: The dictionary representing a run.
    - id_map: The dictionary mapping old IDs to new IDs.

    Returns:
    - dict: The updated run dictionary.
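
    Example:
    --------
    A minimal sketch with hypothetical IDs (``old``/``new`` and the
    dotted-order string are illustrative placeholders):

    ```python
    old, new = uuid.uuid4(), uuid.uuid4()
    run = {"dotted_order": f"20240101T000000000001Z{old}"}  # illustrative format
    updated = _convert_ids(run, {old: new})
    assert str(new) in updated["dotted_order"]
    ```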
    """
    do = run_dict["dotted_order"]
    for k, v in id_map.items():
        do = do.replace(str(k), str(v))
    run_dict["dotted_order"] = do
    if run_dict.get("parent_run_id"):
        run_dict["parent_run_id"] = id_map[run_dict["parent_run_id"]]
    if not run_dict.get("extra"):
        run_dict["extra"] = {}
    return run_dict


def _convert_root_run(root: ls_schemas.Run, run_to_example_map: dict) -> list[dict]:
    """Convert the root run and its child runs to a list of dictionaries.

    Parameters:
    - root: The root run to convert.
    - run_to_example_map: The dictionary mapping run IDs to example IDs.

    Returns:
    - The list of converted run dictionaries.
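
    Example:
    --------
    A minimal sketch, assuming ``root_run`` is a ``Run`` loaded with its
    children (e.g. via ``client.read_run(..., load_child_runs=True)``):

    ```python
    example_id = uuid.uuid4()  # placeholder for a real dataset example ID
    cloned = _convert_root_run(root_run, {root_run.id: example_id})
    assert cloned[0]["reference_example_id"] == example_id
    ```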
    """
    runs_ = [root]
    results = []
    id_map: dict = {}
    while runs_:
        src = runs_.pop()
        src_dict = src.dict(exclude={"session_id", "child_run_ids", "parent_run_ids"})
        # Assign a fresh ID to each cloned run, remembering the old -> new mapping.
        id_map[src_dict["id"]] = uuid.uuid4()
        src_dict["id"] = id_map[src_dict["id"]]
        src_dict["trace_id"] = id_map[src_dict["trace_id"]]
        if src.child_runs:
            runs_.extend(src.child_runs)
        results.append(src_dict)
    results = [_convert_ids(r, id_map) for r in results]
    results[0]["reference_example_id"] = run_to_example_map[root.id]
    return results


@warn_beta
def convert_runs_to_test(
    runs: Sequence[ls_schemas.Run],
    *,
    dataset_name: str,
    test_project_name: Optional[str] = None,
    client: Optional[Client] = None,
    load_child_runs: bool = False,
    include_outputs: bool = False,
) -> ls_schemas.TracerSession:
    """Convert the following runs to a dataset + test.

    This makes it easy to sample prod runs into a new regression testing
    workflow and compare against a candidate system.

    Internally, this function does the following:
        1. Create a dataset from the provided production run inputs.
        2. Create a new test project.
        3. Clone the production runs and re-upload against the dataset.

    Parameters:
    - runs: A sequence of runs to be executed as a test.
    - dataset_name: The name of the dataset to associate with the test runs.
    - client: An optional LangSmith client instance. If not provided, a new client will
        be created.
    - load_child_runs: Whether to load child runs when copying runs.
    - test_project_name: The name of the test project to create. Defaults to a
        generated "prod-baseline-" name.
    - include_outputs: Whether to also copy the run outputs into the dataset
        examples.

    Returns:
    - The project containing the cloned runs.

    Example:
    --------
    ```python
    import langsmith
    import random

    client = langsmith.Client()

    # Randomly sample 100 runs from a prod project
    runs = list(client.list_runs(project_name="My Project", execution_order=1))
    sampled_runs = random.sample(runs, min(len(runs), 100))

    runs_as_test(runs, dataset_name="Random Runs")

    # Select runs named "extractor" whose root traces received good feedback
    runs = client.list_runs(
        project_name="<your_project>",
        filter='eq(name, "extractor")',
        trace_filter='and(eq(feedback_key, "user_score"), eq(feedback_score, 1))',
    )
    runs_as_test(runs, dataset_name="Extraction Good")
    ```
    """
    if not runs:
        raise ValueError(f"Expected a non-empty sequence of runs. Received: {runs}")
    client = client or rt.get_cached_client()
    ds = client.create_dataset(dataset_name=dataset_name)
    outputs = [r.outputs for r in runs] if include_outputs else None
    client.create_examples(
        inputs=[r.inputs for r in runs],
        outputs=outputs,
        source_run_ids=[r.id for r in runs],
        dataset_id=ds.id,
    )
    if not load_child_runs:
        runs_to_copy = runs
    else:
        runs_to_copy = [
            client.read_run(r.id, load_child_runs=load_child_runs) for r in runs
        ]
    test_project_name = test_project_name or f"prod-baseline-{uuid.uuid4().hex[:6]}"
    examples = list(client.list_examples(dataset_id=ds.id))
    run_to_example_map = {e.source_run_id: e.id for e in examples}
    dataset_version = (
        examples[0].modified_at if examples[0].modified_at else examples[0].created_at
    )
    to_create = [
        run_dict
        for root_run in runs_to_copy
        for run_dict in _convert_root_run(root_run, run_to_example_map)
    ]
    project = client.create_project(
        project_name=test_project_name,
        reference_dataset_id=ds.id,
        metadata={
            "which": "prod-baseline",
            "dataset_version": dataset_version.isoformat(),
        },
    )
    for new_run in to_create:
        # Shift each cloned run to "now" while preserving its original latency.
        latency = new_run["end_time"] - new_run["start_time"]
        new_run["end_time"] = datetime.datetime.now(tz=datetime.timezone.utc)
        new_run["start_time"] = new_run["end_time"] - latency
        client.create_run(**new_run, project_name=test_project_name)
    client.update_project(
        project.id, end_time=datetime.datetime.now(datetime.timezone.utc)
    )
    return project


def _load_nested_traces(project_name: str, client: Client) -> list[ls_schemas.Run]:
    # Fetch every run in the project and stitch child runs back onto parents.
    runs = client.list_runs(project_name=project_name)
    treemap: dict = collections.defaultdict(list)
    results = []
    all_runs = {}
    for run in runs:
        if run.parent_run_id is not None:
            treemap[run.parent_run_id].append(run)
        else:
            results.append(run)
        all_runs[run.id] = run
    for run_id, child_runs in treemap.items():
        all_runs[run_id].child_runs = sorted(child_runs, key=lambda r: r.dotted_order)
    return results


T = TypeVar("T")
U = TypeVar("U")


def _outer_product(list1: list[T], list2: list[U]) -> list[tuple[T, U]]:
    return list(itertools.product(list1, list2))


@warn_beta
def compute_test_metrics(
    project_name: str,
    *,
    evaluators: list,
    max_concurrency: Optional[int] = 10,
    client: Optional[Client] = None,
) -> None:
    """Compute test metrics for a given test name using a list of evaluators.

    Args:
        project_name (str): The name of the test project to evaluate.
        evaluators (list): A list of evaluators to compute metrics with.
        max_concurrency (Optional[int], optional): The maximum number of concurrent
            evaluations. Defaults to 10.
        client (Optional[Client], optional): The client to use for evaluations.
            Defaults to None.

    Returns:
        None: This function does not return any value.
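
    Example:
    --------
    A minimal sketch, assuming a test project created by
    ``convert_runs_to_test`` (the project name below is a placeholder):

    ```python
    def exact_match(run, example):
        # Hypothetical evaluator: score 1 when outputs match the reference.
        return {"key": "exact_match", "score": int(run.outputs == example.outputs)}

    compute_test_metrics("prod-baseline-abc123", evaluators=[exact_match])
    ```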
    """
    from langsmith import ContextThreadPoolExecutor

    evaluators_ = []
    for func in evaluators:
        if isinstance(func, ls_eval.RunEvaluator):
            evaluators_.append(func)
        elif callable(func):
            evaluators_.append(ls_eval.run_evaluator(func))
        else:
            raise NotImplementedError(
                f"Evaluation not yet implemented for evaluator of type {type(func)}"
            )
    client = client or rt.get_cached_client()
    traces = _load_nested_traces(project_name, client)
    with ContextThreadPoolExecutor(max_workers=max_concurrency) as executor:
        # Evaluate every (run, evaluator) pair; zip(*...) splits the pairs into
        # the two positional argument streams expected by executor.map.
        results = executor.map(
            client.evaluate_run, *zip(*_outer_product(traces, evaluators_))
        )
    # Consume the map generator so every evaluation runs and errors propagate.
    for _ in results:
        pass