euroeval.metrics.tool_calling

"""Tool calling metric."""

import collections.abc as c
import json
import logging
import typing as t

from ..logging_utils import log_once

if t.TYPE_CHECKING:
    from datasets.arrow_dataset import Dataset

    from ..data_models import BenchmarkConfig, DatasetConfig

from ..metrics.base import Metric


class ToolCallingAccuracy(Metric):
    """Metric for tool calling."""

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Calculate tool calling accuracy.

        Args:
            predictions:
                Predicted "labels", i.e. tool calls in this context.
            references:
                Ground truth data. Note that the format differs from the
                predictions, since the ground truth contains lists of acceptable
                outputs rather than a single "truth".
            dataset:
                The dataset, used for tool information such as required arguments.
            dataset_config:
                Part of the interface; not used here.
            benchmark_config:
                Part of the interface; not used here.

        Returns:
            The accuracy score, or None if any of the predictions, references or
            dataset["function"] sequences is empty, meaning a score could not be
            computed.
        """
        function_descriptions = [json.loads(f) for f in dataset["function"]]
        results = []
        for x in zip(predictions, references, function_descriptions):
            results.append(_evaluate_function_toolcall_response(*x))
        if not results:
            return None
        else:
            return sum(results) / len(results)


def _evaluate_function_toolcall_response(
    pred_calls_str: str, ref_calls_str: str, descriptions: list[dict]
) -> bool:
    """Logic to evaluate tool call response against reference (ground truth).

    Args:
        pred_calls_str:
            Predicted function calls as a JSON string.
        ref_calls_str:
            Reference function calls as a JSON string.
        descriptions:
            Function descriptions (stored in the dataset and given as input to
            the models).

    Returns:
        True on success, False on failure.
    """
    # try to deserialize the prediction
    try:
        pred_calls_dict = json.loads(pred_calls_str)
        assert isinstance(pred_calls_dict, dict)
        assert "tool_calls" in pred_calls_dict
        pred_calls = pred_calls_dict["tool_calls"]
    except (json.JSONDecodeError, AssertionError):
        return False

    ref_calls = json.loads(ref_calls_str)

    # the number of predicted function calls should match the number of reference calls
    if len(pred_calls) != len(ref_calls):
        return False

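    # Expected shapes, as inferred from the checks below:
    #   pred_call: {"function": "<name>", "arguments": {"<arg>": <value>, ...}}
    #   ref_call:  {"<name>": {"<arg>": [<accepted values>, ...], ...}}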
    for pred_call, ref_call, description in zip(pred_calls, ref_calls, descriptions):
        # each predicted function call should be a dict
        if not isinstance(pred_call, dict):
            return False

        # get predicted function name
        if "function" not in pred_call:
            log_once(
                "Tool call prediction did not contain required keyword 'function'.",
                level=logging.DEBUG,
            )
            return False
        else:
            pred_name: str = pred_call["function"]

        # get predicted arguments
        if "arguments" not in pred_call:
            log_once(
                "Tool call prediction did not contain required keyword 'arguments'.",
                level=logging.DEBUG,
            )
            return False
        else:
            pred_args: dict = pred_call["arguments"]

        ref_name: str
        ref_args: dict
        # each reference call is wrapped in an extra single-item mapping by the BFCL format
        ref_name, ref_args = list(ref_call.items())[0]

        # did we predict the right function to call?
        if pred_name != ref_name:
            return False

        # get the required arguments from the function description
        parameters = description.get("parameters", None)
        required_args = (
            parameters.get("required", None) if isinstance(parameters, dict) else None
        )

        for key, values in ref_args.items():
            # we only care about required arguments
            if required_args and key not in required_args:
                continue
            # every required argument must be predicted, and its value must be
            # among the accepted reference values
            if key not in pred_args or pred_args[key] not in values:
                return False
    return True


tool_calling_accuracy = ToolCallingAccuracy(
    name="tool_calling_accuracy", pretty_name="Tool Calling Accuracy"
)
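
The sketch below is purely illustrative and not part of the module: it shows the JSON shapes that _evaluate_function_toolcall_response expects, as inferred from the parsing logic above. The get_weather tool, its arguments and the extra "name" field in the description are hypothetical. The call should evaluate to True, since the predicted city matches one of the accepted reference values and the unit argument is skipped because it is not listed under the description's required parameters.

import json

from euroeval.metrics.tool_calling import _evaluate_function_toolcall_response

# Predicted tool calls, wrapped in a "tool_calls" key as the metric expects.
prediction = json.dumps(
    {
        "tool_calls": [
            {
                "function": "get_weather",
                "arguments": {"city": "Copenhagen", "unit": "celsius"},
            }
        ]
    }
)

# Reference calls: each is a single-item mapping from function name to a dict
# of argument names mapped to lists of accepted values.
reference = json.dumps(
    [{"get_weather": {"city": ["Copenhagen", "København"], "unit": ["celsius"]}}]
)

# One description per expected call; only "parameters" -> "required" is read by
# the metric, the remaining fields are hypothetical.
descriptions = [
    {
        "name": "get_weather",
        "parameters": {
            "properties": {"city": {"type": "string"}, "unit": {"type": "string"}},
            "required": ["city"],
        },
    }
]

assert _evaluate_function_toolcall_response(prediction, reference, descriptions)

The public tool_calling_accuracy instance applies the same per-example check across the whole dataset and averages the boolean results into the final accuracy score.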