Spaces:

CausalNLP
/

causal-agent

Running

App Files Files Community

causal-agent / main /run_cais.py

FireShadow

Initial clean commit

1721aea 4 months ago

raw

history blame

4.7 kB

	## This file runs the CAIS pipeline for a list of queries provided in a CSV file

	import os, re, io, time, json, logging, contextlib, textwrap
	from typing import Dict, Any
	import pandas as pd
	import argparse
	from auto_causal.agent import run_causal_analysis

	# Constants
	# Number of seconds passed to time.sleep() in main() while iterating over
	# the metadata rows, to pace successive pipeline runs.
	RATE_LIMIT_SECONDS = 2

	def run_cais(desc, question, df):
	    """
	    Forward a single query to the causal analysis pipeline.

	    Args:
	        desc (str): Description of the dataset
	        question (str): Natural language query associated with the dataset
	        df (str): Path to the CSV file associated with the dataset

	    Returns:
	        dict: Results from the CAIS pipeline, exactly as returned by
	        run_causal_analysis
	    """
	    # Map this wrapper's short argument names onto the pipeline's keywords.
	    pipeline_kwargs = {
	        "query": question,
	        "dataset_path": df,
	        "dataset_description": desc,
	    }
	    return run_causal_analysis(**pipeline_kwargs)

	def parse_args(argv=None):
	    """
	    Parse the command-line options for a batch CAIS run.

	    Args:
	        argv (list[str] | None): Argument strings to parse. When None (the
	            default) argparse reads them from sys.argv[1:], which is the
	            behaviour callers relied on before this parameter existed.

	    Returns:
	        argparse.Namespace: Namespace with the attributes metadata_path,
	        data_dir, output_dir, output_name and llm_name.
	    """
	    parser = argparse.ArgumentParser(description="Run batch causal analysis.")
	    parser.add_argument("-m", "--metadata_path", type=str, required=True,
	                        help="Path to the CSV file with queries, descriptions, and file names etc")
	    parser.add_argument("-d", "--data_dir", type=str, required=True,
	                        help="Path to the folder containing the data in CSV format")
	    parser.add_argument("-o", "--output_dir", type=str, required=True,
	                        help="Path to the folder where the output is saved")
	    parser.add_argument("-n", "--output_name", type=str, default="cais_results.json",
	                        help="File name of the JSON output (default: %(default)s)")
	    parser.add_argument("-l", "--llm_name", type=str, required=True,
	                        help="Name of the LLM to be used")
	    return parser.parse_args(argv)

	def _format_result(row, data_path, res):
	    """
	    Build the output record for one successfully analysed metadata row.

	    Args:
	        row: Metadata row (as yielded by DataFrame.iterrows) providing the
	            "natural_language_query", "method", "answer" and
	            "data_description" fields and, optionally, "keywords".
	        data_path (str): Path of the dataset that was analysed.
	        res: Output of run_cais; indexed as res['results']['results'] and
	            res['results']['variables'], and queried with res.get(...).

	    Returns:
	        dict: The row's metadata plus a nested "final_result" section holding
	        the pipeline's estimate, variables and explanatory text.

	    Raises:
	        KeyError: If a required metadata field or result section is missing.
	    """
	    # Metadata is read before the pipeline output, matching the original
	    # lookup order so the same exception surfaces when several keys are absent.
	    record = {
	        "query": row["natural_language_query"],
	        "method": row["method"],
	        "answer": row["answer"],
	        "dataset_description": row["data_description"],
	        "dataset_path": data_path,
	        "keywords": row.get("keywords", "Causality, Average treatment effect"),
	    }

	    # Look the two nested sections up once instead of once per field.
	    estimate = res['results']['results']
	    variables = res['results']['variables']
	    record["final_result"] = {
	        "method": estimate.get("method_used"),
	        "causal_effect": estimate.get("effect_estimate"),
	        "standard_deviation": estimate.get("standard_error"),
	        "treatment_variable": variables.get("treatment_variable", None),
	        "outcome_variable": variables.get("outcome_variable", None),
	        "covariates": variables.get("covariates", []),
	        "instrument_variable": variables.get("instrument_variable", None),
	        "running_variable": variables.get("running_variable", None),
	        "temporal_variable": variables.get("time_variable", None),
	        "statistical_test_results": res.get("summary", ""),
	        "explanation_for_model_choice": res.get("explanation", ""),
	        "regression_equation": res.get("regression_equation", ""),
	    }
	    return record

	def main():
	    """
	    Run the CAIS pipeline for every row of the metadata CSV and save the results.

	    Reads the command-line options via parse_args(), exports the chosen LLM
	    name through the LLM_MODEL environment variable, runs run_cais() once per
	    metadata row and writes all per-row records to a single JSON file in the
	    output directory (created if needed; ".json" is appended to the file name
	    when missing).

	    A row that raises is logged with its traceback and recorded as
	    {"answer": <error text>}; processing then continues with the next row.
	    If the metadata file does not exist, an error is logged and the function
	    returns without writing anything.

	    Returns:
	        None
	    """
	    args = parse_args()
	    metadata_path = args.metadata_path
	    data_dir = args.data_dir
	    output_dir = args.output_dir
	    output_name = args.output_name
	    # NOTE(review): presumably read by the auto_causal pipeline to pick the
	    # model — confirm against that package.
	    os.environ["LLM_MODEL"] = args.llm_name
	    print("[main] Starting batch processing…")

	    if not os.path.exists(metadata_path):
	        logging.error(f"Meta file not found: {metadata_path}")
	        return

	    meta_df = pd.read_csv(metadata_path)
	    total_rows = len(meta_df)
	    print(f"[main] Loaded metadata CSV with {total_rows} rows.")

	    results: Dict[int, Dict[str, Any]] = {}

	    for idx, row in meta_df.iterrows():
	        data_path = os.path.join(data_dir, str(row["data_files"]))
	        print(f"\n[main] Row {idx+1}/{total_rows} → Dataset: {data_path}")

	        try:
	            res = run_cais(desc=row["data_description"],
	                           question=row["natural_language_query"],
	                           df=data_path)

	            # Format result according to specified structure
	            formatted_result = _format_result(row, data_path, res)
	            results[idx] = formatted_result
	            print(f"[main] Formatted result for row {idx+1}:", formatted_result)

	        except Exception as e:
	            # Deliberately broad: one failing dataset must not abort the batch.
	            # logging.exception keeps the traceback that logging.error dropped.
	            logging.exception(f"[{idx+1}] Error: {e}")
	            # NOTE(review): the error text is stored under "answer", the same
	            # key that carries the reference answer on success; kept as-is so
	            # the output schema does not change.
	            results[idx] = {"answer": str(e)}

	        time.sleep(RATE_LIMIT_SECONDS)

	    os.makedirs(output_dir, exist_ok=True)
	    output_json = os.path.join(output_dir, output_name)
	    if not output_json.endswith(".json"):
	        output_json += ".json"
	    with open(output_json, "w", encoding="utf-8") as f:
	        json.dump(results, f, indent=4)
	    print(f"[main] Done. Predictions saved to {output_json}")

	# Script entry point: run the batch pipeline only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()