Spaces:
Sleeping
Sleeping
| import json | |
| from tempfile import mktemp | |
| import argilla as rg | |
| from huggingface_hub import HfApi | |
| from defaults import REMOTE_CODE_PATHS, SEED_DATA_PATH | |
# Shared Hugging Face API client used by every upload/download helper below.
hf_api = HfApi()

# Base text prepended to every generated dataset README.
# NOTE(review): read once at import time — DATASET_README_BASE.md must sit in
# the working directory when this module is imported.
with open("DATASET_README_BASE.md") as f:
    DATASET_README_BASE = f.read()
def create_readme(domain_seed_data, project_name, domain):
    """Render the project README to a temporary file and return its path.

    Builds a markdown document from the shared ``DATASET_README_BASE`` plus
    the project name, domain, and any perspectives/topics/examples present
    in *domain_seed_data*.

    Args:
        domain_seed_data: dict that may contain "perspectives" (list of str),
            "topics" (list of str), and "examples" (list of dicts with
            "question" and "answer" keys).
        project_name: title rendered as the top-level heading.
        domain: domain label rendered under the title.

    Returns:
        Path of a temporary file containing the rendered README.
    """
    # Local import: replaces the insecure module-level `mktemp` usage.
    from tempfile import NamedTemporaryFile

    readme = DATASET_README_BASE
    readme += f"# {project_name}\n\n## Domain: {domain}"
    perspectives = domain_seed_data.get("perspectives")
    topics = domain_seed_data.get("topics")
    examples = domain_seed_data.get("examples")
    if perspectives:
        readme += "\n\n## Perspectives\n\n"
        for p in perspectives:
            readme += f"- {p}\n"
    if topics:
        readme += "\n\n## Topics\n\n"
        for t in topics:
            readme += f"- {t}\n"
    if examples:
        readme += "\n\n## Examples\n\n"
        for example in examples:
            readme += f"### {example['question']}\n\n{example['answer']}\n\n"
    # mktemp() only returns a name and is race-prone (another process can
    # claim the path before we open it); NamedTemporaryFile creates the file
    # atomically. delete=False so the caller can upload it afterwards.
    with NamedTemporaryFile("w", suffix=".md", delete=False) as tmp:
        tmp.write(readme)
        return tmp.name
def setup_dataset_on_hub(repo_id, hub_token):
    """Ensure an empty dataset repo exists on the Hub.

    Idempotent: ``exist_ok=True`` makes re-running against an existing repo
    a no-op rather than an error.
    """
    hf_api.create_repo(
        repo_type="dataset",
        repo_id=repo_id,
        exist_ok=True,
        token=hub_token,
    )
def push_dataset_to_hub(
    domain_seed_data_path,
    project_name,
    domain,
    pipeline_path,
    hub_username,
    hub_token: str,
):
    """Create the dataset repo (if needed) and upload seed data plus README.

    Args:
        domain_seed_data_path: local path to the seed-data JSON file.
        project_name: repo name under *hub_username*; also the README title.
        domain: domain label rendered into the README.
        pipeline_path: unused here — kept for backward-compatible signature
            (the pipeline itself is uploaded by ``push_pipeline_to_hub``).
        hub_username: Hub namespace that will own the repo.
        hub_token: Hub auth token used for all API calls.
    """
    repo_id = f"{hub_username}/{project_name}"
    setup_dataset_on_hub(repo_id=repo_id, hub_token=hub_token)
    # Upload the raw seed data file as-is.
    hf_api.upload_file(
        path_or_fileobj=domain_seed_data_path,
        path_in_repo="seed_data.json",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
    # Render the README from the seed data and upload it.
    # `with` closes the handle — the original `json.load(open(...))` leaked it.
    with open(domain_seed_data_path) as f:
        domain_seed_data = json.load(f)
    hf_api.upload_file(
        path_or_fileobj=create_readme(
            domain_seed_data=domain_seed_data, project_name=project_name, domain=domain
        ),
        path_in_repo="README.md",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
def push_pipeline_to_hub(
    pipeline_path,
    hub_username,
    hub_token: str,
    project_name,
):
    """Upload the pipeline script and its remote-code helper files to the Hub.

    The pipeline is stored as ``pipeline.py``; each path in
    ``REMOTE_CODE_PATHS`` is uploaded under the same name it has locally.
    """
    repo_id = f"{hub_username}/{project_name}"
    # pipeline.py first, then every remote-code helper, same repo paths.
    uploads = [(pipeline_path, "pipeline.py")]
    uploads.extend((code_path, code_path) for code_path in REMOTE_CODE_PATHS)
    for local_path, remote_path in uploads:
        hf_api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=remote_path,
            token=hub_token,
            repo_id=repo_id,
            repo_type="dataset",
        )
    print(f"Dataset uploaded to {repo_id}")
def pull_seed_data_from_repo(repo_id, hub_token):
    """Download the seed-data JSON from a dataset repo and return it parsed.

    Bug fix: ``hf_hub_download`` places the file in the local HF cache and
    returns that path — the original ignored the return value and opened
    ``SEED_DATA_PATH`` as a relative local path, which only works if the file
    happens to already exist in the working directory.

    Args:
        repo_id: "<user>/<name>" dataset repo to pull from.
        hub_token: Hub auth token.

    Returns:
        The parsed seed data (result of ``json.load``).
    """
    local_path = hf_api.hf_hub_download(
        repo_id=repo_id, token=hub_token, repo_type="dataset", filename=SEED_DATA_PATH
    )
    # `with` closes the handle — the original `json.load(open(...))` leaked it.
    with open(local_path) as f:
        return json.load(f)
def push_argilla_dataset_to_hub(
    name: str,
    repo_id: str,
    url: str,
    api_key: str,
    hub_token: str,
    workspace: str = "admin",
):
    """Pull a FeedbackDataset from an Argilla server and push it to the Hub.

    Args:
        name: dataset name on the Argilla server.
        repo_id: "<user>/<name>" Hub repo to push to.
        url: Argilla API URL.
        api_key: Argilla API key.
        hub_token: Hub auth token.
        workspace: Argilla workspace holding the dataset (default "admin").
    """
    rg.init(api_url=url, api_key=api_key)
    remote_dataset = rg.FeedbackDataset.from_argilla(name=name, workspace=workspace)
    remote_dataset.pull().push_to_huggingface(repo_id=repo_id, token=hub_token)
def push_pipeline_params(
    pipeline_params,
    hub_username,
    hub_token: str,
    project_name,
):
    """Serialize pipeline params to JSON and upload as ``pipeline_params.json``.

    Args:
        pipeline_params: JSON-serializable object with the pipeline settings.
        hub_username: Hub namespace that owns the repo.
        hub_token: Hub auth token.
        project_name: repo name under *hub_username*.
    """
    # Local import: replaces the insecure module-level `mktemp` usage.
    from tempfile import NamedTemporaryFile

    repo_id = f"{hub_username}/{project_name}"
    # mktemp() only returns a name and is race-prone; NamedTemporaryFile
    # creates the file atomically. delete=False so it survives for upload.
    with NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
        json.dump(pipeline_params, tmp)
        temp_path = tmp.name
    hf_api.upload_file(
        path_or_fileobj=temp_path,
        path_in_repo="pipeline_params.json",
        token=hub_token,
        repo_id=repo_id,
        repo_type="dataset",
    )
    print(f"Pipeline params uploaded to {repo_id}")