Ben Burtenshaw committed
Commit fc828f1
1 Parent(s): dfd3683

run pipeline locally
- __pycache__/defaults.cpython-311.pyc +0 -0
- __pycache__/domain.cpython-311.pyc +0 -0
- __pycache__/hub.cpython-311.pyc +0 -0
- __pycache__/infer.cpython-311.pyc +0 -0
- __pycache__/pipeline.cpython-311.pyc +0 -0
- __pycache__/utils.cpython-311.pyc +0 -0
- pages/2_👩🏼🔬 Describe Domain.py +3 -1
- pages/3_🌱 Generate Dataset.py +79 -37
- pipeline.yaml +9 -33
- pipeline_params.json +0 -0
- utils.py +30 -3
__pycache__/defaults.cpython-311.pyc
ADDED
Binary file (2.32 kB)

__pycache__/domain.cpython-311.pyc
ADDED
Binary file (4.53 kB)

__pycache__/hub.cpython-311.pyc
ADDED
Binary file (5.78 kB)

__pycache__/infer.cpython-311.pyc
ADDED
Binary file (837 Bytes)

__pycache__/pipeline.cpython-311.pyc
ADDED
Binary file (8.2 kB)

__pycache__/utils.cpython-311.pyc
ADDED
Binary file (4.93 kB)
pages/2_👩🏼🔬 Describe Domain.py
CHANGED

@@ -11,7 +11,7 @@ from defaults import (
     PIPELINE_PATH,
     DATASET_REPO_ID,
 )
-from utils import project_sidebar
+from utils import project_sidebar, create_seed_terms, create_application_instruction
 
 
 st.set_page_config(
@@ -212,6 +212,8 @@ domain_data = {
     "topics": topics,
     "examples": examples,
     "domain_expert_prompt": domain_expert_prompt,
+    "application_instruction": create_application_instruction(domain, examples),
+    "seed_terms": create_seed_terms(topics, perspectives),
 }
 
 with open(SEED_DATA_PATH, "w") as f:
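For reference, a sketch of the seed data file this page now writes: the field names come from the diff above, while the values are purely illustrative.

# Illustrative shape of the seed data written to SEED_DATA_PATH;
# field names match the diff above, values are invented.
import json

domain_data = {
    "topics": ["punctures"],
    "examples": [{"question": "...", "answer": "..."}],
    "domain_expert_prompt": "...",
    "application_instruction": "You are an AI assistant that generates queries around the domain of ...",
    "seed_terms": ["punctures from a Retro bikes perspective"],
}

with open("seed_data.json", "w") as f:  # stands in for SEED_DATA_PATH
    json.dump(domain_data, f, indent=2)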
pages/3_🌱 Generate Dataset.py
CHANGED

@@ -1,7 +1,7 @@
 import streamlit as st
 
 from defaults import ARGILLA_URL
-from hub import push_pipeline_params
+from hub import push_pipeline_params
 from utils import project_sidebar
 
 st.set_page_config(
@@ -20,16 +20,27 @@ st.divider()
 st.subheader("Step 3. Run the pipeline to generate synthetic data")
 st.write("Define the distilabel pipeline for generating the dataset.")
 
-###############################################################
-# CONFIGURATION
-###############################################################
-
 hub_username = st.session_state.get("hub_username")
 project_name = st.session_state.get("project_name")
 hub_token = st.session_state.get("hub_token")
 
+###############################################################
+# CONFIGURATION
+###############################################################
+
 st.divider()
 
+st.markdown("## 🧰 Pipeline Configuration")
+
+st.write(
+    "Now we need to define the configuration for the pipeline that will generate the synthetic data."
+)
+st.write(
+    "⚠️ Model and parameter choice significantly affect the quality of the generated data. \
+    We recommend that you start with a few samples and review the data, then scale up from there."
+)
+
+
 st.markdown("#### 🤖 Inference configuration")
 
 st.write(
@@ -43,13 +54,19 @@ with st.expander("🤗 Recommended Models"):
     "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
 )
 st.write("🔋Projects with sufficient resources could take advantage of Llama 3 70B")
-st.code(
+st.code(
+    "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct"
+)
 
 st.write("🪫Projects with less resources could take advantage of Llama 3 8B")
-st.code(
+st.code(
+    "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
+)
 
-st.write("🍃Projects with even less resources could
-st.code(
+st.write("🍃Projects with even less resources could use Phi-3-mini-4k-instruct")
+st.code(
+    "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
+)
 
 st.write("Note Hugging Face Pro gives access to more compute resources")
 st.link_button(
@@ -58,10 +75,27 @@ with st.expander("🤗 Recommended Models"):
 )
 
 
-
-    label="
-    value="https://api-inference.huggingface.co/models/
+self_instruct_base_url = st.text_input(
+    label="Model base URL for instruction generation",
+    value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
+)
+domain_expert_base_url = st.text_input(
+    label="Model base URL for domain expert response",
+    value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
+)
+
+st.divider()
+st.markdown("#### 🧮 Parameters configuration")
+
+self_intruct_num_generations = st.slider(
+    "Number of generations for self-instruction", 1, 10, 2
 )
+domain_expert_num_generations = st.slider(
+    "Number of generations for domain expert", 1, 10, 2
+)
+self_instruct_temperature = st.slider("Temperature for self-instruction", 0.1, 1.0, 0.9)
+domain_expert_temperature = st.slider("Temperature for domain expert", 0.1, 1.0, 0.9)
+
 st.divider()
 st.markdown("#### 🔬 Argilla API details to push the generated dataset")
 argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
@@ -84,30 +118,38 @@ if all(
     [
         argilla_api_key,
         argilla_url,
-
-
+        self_instruct_base_url,
+        domain_expert_base_url,
+        self_intruct_num_generations,
+        domain_expert_num_generations,
+        self_instruct_temperature,
+        domain_expert_temperature,
+        hub_username,
         project_name,
         hub_token,
         argilla_dataset_name,
     ]
-):
-    … (content of removed lines 94-110 not preserved in this view)
+) and st.button("💾 Save Pipeline Config"):
+    with st.spinner("Pushing pipeline to the Hub..."):
+        push_pipeline_params(
+            pipeline_params={
+                "argilla_api_key": argilla_api_key,
+                "argilla_api_url": argilla_url,
+                "argilla_dataset_name": argilla_dataset_name,
+                "self_instruct_base_url": self_instruct_base_url,
+                "domain_expert_base_url": domain_expert_base_url,
+                "self_instruct_temperature": self_instruct_temperature,
+                "domain_expert_temperature": domain_expert_temperature,
+                "self_intruct_num_generations": self_intruct_num_generations,
+                "domain_expert_num_generations": domain_expert_num_generations,
+            },
+            hub_username=hub_username,
+            hub_token=hub_token,
+            project_name=project_name,
+        )
+
+    st.success(
+        f"Pipeline configuration pushed to the dataset repo {hub_username}/{project_name} on the Hub."
     )
 
 st.markdown(
@@ -118,7 +160,7 @@ if all(
     f"""
 
     # Install the distilabel library
-    pip install
+    pip install distilabel
     """
 )
@@ -126,8 +168,8 @@ if all(
 
 st.code(
     f"""
-    git clone https://
-    cd
+    git clone https://github.com/huggingface/data-is-better-together
+    cd data-is-better-together/domain-specific-datasets/pipelines
     pip install -r requirements.txt
     """
 )
@@ -135,9 +177,9 @@ if all(
 st.markdown("Finally, you can run the pipeline using the following command:")
 
 st.code(
-    """
+    f"""
     huggingface-cli login
-    python
+    python domain_expert_pipeline.py {hub_username}/{project_name}""",
     language="bash",
 )
 st.markdown(
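push_pipeline_params comes from hub.py, which is not part of this diff. As a rough, hypothetical sketch inferred only from the call site above, it plausibly serializes the parameters and uploads them to the project's dataset repo on the Hub:

# Hypothetical sketch of hub.push_pipeline_params; hub.py is not shown in
# this commit, so the body below is inferred from the call site above.
import io
import json

from huggingface_hub import HfApi


def push_pipeline_params(
    pipeline_params: dict,
    hub_username: str,
    hub_token: str,
    project_name: str,
) -> None:
    """Serialize the params and upload them to the project's dataset repo."""
    buffer = io.BytesIO(json.dumps(pipeline_params, indent=2).encode("utf-8"))
    HfApi(token=hub_token).upload_file(
        path_or_fileobj=buffer,
        path_in_repo="pipeline_params.json",
        repo_id=f"{hub_username}/{project_name}",
        repo_type="dataset",
    )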
pipeline.yaml
CHANGED

@@ -1,5 +1,5 @@
 distilabel:
-  version: 1.0.
+  version: 1.0.1
 pipeline:
   name: farming
   description: null
@@ -10,31 +10,7 @@ pipeline:
     output_mappings: {}
     batch_size: 64
    data:
-    - input:
-    - input: animal welfare from a Agribusiness perspective
-    - input: animal welfare from a Permaculture perspective
-    - input: animal welfare from a Agroforestery perspective
-    - input: animal welfare from a Conventional Farming perspective
-    - input: economic growth from a Family Farming perspective
-    - input: economic growth from a Agribusiness perspective
-    - input: economic growth from a Permaculture perspective
-    - input: economic growth from a Agroforestery perspective
-    - input: economic growth from a Conventional Farming perspective
-    - input: land from a Family Farming perspective
-    - input: land from a Agribusiness perspective
-    - input: land from a Permaculture perspective
-    - input: land from a Agroforestery perspective
-    - input: land from a Conventional Farming perspective
-    - input: resources from a Family Farming perspective
-    - input: resources from a Agribusiness perspective
-    - input: resources from a Permaculture perspective
-    - input: resources from a Agroforestery perspective
-    - input: resources from a Conventional Farming perspective
-    - input: efficiency from a Family Farming perspective
-    - input: efficiency from a Agribusiness perspective
-    - input: efficiency from a Permaculture perspective
-    - input: efficiency from a Agroforestery perspective
-    - input: efficiency from a Conventional Farming perspective
+    - input: punctures from a Retro bikes perspective
     runtime_parameters_info:
     - name: batch_size
       optional: true
@@ -54,7 +30,7 @@ pipeline:
       model_id: null
       endpoint_name: null
       endpoint_namespace: null
-      base_url: https://
+      base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
       tokenizer_id: null
       model_display_name: null
       use_openai_client: false
@@ -75,14 +51,14 @@ pipeline:
       Blend interrogative (e.g., "What is the significance of x?") and imperative
       (e.g., "Detail the process of x.") styles.'
     application_description: 'You are an AI assistant that generates queries around
-      the domain of
+      the domain of Bicycle maintenance.
 
       You should not expect basic but profound questions from your users.
 
       The queries should reflect a diversity of vision and economic positions and
       political positions.
 
-      The queries may know about different methods of
+      The queries may know about different methods of Bicycle maintenance.
 
       The queries can be positioned politically, economically, socially, or practically.
 
@@ -163,7 +139,7 @@ pipeline:
       model_id: null
       endpoint_name: null
       endpoint_namespace: null
-      base_url: https://
+      base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
       tokenizer_id: null
       model_display_name: null
       use_openai_client: false
@@ -390,7 +366,7 @@ pipeline:
       model_id: null
       endpoint_name: null
       endpoint_namespace: null
-      base_url: https://
+      base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
       tokenizer_id: null
       model_display_name: null
       use_openai_client: false
@@ -489,9 +465,9 @@ pipeline:
       generation: domain_expert_answer
     output_mappings: {}
     input_batch_size: 50
-    dataset_name:
+    dataset_name: bicycle_maintenance
     dataset_workspace: admin
-    api_url: https://argilla-
+    api_url: https://burtenshaw-bicycle-maintenance-argilla-space.hf.space
     runtime_parameters_info:
     - name: input_batch_size
       optional: true
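The 25 removed farming inputs are exactly the topic-by-perspective cross product that the new create_seed_terms helper in utils.py (below) generates; for illustration:

# Illustration: the 25 removed farming inputs are the topic x perspective
# cross product produced by create_seed_terms (added in utils.py below).
topics = ["animal welfare", "economic growth", "land", "resources", "efficiency"]
perspectives = [
    "Family Farming",
    "Agribusiness",
    "Permaculture",
    "Agroforestery",  # spelling as in the original data
    "Conventional Farming",
]

seed_terms = [
    f"{topic} from a {perspective} perspective"
    for topic in topics
    for perspective in perspectives
]
assert len(seed_terms) == 25
assert "land from a Permaculture perspective" in seed_terms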
pipeline_params.json
ADDED
File without changes
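The file is committed empty here. Once the app's "💾 Save Pipeline Config" button has run, it would plausibly hold parameters like the following; the keys mirror the pipeline_params dict in pages/3_🌱 Generate Dataset.py, and the values are examples only:

# Hypothetical contents of pipeline_params.json after a save; keys mirror
# the pipeline_params dict in the Generate Dataset page, values are examples.
pipeline_params = {
    "argilla_api_key": "owner.apikey",
    "argilla_api_url": "https://burtenshaw-bicycle-maintenance-argilla-space.hf.space",
    "argilla_dataset_name": "bicycle_maintenance",
    "self_instruct_base_url": "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
    "domain_expert_base_url": "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
    "self_instruct_temperature": 0.9,
    "domain_expert_temperature": 0.9,
    "self_intruct_num_generations": 2,  # key name matches the app code
    "domain_expert_num_generations": 2,
}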
utils.py
CHANGED

@@ -1,13 +1,13 @@
+from textwrap import dedent
+
 import streamlit as st
 
 from defaults import (
-    ARGILLA_SPACE_REPO_ID,
     PROJECT_NAME,
     ARGILLA_URL,
     DIBT_PARENT_APP_URL,
     DATASET_URL,
     DATASET_REPO_ID,
-    ARGILLA_SPACE_REPO_ID,
 )
 
 
@@ -48,8 +48,35 @@ def project_sidebar():
     st.sidebar.divider()
 
     st.sidebar.link_button("🧑🌾 New Project", DIBT_PARENT_APP_URL)
-
+
     if st.session_state["hub_token"] is None:
         st.error("Please provide a Hub token to generate answers")
         st.stop()
 
+
+def create_seed_terms(topics: list[str], perspectives: list[str]) -> list[str]:
+    """Create seed terms for Self-Instruct to start from."""
+
+    return [
+        f"{topic} from a {perspective} perspective"
+        for topic in topics
+        for perspective in perspectives
+    ]
+
+
+def create_application_instruction(domain: str, examples: list[dict[str, str]]) -> str:
+    """Create the instruction for the Self-Instruct task."""
+    system_prompt = dedent(
+        f"""You are an AI assistant that generates queries around the domain of {domain}.
+        You should not expect basic but profound questions from your users.
+        The queries should reflect a diversity of vision and economic positions and political positions.
+        The queries may know about different methods of {domain}.
+        The queries can be positioned politically, economically, socially, or practically.
+        Also take into account the impact of diverse causes on diverse domains."""
+    )
+    for example in examples:
+        question = example["question"]
+        answer = example["answer"]
+        system_prompt += f"""\n- Question: {question}\n- Answer: {answer}\n"""
+
+    return system_prompt
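A quick usage sketch of the two new helpers; the topic, perspective, and example values below are made up for demonstration.

# Usage sketch of the new helpers; all values are illustrative.
topics = ["punctures", "brake adjustment"]
perspectives = ["Retro bikes", "Cargo bikes"]

seed_terms = create_seed_terms(topics=topics, perspectives=perspectives)
# ['punctures from a Retro bikes perspective',
#  'punctures from a Cargo bikes perspective',
#  'brake adjustment from a Retro bikes perspective',
#  'brake adjustment from a Cargo bikes perspective']

system_prompt = create_application_instruction(
    domain="Bicycle maintenance",
    examples=[{"question": "How do I fix a puncture?", "answer": "Remove the wheel, ..."}],
)
# system_prompt is the dedented instruction text followed by one
# "- Question: ... / - Answer: ..." pair appended per example.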