Ben Burtenshaw committed
Commit fc828f1
1 Parent(s): dfd3683

run pipeline locally
- __pycache__/defaults.cpython-311.pyc +0 -0
- __pycache__/domain.cpython-311.pyc +0 -0
- __pycache__/hub.cpython-311.pyc +0 -0
- __pycache__/infer.cpython-311.pyc +0 -0
- __pycache__/pipeline.cpython-311.pyc +0 -0
- __pycache__/utils.cpython-311.pyc +0 -0
- pages/2_👩🏼🔬 Describe Domain.py +3 -1
- pages/3_🌱 Generate Dataset.py +79 -37
- pipeline.yaml +9 -33
- pipeline_params.json +0 -0
- utils.py +30 -3
__pycache__/defaults.cpython-311.pyc
ADDED
Binary file (2.32 kB)

__pycache__/domain.cpython-311.pyc
ADDED
Binary file (4.53 kB)

__pycache__/hub.cpython-311.pyc
ADDED
Binary file (5.78 kB)

__pycache__/infer.cpython-311.pyc
ADDED
Binary file (837 Bytes)

__pycache__/pipeline.cpython-311.pyc
ADDED
Binary file (8.2 kB)

__pycache__/utils.cpython-311.pyc
ADDED
Binary file (4.93 kB)
pages/2_👩🏼🔬 Describe Domain.py
CHANGED

@@ -11,7 +11,7 @@ from defaults import (
     PIPELINE_PATH,
     DATASET_REPO_ID,
 )
-from utils import project_sidebar
+from utils import project_sidebar, create_seed_terms, create_application_instruction
 
 
 st.set_page_config(
@@ -212,6 +212,8 @@ domain_data = {
     "topics": topics,
     "examples": examples,
     "domain_expert_prompt": domain_expert_prompt,
+    "application_instruction": create_application_instruction(domain, examples),
+    "seed_terms": create_seed_terms(topics, perspectives),
 }
 
 with open(SEED_DATA_PATH, "w") as f:
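For reference, a sketch of the seed data file this page now writes: the field names come from the diff above, while the values are purely illustrative.

# Illustrative shape of the seed data written to SEED_DATA_PATH;
# field names match the diff above, values are invented.
import json

domain_data = {
    "topics": ["punctures"],
    "examples": [{"question": "...", "answer": "..."}],
    "domain_expert_prompt": "...",
    "application_instruction": "You are an AI assistant that generates queries around the domain of ...",
    "seed_terms": ["punctures from a Retro bikes perspective"],
}

with open("seed_data.json", "w") as f:  # stands in for SEED_DATA_PATH
    json.dump(domain_data, f, indent=2)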
pages/3_🌱 Generate Dataset.py
CHANGED

@@ -1,7 +1,7 @@
 import streamlit as st
 
 from defaults import ARGILLA_URL
-from hub import push_pipeline_params
+from hub import push_pipeline_params
 from utils import project_sidebar
 
 st.set_page_config(
@@ -20,16 +20,27 @@ st.divider()
 st.subheader("Step 3. Run the pipeline to generate synthetic data")
 st.write("Define the distilabel pipeline for generating the dataset.")
 
-###############################################################
-# CONFIGURATION
-###############################################################
-
 hub_username = st.session_state.get("hub_username")
 project_name = st.session_state.get("project_name")
 hub_token = st.session_state.get("hub_token")
 
+###############################################################
+# CONFIGURATION
+###############################################################
+
 st.divider()
 
+st.markdown("## 🧰 Pipeline Configuration")
+
+st.write(
+    "Now we need to define the configuration for the pipeline that will generate the synthetic data."
+)
+st.write(
+    "⚠️ Model and parameter choice significantly affect the quality of the generated data. \
+    We recommend that you start with a few samples and review the data, then scale up from there."
+)
+
+
 st.markdown("#### 🤖 Inference configuration")
 
 st.write(
@@ -43,13 +54,19 @@ with st.expander("🤗 Recommended Models"):
     "https://huggingface.co/models?pipeline_tag=text-generation&other=endpoints_compatible&sort=trending",
 )
 st.write("🔋Projects with sufficient resources could take advantage of Llama 3 70B")
-st.code(
+st.code(
+    "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-70B-Instruct"
+)
 
 st.write("🪫Projects with less resources could take advantage of Llama 3 8B")
-st.code(
+st.code(
+    "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
+)
 
-st.write("🍃Projects with even less resources could
-st.code(
+st.write("🍃Projects with even less resources could use Phi-3-mini-4k-instruct")
+st.code(
+    "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct"
+)
 
 st.write("Note Hugging Face Pro gives access to more compute resources")
 st.link_button(
@@ -58,10 +75,27 @@ with st.expander("🤗 Recommended Models"):
 )
 
 
-
-    label="
-    value="https://api-inference.huggingface.co/models/
+self_instruct_base_url = st.text_input(
+    label="Model base URL for instruction generation",
+    value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
+)
+domain_expert_base_url = st.text_input(
+    label="Model base URL for domain expert response",
+    value="https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
+)
+
+st.divider()
+st.markdown("#### 🧮 Parameters configuration")
+
+self_intruct_num_generations = st.slider(
+    "Number of generations for self-instruction", 1, 10, 2
 )
+domain_expert_num_generations = st.slider(
+    "Number of generations for domain expert", 1, 10, 2
+)
+self_instruct_temperature = st.slider("Temperature for self-instruction", 0.1, 1.0, 0.9)
+domain_expert_temperature = st.slider("Temperature for domain expert", 0.1, 1.0, 0.9)
+
 st.divider()
 st.markdown("#### 🔬 Argilla API details to push the generated dataset")
 argilla_url = st.text_input("Argilla API URL", ARGILLA_URL)
@@ -84,30 +118,38 @@ if all(
     [
         argilla_api_key,
         argilla_url,
-
-
+        self_instruct_base_url,
+        domain_expert_base_url,
+        self_intruct_num_generations,
+        domain_expert_num_generations,
+        self_instruct_temperature,
+        domain_expert_temperature,
+        hub_username,
         project_name,
         hub_token,
         argilla_dataset_name,
     ]
-):
-    … (content of removed lines 94-110 not preserved in this view)
+) and st.button("💾 Save Pipeline Config"):
+    with st.spinner("Pushing pipeline to the Hub..."):
+        push_pipeline_params(
+            pipeline_params={
+                "argilla_api_key": argilla_api_key,
+                "argilla_api_url": argilla_url,
+                "argilla_dataset_name": argilla_dataset_name,
+                "self_instruct_base_url": self_instruct_base_url,
+                "domain_expert_base_url": domain_expert_base_url,
+                "self_instruct_temperature": self_instruct_temperature,
+                "domain_expert_temperature": domain_expert_temperature,
+                "self_intruct_num_generations": self_intruct_num_generations,
+                "domain_expert_num_generations": domain_expert_num_generations,
+            },
+            hub_username=hub_username,
+            hub_token=hub_token,
+            project_name=project_name,
+        )
+
+    st.success(
+        f"Pipeline configuration pushed to the dataset repo {hub_username}/{project_name} on the Hub."
     )
 
 st.markdown(
@@ -118,7 +160,7 @@ if all(
     f"""
 
     # Install the distilabel library
-    pip install
+    pip install distilabel
     """
 )
@@ -126,8 +168,8 @@ if all(
 
 st.code(
     f"""
-    git clone https://
-    cd
+    git clone https://github.com/huggingface/data-is-better-together
+    cd data-is-better-together/domain-specific-datasets/pipelines
     pip install -r requirements.txt
     """
 )
@@ -135,9 +177,9 @@ if all(
 st.markdown("Finally, you can run the pipeline using the following command:")
 
 st.code(
-    """
+    f"""
     huggingface-cli login
-    python
+    python domain_expert_pipeline.py {hub_username}/{project_name}""",
     language="bash",
 )
 st.markdown(
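push_pipeline_params comes from hub.py, which is not part of this diff. As a rough, hypothetical sketch inferred only from the call site above, it plausibly serializes the parameters and uploads them to the project's dataset repo on the Hub:

# Hypothetical sketch of hub.push_pipeline_params; hub.py is not shown in
# this commit, so the body below is inferred from the call site above.
import io
import json

from huggingface_hub import HfApi


def push_pipeline_params(
    pipeline_params: dict,
    hub_username: str,
    hub_token: str,
    project_name: str,
) -> None:
    """Serialize the params and upload them to the project's dataset repo."""
    buffer = io.BytesIO(json.dumps(pipeline_params, indent=2).encode("utf-8"))
    HfApi(token=hub_token).upload_file(
        path_or_fileobj=buffer,
        path_in_repo="pipeline_params.json",
        repo_id=f"{hub_username}/{project_name}",
        repo_type="dataset",
    )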
pipeline.yaml
CHANGED

@@ -1,5 +1,5 @@
 distilabel:
-  version: 1.0.
+  version: 1.0.1
 pipeline:
   name: farming
   description: null
@@ -10,31 +10,7 @@ pipeline:
     output_mappings: {}
     batch_size: 64
    data:
-    - input:
-    - input: animal welfare from a Agribusiness perspective
-    - input: animal welfare from a Permaculture perspective
-    - input: animal welfare from a Agroforestery perspective
-    - input: animal welfare from a Conventional Farming perspective
-    - input: economic growth from a Family Farming perspective
-    - input: economic growth from a Agribusiness perspective
-    - input: economic growth from a Permaculture perspective
-    - input: economic growth from a Agroforestery perspective
-    - input: economic growth from a Conventional Farming perspective
-    - input: land from a Family Farming perspective
-    - input: land from a Agribusiness perspective
-    - input: land from a Permaculture perspective
-    - input: land from a Agroforestery perspective
-    - input: land from a Conventional Farming perspective
-    - input: resources from a Family Farming perspective
-    - input: resources from a Agribusiness perspective
-    - input: resources from a Permaculture perspective
-    - input: resources from a Agroforestery perspective
-    - input: resources from a Conventional Farming perspective
-    - input: efficiency from a Family Farming perspective
-    - input: efficiency from a Agribusiness perspective
-    - input: efficiency from a Permaculture perspective
-    - input: efficiency from a Agroforestery perspective
-    - input: efficiency from a Conventional Farming perspective
+    - input: punctures from a Retro bikes perspective
     runtime_parameters_info:
     - name: batch_size
       optional: true
@@ -54,7 +30,7 @@ pipeline:
       model_id: null
       endpoint_name: null
       endpoint_namespace: null
-      base_url: https://
+      base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
       tokenizer_id: null
       model_display_name: null
       use_openai_client: false
@@ -75,14 +51,14 @@ pipeline:
       Blend interrogative (e.g., "What is the significance of x?") and imperative
       (e.g., "Detail the process of x.") styles.'
     application_description: 'You are an AI assistant that generates queries around
-      the domain of
+      the domain of Bicycle maintenance.
 
       You should not expect basic but profound questions from your users.
 
       The queries should reflect a diversity of vision and economic positions and
       political positions.
 
-      The queries may know about different methods of
+      The queries may know about different methods of Bicycle maintenance.
 
       The queries can be positioned politically, economically, socially, or practically.
 
@@ -163,7 +139,7 @@ pipeline:
       model_id: null
       endpoint_name: null
       endpoint_namespace: null
-      base_url: https://
+      base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
       tokenizer_id: null
       model_display_name: null
       use_openai_client: false
@@ -390,7 +366,7 @@ pipeline:
       model_id: null
       endpoint_name: null
       endpoint_namespace: null
-      base_url: https://
+      base_url: https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta
       tokenizer_id: null
       model_display_name: null
       use_openai_client: false
@@ -489,9 +465,9 @@ pipeline:
       generation: domain_expert_answer
     output_mappings: {}
     input_batch_size: 50
-    dataset_name:
+    dataset_name: bicycle_maintenance
     dataset_workspace: admin
-    api_url: https://argilla-
+    api_url: https://burtenshaw-bicycle-maintenance-argilla-space.hf.space
     runtime_parameters_info:
     - name: input_batch_size
       optional: true
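The 25 removed farming inputs are exactly the topic-by-perspective cross product that the new create_seed_terms helper in utils.py (below) generates; for illustration:

# Illustration: the 25 removed farming inputs are the topic x perspective
# cross product produced by create_seed_terms (added in utils.py below).
topics = ["animal welfare", "economic growth", "land", "resources", "efficiency"]
perspectives = [
    "Family Farming",
    "Agribusiness",
    "Permaculture",
    "Agroforestery",  # spelling as in the original data
    "Conventional Farming",
]

seed_terms = [
    f"{topic} from a {perspective} perspective"
    for topic in topics
    for perspective in perspectives
]
assert len(seed_terms) == 25
assert "land from a Permaculture perspective" in seed_terms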
pipeline_params.json
ADDED
File without changes
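The file is committed empty here. Once the app's "💾 Save Pipeline Config" button has run, it would plausibly hold parameters like the following; the keys mirror the pipeline_params dict in pages/3_🌱 Generate Dataset.py, and the values are examples only:

# Hypothetical contents of pipeline_params.json after a save; keys mirror
# the pipeline_params dict in the Generate Dataset page, values are examples.
pipeline_params = {
    "argilla_api_key": "owner.apikey",
    "argilla_api_url": "https://burtenshaw-bicycle-maintenance-argilla-space.hf.space",
    "argilla_dataset_name": "bicycle_maintenance",
    "self_instruct_base_url": "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
    "domain_expert_base_url": "https://api-inference.huggingface.co/models/microsoft/Phi-3-mini-4k-instruct",
    "self_instruct_temperature": 0.9,
    "domain_expert_temperature": 0.9,
    "self_intruct_num_generations": 2,  # key name matches the app code
    "domain_expert_num_generations": 2,
}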
utils.py
CHANGED

@@ -1,13 +1,13 @@
+from textwrap import dedent
+
 import streamlit as st
 
 from defaults import (
-    ARGILLA_SPACE_REPO_ID,
     PROJECT_NAME,
     ARGILLA_URL,
     DIBT_PARENT_APP_URL,
     DATASET_URL,
     DATASET_REPO_ID,
-    ARGILLA_SPACE_REPO_ID,
 )
 
 
@@ -48,8 +48,35 @@ def project_sidebar():
     st.sidebar.divider()
 
     st.sidebar.link_button("🧑🌾 New Project", DIBT_PARENT_APP_URL)
-
+
     if st.session_state["hub_token"] is None:
         st.error("Please provide a Hub token to generate answers")
         st.stop()
 
+
+def create_seed_terms(topics: list[str], perspectives: list[str]) -> list[str]:
+    """Create seed terms for Self-Instruct to start from."""
+
+    return [
+        f"{topic} from a {perspective} perspective"
+        for topic in topics
+        for perspective in perspectives
+    ]
+
+
+def create_application_instruction(domain: str, examples: list[dict[str, str]]) -> str:
+    """Create the instruction for the Self-Instruct task."""
+    system_prompt = dedent(
+        f"""You are an AI assistant that generates queries around the domain of {domain}.
+        You should not expect basic but profound questions from your users.
+        The queries should reflect a diversity of vision and economic positions and political positions.
+        The queries may know about different methods of {domain}.
+        The queries can be positioned politically, economically, socially, or practically.
+        Also take into account the impact of diverse causes on diverse domains."""
+    )
+    for example in examples:
+        question = example["question"]
+        answer = example["answer"]
+        system_prompt += f"""\n- Question: {question}\n- Answer: {answer}\n"""
+
+    return system_prompt
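A quick usage sketch of the two new helpers; the topic, perspective, and example values below are made up for demonstration.

# Usage sketch of the new helpers; all values are illustrative.
topics = ["punctures", "brake adjustment"]
perspectives = ["Retro bikes", "Cargo bikes"]

seed_terms = create_seed_terms(topics=topics, perspectives=perspectives)
# ['punctures from a Retro bikes perspective',
#  'punctures from a Cargo bikes perspective',
#  'brake adjustment from a Retro bikes perspective',
#  'brake adjustment from a Cargo bikes perspective']

system_prompt = create_application_instruction(
    domain="Bicycle maintenance",
    examples=[{"question": "How do I fix a puncture?", "answer": "Remove the wheel, ..."}],
)
# system_prompt is the dedented instruction text followed by one
# "- Question: ... / - Answer: ..." pair appended per example.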