Spaces:

AIEcosystem
/

AcademiaMiner

Sleeping

App Files Files Community

AcademiaMiner / src /streamlit_app.py

AIEcosystem

Update src/streamlit_app.py

868bd0f verified about 2 months ago

raw

history blame contribute delete

11.8 kB

	import os
	os.environ['HF_HOME'] = '/tmp'
	import time
	import streamlit as st
	import pandas as pd
	import io
	import plotly.express as px
	import zipfile
	from streamlit_extras.stylable_container import stylable_container
	from transformers import pipeline
	from comet_ml import Experiment

	# --- Page setup and global theme ---
	# Wide layout plus the browser-tab title; this is the first Streamlit call in the script.
	st.set_page_config(page_title="English Keyphrase", layout="wide")

	# Stylesheet injected into the page: app background/text colours, the pink
	# button (with hover effect) and the pink text-area styling.
	_APP_CSS = """
	<style>
	.stApp {
	background-color: #f0f8ff; /* A single, solid color */
	color: #000000;
	font-family: 'Inter', sans-serif;
	}

	.stButton > button {
	background-color: #FF69B4;
	color: #FFFFFF;
	font-weight: bold;
	border-radius: 12px;
	transition: all 0.2s ease-in-out;
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
	}

	.stButton > button:hover {
	background-color: #FFB6C1;
	box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
	transform: translateY(-2px);
	}

	/* Text Area background and text color */
	.stTextArea textarea {
	background-color: #FFC0CB; /* A nice pink color */
	color: #000000;
	border: 1px solid #FF69B4; /* A pink border to match the button */
	}

	</style>
	"""
	# unsafe_allow_html is required so the <style> tag is emitted as raw HTML.
	st.markdown(_APP_CSS, unsafe_allow_html=True)
	# --- Comet ML credentials ---
	# All three settings are read from the environment; experiment tracking is
	# only attempted later when every one of them is present and non-empty.
	COMET_API_KEY = os.getenv("COMET_API_KEY")
	COMET_WORKSPACE = os.getenv("COMET_WORKSPACE")
	COMET_PROJECT_NAME = os.getenv("COMET_PROJECT_NAME")
	comet_initialized = all((COMET_API_KEY, COMET_WORKSPACE, COMET_PROJECT_NAME))
	if not comet_initialized:
	    # The app keeps working without Comet; this only informs the user.
	    st.warning("Comet ML not initialized. Check environment variables.")
	# --- Title, author link and usage notes ---
	st.subheader("AcademiaMiner", divider="rainbow")
	st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

	# Help text shown inside the collapsible "Important notes" panel.
	_IMPORTANT_NOTES = '''Entities: This AcademiaMiner extracts keyphrases from English academic and scientific papers.

	Results are presented in easy-to-read tables, visualized in an interactive tree map and a bar chart, and are available for download along with a Glossary of tags.

	How to Use:
	Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.

	Usage Limits: You can request results unlimited times for one (1) month.

	Supported Languages: English

	Technical issues: If your connection times out, please refresh the page or reopen the app's URL.

	For any errors or inquiries, please contact us at [email protected]'''
	expander = st.expander("Important notes")
	expander.write(_IMPORTANT_NOTES)
	# --- Sidebar: embed snippet and promotional link ---
	# HTML snippet users can copy to embed the hosted app in their own page.
	code = '''
	<iframe
	src="https://aiecosystem-academiaminer.hf.space"
	frameborder="0"
	width="850"
	height="450"
	></iframe>
	'''
	with st.sidebar:
	    st.write("Use the following code to embed the AcademiaMiner web app on your website. Feel free to adjust the width and height values to fit your page.")
	    st.code(code, language="html")
	    # Two empty text elements act as vertical spacing before the divider.
	    for _ in range(2):
	        st.text("")
	    st.divider()
	    st.subheader("🚀 Ready to build your own AI Web App?", divider="rainbow")
	    st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
	# --- Model Loading ---
	@st.cache_resource
	def load_ner_model():
	    """Create the keyphrase-extraction pipeline.

	    The function is wrapped in ``st.cache_resource``. If building the
	    pipeline raises, the error is shown in the app and the script run is
	    stopped.

	    Returns:
	        The token-classification pipeline for
	        ``ml6team/keyphrase-extraction-kbir-inspec``.
	    """
	    try:
	        extractor = pipeline(
	            "token-classification",
	            model="ml6team/keyphrase-extraction-kbir-inspec",
	            aggregation_strategy="max",
	            stride=128,
	            ignore_labels=["O"],
	        )
	    except Exception as e:
	        # Without a model the app cannot do anything useful, so halt here.
	        st.error(f"Failed to load NER model: {e}")
	        st.stop()
	    else:
	        return extractor
	# Model instance used by the Results handler further down.
	model = load_ner_model()

	# --- Text input ---
	# Maximum number of whitespace-separated words accepted per request.
	word_limit = 200
	_text_area_label = f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter"
	# The widget key lets the "Clear text" callback reset the field via session state.
	text = st.text_area(_text_area_label, height=250, key='my_text_area')

	# Live word counter shown under the text area.
	_words = text.split()
	word_count = len(_words)
	st.markdown(f"Word count: {word_count}/{word_limit}")
	def clear_text():
	    """Reset the text-area widget value and the ``text_processed`` flag."""
	    st.session_state.my_text_area = ""
	    st.session_state['text_processed'] = False


	# Registered as a callback so the reset is applied when the button is clicked.
	st.button("Clear text", on_click=clear_text)
	def _entities_to_frame(entities):
	    """Turn the pipeline's predictions into the cleaned results table.

	    Args:
	        entities: List of dicts returned by the token-classification
	            pipeline. The pipeline in this file is created with an
	            ``aggregation_strategy``, so each dict carries its tag under
	            the ``'entity_group'`` key.

	    Returns:
	        A DataFrame with columns ``word``, ``label``, ``score``, ``start``
	        and ``end``, or ``None`` when ``entities`` is empty.
	    """
	    rows = [
	        {
	            'word': entity['word'],
	            'label': entity['entity_group'],
	            'score': entity['score'],
	            'start': entity['start'],
	            'end': entity['end'],
	        }
	        for entity in entities
	    ]
	    if not rows:
	        return None
	    frame = pd.DataFrame(rows)
	    # Drop punctuation, then trim surrounding whitespace so the same phrase
	    # with and without a leading space is counted as one keyphrase.
	    frame['word'] = frame['word'].replace(r'[^\w\s]', '', regex=True).str.strip()
	    # A word that ends up empty after cleaning is shown as "Unknown".
	    return frame.replace('', 'Unknown')


	def _build_results_zip(df, df_frequent, glossary):
	    """Return the bytes of a zip archive holding the three result CSV files."""
	    buf = io.BytesIO()
	    with zipfile.ZipFile(buf, "w") as archive:
	        archive.writestr("Summary_of_results.csv", df.to_csv(index=False))
	        archive.writestr("Most_frequent_keyphrases.csv", df_frequent.to_csv(index=False))
	        archive.writestr("Glossary_of_tags.csv", glossary.to_csv(index=False))
	    return buf.getvalue()


	def _finish_experiment(experiment, text, df):
	    """Log run metadata to Comet ML and always close the experiment.

	    Args:
	        experiment: The active Comet ``Experiment``.
	        text: The analysed input text (only its length is logged).
	        df: The results table, or ``None`` when no table was produced
	            (model failure or no keyphrases found).
	    """
	    try:
	        experiment.log_parameter("input_source_type", "text_area")
	        experiment.log_parameter("input_content_length", len(text))
	        if df is not None:
	            # NOTE(review): confirm Comet accepts a table name without a
	            # file extension such as ".csv".
	            experiment.log_table("predicted_entities", df)
	    except Exception as comet_e:
	        st.warning(f"Comet ML logging failed: {comet_e}")
	    # Closing is attempted separately so a logging error cannot leave the
	    # experiment open.
	    try:
	        experiment.end()
	    except Exception as comet_e:
	        st.warning(f"Comet ML experiment.end() failed: {comet_e}")


	if st.button("Results"):
	    # Validate the input (non-empty, within the word limit) before any work.
	    if not text.strip():
	        st.warning("Please enter some text to extract keyphrases.")
	    elif word_count > word_limit:
	        st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
	    else:
	        start_time_overall = time.time()
	        # Comet tracking is optional: a failure to start it only warns.
	        experiment = None
	        if comet_initialized:
	            try:
	                experiment = Experiment(
	                    api_key=COMET_API_KEY,
	                    workspace=COMET_WORKSPACE,
	                    project_name=COMET_PROJECT_NAME,
	                )
	            except Exception as e:
	                st.warning(f"Could not initialize Comet ML experiment: {e}")
	                experiment = None
	        # Bound before the try so the cleanup in ``finally`` can tell whether
	        # a results table was ever produced.
	        df = None
	        try:
	            with st.spinner("Analyzing text..."):
	                df = _entities_to_frame(model(text))
	                if df is None:
	                    st.warning("No keyphrases found in the text.")
	                    # The ``finally`` clause below still runs on this early exit.
	                    st.stop()
	                # --- All Extracted Keyphrases ---
	                st.subheader("All Extracted Keyphrases", divider="rainbow")
	                st.dataframe(df, use_container_width=True)
	                with st.expander("See Glossary of tags"):
	                    st.write('''
	- word: ['entity extracted from your text data']
	- score: ['accuracy score; how accurately a tag has been assigned to a given entity']
	- label: ['label (tag) assigned to a given extracted entity']
	- start: ['index of the start of the corresponding entity']
	- end: ['index of the end of the corresponding entity']
	''')
	                # --- Most Frequent Keyphrases ---
	                st.subheader("Most Frequent Keyphrases", divider="rainbow")
	                word_counts = df['word'].value_counts().reset_index()
	                word_counts.columns = ['word', 'count']
	                df_frequent = word_counts.sort_values(by='count', ascending=False).head(15)
	                if not df_frequent.empty:
	                    tab1, tab2 = st.tabs(["Table", "Chart"])
	                    with tab1:
	                        st.dataframe(df_frequent, use_container_width=True)
	                    with tab2:
	                        fig_frequent_bar = px.bar(
	                            df_frequent,
	                            x='count',
	                            y='word',
	                            orientation='h',
	                            title='Top Frequent Keyphrases by Count',
	                            color='count',
	                            color_continuous_scale=px.colors.sequential.Viridis,
	                        )
	                        fig_frequent_bar.update_layout(
	                            yaxis={'categoryorder': 'total ascending'},
	                            paper_bgcolor='#f0f8ff',  # background of the entire figure
	                            plot_bgcolor='#f0f8ff',  # background of the plotting area
	                        )
	                        st.plotly_chart(fig_frequent_bar, use_container_width=True)
	                    if experiment:
	                        experiment.log_figure(figure=fig_frequent_bar, figure_name="frequent_keyphrases_bar_chart")
	                else:
	                    st.info("No keyphrases found with more than one occurrence.")
	                # --- Treemap of All Keyphrases ---
	                st.subheader("Treemap of All Keyphrases", divider="rainbow")
	                # The table stores each tag in its 'label' column.
	                fig_treemap = px.treemap(
	                    df,
	                    path=[px.Constant("all"), 'label', 'word'],
	                    values='score',
	                    color='word',
	                    color_continuous_scale=px.colors.sequential.Plasma,
	                )
	                fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), paper_bgcolor='#f0f8ff', plot_bgcolor='#f0f8ff')
	                st.plotly_chart(fig_treemap, use_container_width=True)
	                if experiment:
	                    experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
	                # --- Download Section ---
	                dfa = pd.DataFrame(
	                    data={
	                        'Column Name': ['word', 'label', 'score', 'start', 'end'],
	                        'Description': [
	                            'keyphrase extracted from your text data',
	                            'label (tag) assigned to a given keyphrase',
	                            'accuracy score; how accurately a tag has been assigned',
	                            'index of the start of the corresponding entity',
	                            'index of the end of the corresponding entity',
	                        ],
	                    }
	                )
	                zip_bytes = _build_results_zip(df, df_frequent, dfa)
	                with stylable_container(
	                    key="download_button",
	                    css_styles="""button { background-color: red; border: 1px solid black; padding: 5px; color: white; }""",
	                ):
	                    st.download_button(
	                        label="Download zip file",
	                        data=zip_bytes,
	                        file_name="nlpblogs_ner_results.zip",
	                        mime="application/zip",
	                    )
	                st.divider()
	        except Exception as e:
	            st.error(f"An unexpected error occurred during processing: {e}")
	        finally:
	            if experiment:
	                _finish_experiment(experiment, text, df)
	        # Show elapsed time
	        end_time_overall = time.time()
	        elapsed_time_overall = end_time_overall - start_time_overall
	        st.info(f"Results processed in {elapsed_time_overall:.2f} seconds.")