# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "altair==5.5.0",
#     "eli5",
#     "fugashi-plus",
#     "ipython",
#     "marimo",
#     "numpy==2.2.6",
#     "pandas==2.3.0",
#     "pyarrow",
#     "scattertext==0.2.2",
#     "scikit-learn==1.7.0",
#     "scipy>=1.13.1",
# ]
# ///

import marimo

__generated_with = "0.14.13"
app = marimo.App(width="full", app_title="Scattertext on Japanese novels")

with app.setup:
    import io
    import random

    import eli5
    import fugashi
    import marimo as mo
    import numpy as np
    import pandas as pd
    import scattertext as st
    import scipy
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    RANDOM_SEED = 42
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)


@app.cell
def function_export():
    def get_tagger() -> fugashi.Tagger:
        """Load and cache a single fugashi Tagger instance."""
        return fugashi.Tagger("-Owakati -d ./unidic-novel -r ./unidic-novel/dicrc")

    def parse_texts(texts: list[str], tagger=get_tagger()) -> list[str]:
        """Tokenize a list of raw strings via fugashi (MeCab)."""
        return [tagger.parse(txt).strip() for txt in texts]
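
    # Note: the tagger default argument is evaluated once at definition time, so
    # every call to parse_texts reuses a single Tagger instance. The -d/-r paths
    # assume the unidic-novel dictionary has been downloaded next to this notebook.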

    def build_corpus_cached(
        texts: list[str],
        categories: list[str],
    ) -> st.Corpus:
        """Build or reuse cached Scattertext corpus."""
        df = pd.DataFrame({"text": texts, "category": categories})
        return (
            st.CorpusFromPandas(
                df,
                category_col="category",
                text_col="text",
                nlp=st.whitespace_nlp_with_sentences,
            )
            .build()
            .get_unigram_corpus()
            .compact(st.AssociationCompactor(2000))
        )
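
    # build_corpus_cached keeps only unigrams and compacts the vocabulary to
    # roughly the 2,000 terms most strongly associated with either category
    # (st.AssociationCompactor), which keeps the scatterplot readable.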

    def chunk_texts(
        texts: list[str],
        categories: list[str],
        filenames: list[str],
        chunk_size: int = 2000,
    ) -> tuple[list[str], list[str], list[str]]:
        """Chunk each text into segments of chunk_size tokens, preserving category and filename."""
        from pathlib import Path

        def format_chunk_label(fname: str, category: str, idx: int) -> str:
            stem = Path(fname).stem
            return f"{stem}-{category}#{idx}"

        chunked_texts = []
        chunked_cats = []
        chunked_labels = []
        for text, cat, fname in zip(texts, categories, filenames):
            tokens = text.split()
            for block_idx, start in enumerate(
                range(0, len(tokens), chunk_size), start=1
            ):
                chunk = " ".join(tokens[start : start + chunk_size])
                chunked_texts.append(chunk)
                chunked_cats.append(cat)
                chunked_labels.append(format_chunk_label(fname, cat, block_idx))
        return chunked_texts, chunked_cats, chunked_labels
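
    # Illustrative only (not executed):
    # chunk_texts(["a b c d e"], ["X"], ["f.txt"], chunk_size=2) returns
    # (["a b", "c d", "e"], ["X", "X", "X"], ["f-X#1", "f-X#2", "f-X#3"]).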

    def train_scikit_cached(
        texts: list[str], categories: list[str], filenames: list[str]
    ) -> tuple[
        st.Corpus,
        scipy.sparse.spmatrix,
        TfidfVectorizer,
        list[str],
        list[str],
    ]:
        """Fit TF-IDF + CountVectorizer & build a st.Corpus on chunked data."""
        chunk_texts_out, chunk_cats, chunk_fnames = chunk_texts(
            texts, categories, filenames
        )
        # fit TF-IDF once
        tfv = TfidfVectorizer()
        X_tfidf = tfv.fit_transform(chunk_texts_out)
        # reuse the same vocabulary for raw counts
        cv = CountVectorizer(vocabulary=tfv.vocabulary_)
        X_count = cv.transform(chunk_texts_out)
        y_codes = pd.Categorical(
            chunk_cats, categories=pd.Categorical(chunk_cats).categories
        ).codes
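        # y_codes maps each category label to an integer index following the
        # sorted order of pd.Categorical(...).categories, which is also the
        # order passed to category_names below.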
        scikit_corpus = st.CorpusFromScikit(
            X=X_count,
            y=y_codes,
            feature_vocabulary=tfv.vocabulary_,
            category_names=list(pd.Categorical(chunk_cats).categories),
            raw_texts=chunk_texts_out,
        ).build()
        return (
            scikit_corpus,
            X_tfidf,
            tfv,
            chunk_cats,
            chunk_fnames,
        )

    return build_corpus_cached, chunk_texts, parse_texts, train_scikit_cached


@app.cell
def intro():
    mo.md(
        r"""
# Scattertext on Japanese novels / 近代文学作品のScattertext可視化

## 概要

2つの異なるカテゴリのテキストファイル群をアップロードし、その差異をScattertextで可視化します。
オプションで機械学習モデルによる分類も行い、モデルの分類精度と、モデルが識別に用いるトークンも確認できます。

## ワークフロー

1. テキストファイルをアップロード(デフォルトを使う場合はそのままSubmitしてください)
2. データ内容を確認・修正
3. チャンク&サンプリング設定
4. Scattertextによる可視化
5. (任意)分類モデルによる性能検証

> 単語分割には、[近現代口語小説UniDic](https://clrd.ninjal.ac.jp/unidic/download_all.html#unidic_novel)を使用しています。異なる時代やジャンルのテキストには不向きです。
"""
    )
    return


@app.cell
def data_settings():
    # 1) Create each widget
    category_name = mo.ui.text(
        label="カテゴリ名(例:著者名・時代区分など)",
        placeholder="例:時代・性別・著者など",
        value="著者",
        full_width=True,
    )
    label_a = mo.ui.text(
        label="Aのラベル", placeholder="例:夏目漱石", value="夏目漱石", full_width=True
    )
    files_a = mo.ui.file(
        label="Aのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
    )
    label_b = mo.ui.text(
        label="Bのラベル", placeholder="例:海野十三", value="海野十三", full_width=True
    )
    files_b = mo.ui.file(
        label="Bのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
    )

    tpl = r"""
## データと分析の設定

※ 初期設定では夏目漱石と海野十三から各2作品をサンプルコーパスにしています。設定を変更せずSubmitすると、サンプルコーパスでの分析になります。ファイルをアップロードする場合は忘れずにカテゴリとラベルも変更してください。

※ ファイルはプレーンテキスト形式必須(.txt, UTF-8エンコーディング)

{category_name}

### グループA

{label_a}
{files_a}

### グループB

{label_b}
{files_b}
"""

    data_form = (
        mo.md(tpl)
        .batch(
            # info_box=info_box,
            category_name=category_name,
            label_a=label_a,
            files_a=files_a,
            label_b=label_b,
            files_b=files_b,
        )
        .form(show_clear_button=True, bordered=True)
    )
    data_form
    return data_form, label_a, label_b


@app.cell
def data_check(data_form, parse_texts):
    mo.stop(data_form.value is None)

    import itertools
    from pathlib import Path

    validation_messages: list[str] = []
    if data_form.value["label_a"] == data_form.value["label_b"]:
        validation_messages.append(
            "⚠️ **警告**: グループAとBのラベルが同じです。AとBは異なるラベルを設定してください。\n"
        )
    if not data_form.value["files_a"] and not data_form.value["files_b"]:
        validation_messages.append(
            "ℹ️ ファイルが未指定のため、デフォルトサンプルを使用しています。\n"
        )

    try:
        # Group A: either uploaded files or default (坊っちゃん + こころ)
        if data_form.value["files_a"]:
            category_a_texts = (
                f.contents.decode("utf-8") for f in data_form.value["files_a"]
            )
            category_a_names = (f.name for f in data_form.value["files_a"])
        else:
            natsume_1 = Path("Natsume_S_Bocchan.txt").read_text(encoding="utf-8")
            natsume_2 = Path("Natsume_S_Kokoro.txt").read_text(encoding="utf-8")
            category_a_texts = [natsume_1, natsume_2]
            category_a_names = ["Natsume_S_Bocchan.txt", "Natsume_S_Kokoro.txt"]

        # Group B: either uploaded files or default (地球要塞 + 火星兵団)
        if data_form.value["files_b"]:
            category_b_texts = (
                f.contents.decode("utf-8") for f in data_form.value["files_b"]
            )
            category_b_names = (f.name for f in data_form.value["files_b"])
        else:
            unno_1 = Path("Unno_J_Chikyuuyousa.txt").read_text(encoding="utf-8")
            unno_2 = Path("Unno_J_Kaseiheidan.txt").read_text(encoding="utf-8")
            category_b_texts = [unno_1, unno_2]
            category_b_names = ["Unno_J_Chikyuuyousa.txt", "Unno_J_Kaseiheidan.txt"]

        nA = len(data_form.value["files_a"]) or 2
        nB = len(data_form.value["files_b"]) or 2
        categories = [data_form.value["label_a"]] * nA + [
            data_form.value["label_b"]
        ] * nB
        filenames = itertools.chain(category_a_names, category_b_names)
        texts = itertools.chain(category_a_texts, category_b_texts)

        data = pd.DataFrame(
            {
                "category": categories,
                "filename": filenames,
                "text": texts,
            }
        )
        with mo.status.spinner("コーパスを形態素解析中..."):
            data["text"] = parse_texts(list(data["text"]))
    except Exception as e:
        data = None
        validation_messages.append(
            f"❌ **エラー**: ファイルの読み込みに失敗しました: {str(e)}\n"
        )

    # Stop here if loading failed, showing the collected messages instead.
    mo.stop(data is None, mo.md("\n".join(f"- {m}" for m in validation_messages)))

    # We need the maximum number of tokens for the slider
    max_tokens = data["text"].map(lambda s: len(s.split())).max()

    mo.md(f"""
## データ確認

{"**警告**:\n" if validation_messages else ""}
{"\n".join(map(lambda x: f"- {x}", validation_messages))}

解析済テキスト一覧:

{mo.ui.table(data, selection="multi", format_mapping={"text": lambda s: s[:20] + "..."})}
""")
    return (data,)


@app.cell
def sampling_controls_setup():
    chunk_size = mo.ui.slider(
        start=50,
        stop=50_000,
        value=2000,
        step=50,
        label="1チャンクあたり最大トークン数",
        full_width=True,
        include_input=True,
    )
    sample_frac = mo.ui.slider(
        start=0.1,
        stop=1.0,
        value=0.2,
        step=0.05,
        label="使用割合(1.0で全データ)",
        full_width=True,
        include_input=True,
    )
    sampling_form = (
        mo.md("{chunk_size}\n{sample_frac}")
        .batch(chunk_size=chunk_size, sample_frac=sample_frac)
        .form(show_clear_button=True, bordered=False)
    )
    sampling_form
    return chunk_size, sample_frac, sampling_form


@app.cell
def _(build_corpus_cached, chunk_texts, data, sampling_form):
    mo.stop(sampling_form.value is None)

    with mo.status.spinner("コーパスをサンプリング中…"):
        chunked_texts, cats, fnames = chunk_texts(
            list(data.text),
            list(data.category),
            list(data.filename),
            sampling_form.value["chunk_size"],
        )
        if sampling_form.value["sample_frac"] < 1.0:
            N = len(chunked_texts)
            k = int(N * sampling_form.value["sample_frac"])
            idx = random.sample(range(N), k)
            chunked_texts = [chunked_texts[i] for i in idx]
            cats = [cats[i] for i in idx]
            fnames = [fnames[i] for i in idx]
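            # e.g. with 120 chunks and sample_frac=0.25, k = int(120 * 0.25) = 30
            # chunks are kept, chosen at random without replacement.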
        corpus = build_corpus_cached(
            chunked_texts,
            cats,
        )
    return cats, chunked_texts, corpus, fnames


@app.cell
def sampling_controls(chunk_size):
    mo.md("トークン数を増やすと処理時間が長くなります").callout(
        kind="info"
    ) if chunk_size.value > 30_000 else None
    return


@app.cell
def plot_main_scatterplot(corpus, data_form, fnames):
    cat_name = data_form.value["category_name"]
    with mo.status.spinner("Scatterplot作成中…"):
        html = st.produce_scattertext_explorer(
            corpus,
            category=data_form.value["label_a"],
            category_name=f"{cat_name}: {data_form.value['label_a']}",
            not_category_name=f"{cat_name}: {data_form.value['label_b']}",
            width_in_pixels=1000,
            metadata=fnames,
        )
    mo.vstack(
        [
            mo.md(f"""
# Scattertextの結果

### Scattertext可視化の見方

- (縦)上に行くほど{data_form.value["label_a"]}で相対的に多く使われるトークン
- (横)右に行くほど{data_form.value["label_b"]}で相対的に多く使われるトークン

HTMLをダウンロードしてブラウザで開くと見やすいです。
"""),
            mo.iframe(html),
        ]
    )
    return (html,)


@app.cell
def _(html):
    download_button = mo.download(
        data=html.encode(),
        filename="scattertext_analysis.html",
        label="可視化結果をダウンロード",
    )
    mo.md(f"{download_button}")
    return


@app.cell
def classification_toggle():
    run_model = mo.ui.switch(label="分類モデルを適用する")
    run_model
    return (run_model,)


@app.cell
def _(run_model):
    mo.stop(not run_model.value)

    mo.md(
        r"""
# 分類モデルによる検証

2つのカテゴリを分類するモデルを学習し、それぞれのカテゴリを分ける有効な素性(単語)がどれなのかも確認できます。

既定ではロジスティック回帰を使用しますが、ランダムフォレストや勾配ブースティングも選択できます。
"""
    )
    return


@app.cell
def _(cats, chunked_texts, fnames, run_model, train_scikit_cached):
    mo.stop(not run_model.value)

    scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
        chunked_texts, cats, fnames
    )
    # also send the raw text chunks onward for download
    return chunk_cats, vectorizer


@app.cell
def model_selection(run_model):
    mo.stop(not run_model.value)

    model_dropdown = mo.ui.dropdown(
        options=[
            "LogisticRegression",
            "RandomForestClassifier",
            "GradientBoostingClassifier",
        ],
        value="LogisticRegression",
        label="モデル選択",
    )
    model_dropdown
    return (model_dropdown,)


@app.cell
def hyperparameters(model_dropdown):
    lr_C = mo.ui.slider(0.01, 10.0, value=1.0, step=0.01, label="LR C")
    lr_max_iter = mo.ui.slider(100, 2000, value=1000, step=100, label="LR max_iter")
    rf_n = mo.ui.slider(10, 500, value=100, step=10, label="RF n_estimators")
    rf_max_depth = mo.ui.slider(1, 50, value=10, step=1, label="RF max_depth")
    gb_n = mo.ui.slider(10, 500, value=100, step=10, label="GB n_estimators")
    gb_lr = mo.ui.slider(0.01, 1.0, value=0.1, step=0.01, label="GB learning_rate")
    gb_md = mo.ui.slider(1, 10, value=3, step=1, label="GB max_depth")

    # Show only the hyperparameters relevant to the selected model.
    if model_dropdown.value == "LogisticRegression":
        widgets = {"lr_C": lr_C, "lr_max_iter": lr_max_iter}
    elif model_dropdown.value == "RandomForestClassifier":
        widgets = {"rf_n": rf_n, "rf_max_depth": rf_max_depth}
    else:  # GradientBoostingClassifier
        widgets = {"gb_n": gb_n, "gb_lr": gb_lr, "gb_md": gb_md}

    test_size = mo.ui.slider(0.1, 0.5, value=0.3, step=0.05, label="テストデータ比率")

    model_form = (
        mo.md("### モデルのパラメータ設定\n{widgets}\n{test_size}")
        .batch(
            widgets=mo.ui.dictionary(widgets),
            test_size=test_size,
        )
        .form(show_clear_button=True, bordered=False)
    )
    model_form
    return (model_form,)


@app.cell
def _(
    chunk_cats,
    chunked_texts,
    label_a,
    label_b,
    model_dropdown,
    model_form,
    run_model,
    vectorizer,
):
    mo.stop(not run_model.value or not model_form.value)

    import altair as alt
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import (
        classification_report,
        confusion_matrix,
    )
    from sklearn.model_selection import train_test_split

    # split the raw text chunks and labels
    texts_train, texts_test, y_train, y_test = train_test_split(
        chunked_texts,
        chunk_cats,
        test_size=model_form.value["test_size"],
        random_state=RANDOM_SEED,
    )
    # still need feature matrices for training the classifier
    X_train = vectorizer.transform(texts_train)
    X_test = vectorizer.transform(texts_test)
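
    # Note: vectorizer (the TfidfVectorizer) was fit on all chunks inside
    # train_scikit_cached, so its vocabulary and IDF statistics are shared across
    # the train/test split; here it is only used to transform each split.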

    def get_train_data():
        df = pd.DataFrame(
            {
                "text": texts_train,
                "label": y_train,
            }
        )
        buf = io.BytesIO()
        df.to_csv(buf, index=False)
        return buf.getvalue()

    download_training_dataset = mo.download(
        data=get_train_data,
        filename="train.csv",
        mimetype="text/csv",
        label="Download training data",
    )

    def get_test_data():
        df = pd.DataFrame(
            {
                "text": texts_test,
                "label": y_test,
            }
        )
        buf = io.BytesIO()
        df.to_csv(buf, index=False)
        return buf.getvalue()

    download_test_dataset = mo.download(
        data=get_test_data,
        filename="test.csv",
        mimetype="text/csv",
        label="Download test data",
    )

    def get_all_data():
        # concatenate train and test text+labels
        df_all = pd.DataFrame(
            {
                "text": texts_train + texts_test,
                "label": list(y_train) + list(y_test),
            }
        )
        buf = io.BytesIO()
        df_all.to_csv(buf, index=False)
        return buf.getvalue()

    download_all_dataset = mo.download(
        data=get_all_data,
        filename="all_data.csv",
        mimetype="text/csv",
        label="Download full dataset",
    )

    name = model_dropdown.value
    if name == "LogisticRegression":
        clf = LogisticRegression(
            C=model_form.value["widgets"]["lr_C"],
            max_iter=int(model_form.value["widgets"]["lr_max_iter"]),
        )
    elif name == "RandomForestClassifier":
        clf = RandomForestClassifier(
            n_estimators=int(model_form.value["widgets"]["rf_n"]),
            max_depth=int(model_form.value["widgets"]["rf_max_depth"]),
            random_state=RANDOM_SEED,
        )
    else:
        clf = GradientBoostingClassifier(
            n_estimators=int(model_form.value["widgets"]["gb_n"]),
            learning_rate=float(model_form.value["widgets"]["gb_lr"]),
            max_depth=int(model_form.value["widgets"]["gb_md"]),
            random_state=RANDOM_SEED,
        )

    clf.fit(X_train, y_train)
    if hasattr(clf, "feature_importances_"):
        term_scores = clf.feature_importances_
    else:
        term_scores = abs(clf.coef_[0])
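
    # For the linear model term_scores are absolute coefficient magnitudes; for
    # the tree ensembles they are impurity-based feature importances. Either way,
    # the i-th score refers to the i-th feature in vectorizer.get_feature_names_out().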

    y_pred = clf.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    cm_df = (
        pd.DataFrame(cm, index=clf.classes_, columns=clf.classes_)
        .reset_index()
        .melt(
            id_vars="index",
            var_name="Predicted",
            value_name="count",
        )
        .rename(columns={"index": "Actual"})
    )
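
    # cm_df is the confusion matrix reshaped to long format: one row per
    # (Actual, Predicted) label pair with a "count" column, which is the shape
    # the Altair heatmap below expects.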

    # pos_idx = list(clf.classes_).index(label_a.value)
    # _proba, roc_auc = None, None
    # roc_df = None
    # if hasattr(clf, "predict_proba"):
    #     probs = clf.predict_proba(X_test)[:, pos_idx]
    #     y_test_arr = np.array(y_test)
    #     fpr, tpr, _ = roc_curve((y_test_arr == label_a.value).astype(int), probs)
    #     roc_auc = auc(fpr, tpr)
    #     roc_df = pd.DataFrame({"fpr": fpr, "tpr": tpr})

    # feature_names = vectorizer.get_feature_names_out()
    # importances = (
    #     pd.DataFrame({"単語": feature_names, "重要度": term_scores})
    #     .sort_values("重要度", ascending=False)
    #     .head(20)
    # )
    # imp_chart = (
    #     alt.Chart(importances)
    #     .mark_bar()
    #     .encode(
    #         x=alt.X("重要度:Q", title="重要度"),
    #         y=alt.Y("単語:N", sort="-x"),
    #     )
    #     .properties(title="Top-20 重要特徴語", width=600, height=400)
    # )

    cm_chart = (
        alt.Chart(cm_df)
        .mark_rect()
        .encode(
            x="Predicted:N",
            y="Actual:N",
            color=alt.Color("count:Q", title="Count"),
            tooltip=["Actual", "Predicted", "count"],
        )
        .properties(title="Confusion Matrix", width=250, height=250)
    )

    # roc_chart = (
    #     alt.Chart(roc_df)
    #     .mark_line(point=True)
    #     .encode(
    #         x=alt.X("fpr:Q", title="False Positive Rate"),
    #         y=alt.Y("tpr:Q", title="True Positive Rate"),
    #     )
    #     .properties(
    #         title=f"ROC Curve (AUC={roc_auc:.2f})",
    #         width=400,
    #         height=300,
    #     )
    # )

    mo.vstack(
        [
            mo.hstack(
                [
                    download_training_dataset,
                    download_test_dataset,
                    download_all_dataset,
                ],
                align="start",
            ),
            eli5.show_weights(clf, vec=vectorizer, target_names=clf.classes_, top=20),
            # mo.ui.altair_chart(imp_chart),
            mo.ui.altair_chart(cm_chart),
            # mo.ui.altair_chart(roc_chart),  # Turned out not to be very informative; the task may be too easy.
            mo.md(f"""
## テストセット上の分類性能

- {label_a.value}: 適合率 {report[label_a.value]["precision"]:.2%}, 再現率 {report[label_a.value]["recall"]:.2%}
- {label_b.value}: 適合率 {report[label_b.value]["precision"]:.2%}, 再現率 {report[label_b.value]["recall"]:.2%}
"""),
        ]
    )
    return


@app.cell
def _():
    # mo.stop(not run_model.value or not model_form.value)

    # with mo.status.spinner("分類モデルのScatterplotを作成中…"):
    #     scikit_html = st.produce_scattertext_explorer(
    #         corpus=scikit_corpus,
    #         category=data_form.value["label_a"],
    #         category_name=data_form.value["label_a"],
    #         not_category_name=data_form.value["label_b"],
    #         scores=term_scores,
    #         terms_to_include=st.AutoTermSelector.get_selected_terms(
    #             scikit_corpus, term_scores, 4000
    #         ),
    #         metadata=chunk_fnames,
    #         transform=lambda freqs, _index, total: freqs / total.sum(),
    #         rescale_x=lambda arr: arr,  # identity
    #         rescale_y=lambda arr: arr,  # identity
    #     )
    # mo.iframe(scikit_html)
    return


if __name__ == "__main__":
    app.run()