"""County-level unemployment-rate forecasting demo.

Gradio app: upload a county-embeddings CSV and a wide unemployment CSV
(one column per date), train several regressors on PCA-reduced features,
and visualize error metrics and feature importances.
"""

import warnings

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')


def load_embeddings(embeddings_file_path):
    """Load county embeddings and fit a mean imputer + PCA on the numeric columns.

    Parameters
    ----------
    embeddings_file_path : str
        CSV with a 'place' key column and numeric embedding columns.

    Returns
    -------
    tuple
        (raw embeddings DataFrame indexed by place,
         PCA-transformed ndarray,
         fitted PCA,
         fitted SimpleImputer)
    """
    county_embeddings = pd.read_csv(embeddings_file_path).set_index('place')
    numeric_cols = county_embeddings.select_dtypes(include=['number']).columns
    county_embeddings_numeric = county_embeddings[numeric_cols]

    imputer = SimpleImputer(strategy='mean')
    county_embeddings_imputed = imputer.fit_transform(county_embeddings_numeric)

    # Cap the component count so PCA never requests more components than the
    # uploaded data can supply (the hard-coded 330 crashed on smaller files).
    n_components = min(330, *county_embeddings_imputed.shape)
    pca = PCA(n_components=n_components)
    pca.fit(county_embeddings_imputed)
    county_embeddings_pca = pca.transform(county_embeddings_imputed)

    return county_embeddings, county_embeddings_pca, pca, imputer


def load_unemployment_data(unemployment_file_path):
    """Read a wide unemployment CSV (one column per date) and melt to long format.

    Returns a DataFrame with columns: place, date, unemployment_rate.
    """
    unemployment_data = pd.read_csv(unemployment_file_path).set_index('place')
    unemployment_long = unemployment_data.reset_index().melt(
        id_vars='place', var_name='date', value_name='unemployment_rate'
    )
    return unemployment_long


def preprocess_data(county_embeddings, county_embeddings_pca, unemployment_long,
                    pca, imputer):
    """Split the long unemployment rows, join embeddings, impute, and apply PCA.

    Returns
    -------
    tuple
        (X_train_pca, X_test_pca, y_train, y_test, numeric feature column Index)
    """
    X = unemployment_long.drop('unemployment_rate', axis=1)
    y = unemployment_long['unemployment_rate']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Normalize the join-key dtype on both sides before merging so string and
    # integer place codes still match.
    county_embeddings.index = county_embeddings.index.astype(str)
    X_train['place'] = X_train['place'].astype(str)
    X_test['place'] = X_test['place'].astype(str)
    X_train = X_train.merge(county_embeddings, left_on='place',
                            right_index=True, how='left')
    X_test = X_test.merge(county_embeddings, left_on='place',
                          right_index=True, how='left')

    # Keep only numeric columns; 'place' and 'date' are strings and drop out,
    # leaving exactly the embedding columns the imputer/PCA were fitted on.
    numeric_cols_train = X_train.select_dtypes(include=['number']).columns
    X_train_numeric = X_train[numeric_cols_train]
    numeric_cols_test = X_test.select_dtypes(include=['number']).columns
    X_test_numeric = X_test[numeric_cols_test]

    # Reuse the imputer and PCA fitted on the embeddings table (no refitting,
    # so there is no train/test leakage from the test rows).
    X_train_imputed = imputer.transform(X_train_numeric)
    X_test_imputed = imputer.transform(X_test_numeric)
    X_train_pca = pca.transform(X_train_imputed)
    X_test_pca = pca.transform(X_test_imputed)

    return X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train


def train_and_evaluate_models(X_train_pca, X_test_pca, y_train, y_test,
                              numeric_cols_train):
    """Fit each candidate regressor and collect RMSE / R-squared and importances.

    Returns
    -------
    tuple
        (results dict keyed by model name,
         feature_importances dict for models exposing `feature_importances_`,
         numeric_cols_train passed through unchanged)
    """
    # NOTE: CPU settings so the app also runs on machines without a CUDA GPU —
    # the original 'gpu_hist' / task_type="GPU" options crash when no GPU is
    # present, and 'gpu_hist' is deprecated in current XGBoost anyway.
    models = {
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=100, random_state=42,
                                tree_method='hist'),
        "Ridge Regression": Ridge(alpha=1.0),
        "CatBoost": CatBoostRegressor(iterations=100, random_seed=42,
                                      verbose=False),
    }

    results = {}
    feature_importances = {}
    for name, model in models.items():
        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        results[name] = {'RMSE': rmse, 'R-squared': r2}
        # Ridge has no feature_importances_; tree models do.
        if hasattr(model, 'feature_importances_'):
            feature_importances[name] = model.feature_importances_

    return results, feature_importances, numeric_cols_train


def plot_feature_importance(importances, feature_names, model_name):
    """Return a bar chart of the 20 largest feature importances for one model."""
    feature_importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    )
    feature_importance_df = (
        feature_importance_df.sort_values('Importance', ascending=False).head(20)
    )

    fig = plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
    plt.title(f'{model_name} Feature Importance')
    plt.tight_layout()
    # BUG FIX: the original called plt.close() and then returned plt.gcf(),
    # which creates a brand-new EMPTY figure. Capture the figure first, close
    # it (so pyplot drops its reference), and return the populated object.
    plt.close(fig)
    return fig


def plot_metrics(results):
    """Return (rmse_figure, r2_figure) comparing all trained models."""
    metrics_df = (
        pd.DataFrame(results).T.reset_index().rename(columns={'index': 'Model'})
    )

    # Same close-before-gcf bug fix as plot_feature_importance: keep a handle
    # to each figure before closing it.
    rmse_plot = plt.figure(figsize=(8, 6))
    sns.barplot(x='Model', y='RMSE', data=metrics_df)
    plt.title('RMSE for Each Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.close(rmse_plot)

    r2_plot = plt.figure(figsize=(8, 6))
    sns.barplot(x='Model', y='R-squared', data=metrics_df)
    plt.title('R-squared for Each Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.close(r2_plot)

    return rmse_plot, r2_plot


def main(embeddings_file_path, unemployment_file_path):
    """Run the full pipeline: load, preprocess, train, evaluate, and plot.

    Returns
    -------
    tuple
        (results dict, rmse figure, r2 figure, dict of importance figures)
    """
    county_embeddings, county_embeddings_pca, pca, imputer = load_embeddings(
        embeddings_file_path
    )
    unemployment_long = load_unemployment_data(unemployment_file_path)

    X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train = preprocess_data(
        county_embeddings, county_embeddings_pca, unemployment_long, pca, imputer
    )

    results, feature_importances, feature_names = train_and_evaluate_models(
        X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train
    )

    rmse_plot, r2_plot = plot_metrics(results)

    # Features entering the models are PCA components, so label them PC1..PCn.
    feature_importance_plots = {}
    for model_name, importances in feature_importances.items():
        fig = plot_feature_importance(
            importances,
            [f'PC{i + 1}' for i in range(len(importances))],
            model_name,
        )
        feature_importance_plots[model_name] = fig

    return results, rmse_plot, r2_plot, feature_importance_plots


def gradio_app():
    """Build and launch the Gradio UI wrapping `main`."""
    with gr.Blocks() as demo:
        gr.Markdown("# County-Level Unemployment Rate Forecasting")
        gr.Markdown(
            "Upload county embeddings and unemployment data to train models "
            "and visualize results."
        )
        with gr.Row():
            embeddings_file = gr.File(label="Upload County Embeddings CSV")
            unemployment_file = gr.File(label="Upload Unemployment Data CSV")
        run_button = gr.Button("Run Analysis")
        output_results = gr.JSON(label="Model Performance Metrics")
        output_rmse_plot = gr.Plot(label="RMSE Comparison")
        output_r2_plot = gr.Plot(label="R-squared Comparison")
        output_feature_importance = gr.Plot(label="Feature Importances")

        def run_analysis(embeddings_file, unemployment_file):
            # Guard clause: both uploads are required before running anything.
            if embeddings_file is None or unemployment_file is None:
                return (
                    gr.update(
                        value="Please upload both embeddings and unemployment "
                              "data files."
                    ),
                    None,
                    None,
                    None,
                )

            embeddings_file_path = embeddings_file.name
            unemployment_file_path = unemployment_file.name

            results, rmse_plot, r2_plot, feature_importance_plots = main(
                embeddings_file_path, unemployment_file_path
            )

            # For simplicity, display the Random Forest importances if present.
            fi_plot = feature_importance_plots.get('Random Forest')

            return results, rmse_plot, r2_plot, fi_plot

        run_button.click(
            run_analysis,
            inputs=[embeddings_file, unemployment_file],
            outputs=[
                output_results,
                output_rmse_plot,
                output_r2_plot,
                output_feature_importance,
            ],
        )

    demo.launch()


if __name__ == "__main__":
    gradio_app()