AshenH committed on
Commit
980e633
·
verified ·
1 Parent(s): 2e1969a

Update tools/ts_preprocess.py

Browse files
Files changed (1) hide show
  1. tools/ts_preprocess.py +99 -87
tools/ts_preprocess.py CHANGED
@@ -1,108 +1,120 @@
1
- # space/tools/ts_preprocess.py
2
  import pandas as pd
3
  import numpy as np
4
- from typing import List
 
5
 
6
# Pandas frequency alias for month-start timestamps.
MONTH = "MS"
 
 
 
 
7
 
 
 
 
 
 
 
 
 
 
8
 
9
- def _emi(principal: float, annual_rate: float, n_months: int) -> float:
10
- """
11
- EMI formula with monthly compounding.
12
- r_m = annual_rate / 12
13
- EMI = P * r_m * (1+r_m)^n / ((1+r_m)^n - 1)
14
- """
15
- if n_months <= 0 or principal <= 0:
16
- return 0.0
17
  r = annual_rate / 12.0
18
- if r <= 0:
19
- return principal / n_months
20
- fac = (1.0 + r) ** n_months
21
- return principal * r * fac / (fac - 1.0)
22
-
23
 
24
- def _project_deposit(principal: float, annual_rate: float, months: int) -> pd.DataFrame:
25
- """
26
- Monthly path for a deposit. Value compounds monthly.
27
- """
28
- r = annual_rate / 12.0
29
- data = []
30
  bal = principal
31
- for m in range(months + 1):
32
- data.append({"step": m, "portfolio_value": bal})
33
- bal = bal * (1.0 + r)
34
- return pd.DataFrame(data)
35
-
 
 
 
 
 
 
 
36
 
37
def _project_asset(principal: float, annual_rate: float, tenor_months: int) -> pd.DataFrame:
    """Build a monthly amortisation schedule for an asset/loan using EMI.

    Args:
        principal: Balance outstanding at step 0.
        annual_rate: Annual rate as a fraction; the monthly rate is
            ``annual_rate / 12``.
        tenor_months: Number of instalments. The result has
            ``tenor_months + 1`` rows (steps ``0..tenor_months``).

    Returns:
        DataFrame with columns ``step``, ``portfolio_value`` (balance at the
        start of the step), ``emi``, ``interest_component``,
        ``principal_component`` and ``remaining_balance`` (balance after the
        step). Empty, with the same columns, when ``tenor_months`` is negative.
    """
    columns = [
        "step",
        "portfolio_value",
        "emi",
        "interest_component",
        "principal_component",
        "remaining_balance",
    ]
    emi = _emi(principal, annual_rate, tenor_months)
    r = annual_rate / 12.0
    data = []
    bal = principal
    for m in range(tenor_months + 1):
        interest = bal * r
        # Never repay more principal than is outstanding; otherwise the
        # terminal row reports a full instalment against a zero balance.
        principal_pay = min(bal, max(0.0, emi - interest))
        next_bal = max(0.0, bal - principal_pay)
        data.append({
            "step": m,
            "portfolio_value": bal,
            "emi": emi,
            "interest_component": interest,
            "principal_component": principal_pay,
            "remaining_balance": next_bal,
        })
        bal = next_bal
    return pd.DataFrame(data, columns=columns)
59
 
 
 
 
 
60
 
61
def build_timeseries(df: pd.DataFrame) -> pd.DataFrame:
    """Project every input row into a monthly time series and stack the results.

    Input columns read (all via ``row.get`` except the date):
        - ``portfolio_date`` (datetime or str), used to derive ``timestamp``
          when that column is absent
        - ``instrument_type``: ``'Deposit'`` or ``'Asset'`` (case-insensitive);
          any other value is projected as a flat balance
        - ``balance``: float
        - ``interest_rate``: annual rate (e.g. ``0.12``)
        - ``time_to_maturity``: months
        - ``tenor_months``: months, for assets; falls back to
          ``time_to_maturity`` when missing, NaN or zero

    Returns:
        Long frame with month-start ``timestamp`` values and the projected
        ``portfolio_value`` (plus the EMI breakdown for assets), sorted by
        ``timestamp`` and, when present, ``instrument_type``. An input with no
        rows yields an empty frame instead of raising.
    """
    def _int_or(value, default: int) -> int:
        # Treat None/NaN as "missing"; int(NaN) would raise.
        if value is None or pd.isna(value):
            return default
        return int(value or default)

    df = df.copy()
    if "timestamp" not in df.columns:
        df["timestamp"] = pd.to_datetime(df["portfolio_date"])

    # Identifier columns copied onto each schedule, when the input has them.
    carried = [col for col in ("instrument_type", "interest_rate") if col in df.columns]

    out_frames: list = []
    for _, row in df.iterrows():
        itype = str(row.get("instrument_type", "")).strip().lower()
        start = pd.to_datetime(row["timestamp"])
        months = _int_or(row.get("time_to_maturity"), 0)
        principal = float(row.get("balance", 0.0) or 0.0)
        annual_rate = float(row.get("interest_rate", 0.0) or 0.0)

        if itype == "deposit":
            sched = _project_deposit(principal, annual_rate, months)
        elif itype == "asset":
            tenor = _int_or(row.get("tenor_months"), months)
            sched = _project_asset(principal, annual_rate, tenor)
        else:
            # Unknown types: keep the balance flat.
            sched = pd.DataFrame({"step": range(months + 1), "portfolio_value": principal})

        # One month-start timestamp per schedule row.
        sched["timestamp"] = pd.date_range(start=start, periods=len(sched), freq=MONTH)
        for col in carried:
            sched[col] = row.get(col)
        sched["origin_portfolio_date"] = start
        sched["origin_balance"] = principal

        out_frames.append(sched)

    if not out_frames:
        # pd.concat([]) raises; hand back an empty frame with the base schema.
        return pd.DataFrame(
            columns=["step", "portfolio_value", "timestamp", *carried,
                     "origin_portfolio_date", "origin_balance"]
        )

    ts = pd.concat(out_frames, ignore_index=True)
    # Sort only by keys that exist: instrument_type is optional in the input.
    sort_keys = [key for key in ("timestamp", "instrument_type") if key in ts.columns]
    return ts.sort_values(sort_keys).reset_index(drop=True)
 
1
+ # tools/ts_preprocess.py
2
  import pandas as pd
3
  import numpy as np
4
+ from datetime import date
5
+ from typing import Tuple
6
 
7
+ # -------------------------
8
+ # Cash-flow projections
9
+ # -------------------------
10
def estimated_monthly_interest(principal: float, annual_rate: float, n_months: int) -> float:
    """Return simple (non-compounded) interest accrued over ``n_months``.

    Despite the name, the result is the total over all ``n_months``, i.e.
    ``principal * (annual_rate / 12) * n_months``; pass ``n_months=1`` for a
    single month's interest.

    Args:
        principal: Amount the interest is computed on.
        annual_rate: Annual rate as a fraction (e.g. ``0.12``).
        n_months: Number of months to accrue.
    """
    monthly_rate = annual_rate / 12.0
    return principal * monthly_rate * n_months
12
 
13
def project_deposit(principal: float, annual_rate: float, months: int, start=None) -> pd.DataFrame:
    """Project monthly cash flows for a deposit paying simple monthly interest.

    Each month pays ``principal * annual_rate / 12``; the principal itself is
    returned in the final month.

    Args:
        principal: Deposit amount.
        annual_rate: Annual rate as a fraction (e.g. ``0.12``).
        months: Number of monthly periods; values below 1 are treated as 1.
        start: Optional date-like anchor for the first period. It is snapped
            to the first day of its month. Defaults to today's date.

    Returns:
        DataFrame with one row per month and columns ``period`` (month-start
        timestamp), ``interest``, ``principal_cf`` and ``cash_flow``.
    """
    anchor = date.today() if start is None else start
    first = pd.Timestamp(anchor).normalize().replace(day=1)
    periods = pd.period_range(start=first, periods=max(int(months), 1), freq="M")
    df = pd.DataFrame({"period": periods.to_timestamp()})
    df["interest"] = principal * (annual_rate / 12.0)
    df["principal_cf"] = 0.0
    # Principal is repaid in a single bullet on the last period.
    df.loc[df.index[-1], "principal_cf"] = principal
    df["cash_flow"] = df["interest"] + df["principal_cf"]
    return df
22
 
23
def project_asset(principal: float, annual_rate: float, months: int, start=None) -> pd.DataFrame:
    """Project a level-payment amortisation schedule for an asset.

    The payment is ``principal / m`` at a zero rate, otherwise the annuity
    payment ``P * r * (1 + r)^m / ((1 + r)^m - 1)`` with ``r = annual_rate / 12``.

    Args:
        principal: Opening balance.
        annual_rate: Annual rate as a fraction (e.g. ``0.12``).
        months: Number of payments; values below 1 are treated as 1.
        start: Optional date-like anchor. It is snapped to the first day of
            its month, and payment ``i`` is dated ``i`` month-ends after that.
            Defaults to today's date.

    Returns:
        DataFrame with one row per payment and columns ``period``,
        ``interest``, ``principal_cf``, ``cash_flow`` (the payment) and
        ``balance`` (outstanding after the payment, floored at 0).
    """
    m = max(int(months), 1)
    r = annual_rate / 12.0
    if r == 0.0:
        pmt = principal / m
    else:
        growth = (1 + r) ** m
        pmt = principal * (r * growth) / (growth - 1)

    anchor = date.today() if start is None else start
    first = pd.Timestamp(anchor).normalize().replace(day=1)

    rows = []
    bal = principal
    for i in range(1, m + 1):
        interest = bal * r
        principal_cf = pmt - interest
        # Floor at zero so float residue never shows as a negative balance.
        bal = max(0.0, bal - principal_cf)
        rows.append({
            "period": first + pd.offsets.MonthEnd(i),
            "interest": float(interest),
            "principal_cf": float(principal_cf),
            "cash_flow": float(pmt),
            "balance": float(bal),
        })
    return pd.DataFrame(rows)
46
 
47
def liquidity_gap(cf: pd.DataFrame) -> pd.DataFrame:
    """Aggregate cash flows into monthly buckets and compute the liquidity gap.

    Args:
        cf: Frame with columns ``period``, ``product`` and ``cash_flow``.
            The input is copied, not mutated.

    Returns:
        One row per month with ``bucket`` (``YYYY-MM`` string), one summed
        cash-flow column per product, ``gap`` (the ``assets`` column minus the
        ``fd`` column, each taken as 0 when absent) and ``cumulative_gap``
        (running sum of ``gap``).
    """
    cf = cf.copy()
    cf["bucket"] = pd.PeriodIndex(cf["period"], freq="M").astype(str)
    piv = cf.pivot_table(
        index="bucket",
        columns="product",
        values="cash_flow",
        aggfunc="sum",
        fill_value=0.0,
    )
    # assets - liabilities (FD); only these two product labels enter the gap.
    piv["gap"] = piv.get("assets", 0.0) - piv.get("fd", 0.0)
    piv["cumulative_gap"] = piv["gap"].cumsum()
    return piv.reset_index()
 
 
 
 
 
 
 
 
 
 
59
 
60
+ # -------------------------
61
+ # Adapter + main builder
62
+ # -------------------------
63
# Columns an input frame must carry to be treated as a masterdataset frame.
REQUIRED_MASTER_COLS = {"Portfolio_value", "Interest_rate", "months", "product"}
64
 
65
def _adapt_masterdataset(df: pd.DataFrame) -> pd.DataFrame:
    """Expand masterdataset rows into one monthly cash-flow projection per row.

    Accepts rows from ``main.masterdataset_v``. Rows whose ``product`` is
    ``"fd"`` (case-insensitive, stripped) go through ``project_deposit``; every
    other product goes through ``project_asset``.

    Args:
        df: Frame containing at least ``REQUIRED_MASTER_COLS``. When ``months``
            is missing for a row, ``days_to_maturity // 30`` (minimum 1) is
            used instead.

    Returns:
        The concatenated projections, each carrying ``product``,
        ``contract_number``, ``segments``, ``currency``, ``board_currency``,
        ``Portfolio_value`` and ``Interest_rate`` from its source row. An empty
        frame with those columns when ``df`` has no rows.

    Raises:
        ValueError: If a required column is missing.
    """
    df = df.copy()
    # Raise explicitly: an ``assert`` would be stripped under ``python -O``.
    if not REQUIRED_MASTER_COLS.issubset(df.columns):
        raise ValueError(
            f"DataFrame must include {REQUIRED_MASTER_COLS}, got {set(df.columns)}"
        )

    all_cf = []
    for _, r in df.iterrows():
        principal = float(r["Portfolio_value"])
        rate = float(r["Interest_rate"])
        if pd.notna(r["months"]):
            months = int(r["months"])
        else:
            days = r.get("days_to_maturity", 0)
            # NaN is truthy and int(NaN) raises, so treat it as missing too.
            if days is None or pd.isna(days):
                days = 0
            months = max(int((days or 0) // 30), 1)
        prod = str(r["product"]).lower().strip()

        if prod == "fd":
            cf = project_deposit(principal, rate, months)
        else:
            cf = project_asset(principal, rate, months)

        # Carry source-row attributes onto every projected period.
        cf["product"] = prod
        cf["contract_number"] = r.get("contract_number", None)
        cf["segments"] = r.get("segments", None)
        cf["currency"] = r.get("currency", None)
        cf["board_currency"] = r.get("board_currency", None)
        cf["Portfolio_value"] = principal
        cf["Interest_rate"] = rate
        all_cf.append(cf)

    if not all_cf:
        return pd.DataFrame(
            columns=["period", "interest", "principal_cf", "cash_flow", "product",
                     "contract_number", "segments", "currency", "board_currency",
                     "Portfolio_value", "Interest_rate"]
        )
    return pd.concat(all_cf, ignore_index=True)
101
+
102
def build_timeseries(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build projected cash flows and their liquidity gap from an input frame.

    Noted in the original as the main entrypoint used by ``app.py``.

    Behaviour by input schema:
        - Contains ``REQUIRED_MASTER_COLS``: cash flows are generated with
          ``_adapt_masterdataset``.
        - Otherwise contains ``period``, ``product`` and ``cash_flow``: treated
          as an already-projected frame and copied through.
        - Anything else: unsupported; two empty frames are returned.

    Returns:
        ``(cashflows_df, gap_df)``, where ``gap_df`` comes from
        ``liquidity_gap`` applied to the ``period``/``product``/``cash_flow``
        columns of ``cashflows_df``.
    """
    if REQUIRED_MASTER_COLS.issubset(df.columns):
        cf = _adapt_masterdataset(df)
    elif {"period", "product", "cash_flow"}.issubset(df.columns):
        # Already a projected cash-flow frame: pass it through untouched.
        cf = df.copy()
    else:
        # Unsupported schema.
        return pd.DataFrame(), pd.DataFrame()

    gap = liquidity_gap(cf[["period", "product", "cash_flow"]])
    return cf, gap