Julian Bilcke
committed on
Commit
·
4905a7d
1
Parent(s):
a73397c
fix
Browse files
- app.py +15 -5
- training_log_parser.py +1 -0
- training_service.py +7 -6
app.py
CHANGED
|
@@ -63,6 +63,8 @@ class VideoTrainerUI:
|
|
| 63 |
"""Update UI components based on training state"""
|
| 64 |
updates = {}
|
| 65 |
|
|
|
|
|
|
|
| 66 |
# Update status box with high-level information
|
| 67 |
status_text = []
|
| 68 |
if training_state["status"] != "idle":
|
|
@@ -258,10 +260,13 @@ class VideoTrainerUI:
|
|
| 258 |
|
| 259 |
def update_training_buttons(self, training_state: Dict[str, Any]) -> Dict:
|
| 260 |
"""Update training control buttons based on state"""
|
|
|
|
| 261 |
is_training = training_state["status"] in ["training", "initializing"]
|
|
|
|
|
|
|
| 262 |
is_paused = training_state["status"] == "paused"
|
| 263 |
is_completed = training_state["status"] in ["completed", "error", "stopped"]
|
| 264 |
-
|
| 265 |
return {
|
| 266 |
"start_btn": gr.Button(
|
| 267 |
interactive=not is_training and not is_paused,
|
|
@@ -289,8 +294,10 @@ class VideoTrainerUI:
|
|
| 289 |
})
|
| 290 |
|
| 291 |
def handle_pause_resume(self):
|
|
|
|
| 292 |
status = self.trainer.get_status()
|
| 293 |
-
|
|
|
|
| 294 |
result = self.trainer.resume_training()
|
| 295 |
new_state = {"status": "training"}
|
| 296 |
else:
|
|
@@ -623,6 +630,8 @@ class VideoTrainerUI:
|
|
| 623 |
|
| 624 |
status_update = status["message"]
|
| 625 |
|
|
|
|
|
|
|
| 626 |
# Parse new log lines
|
| 627 |
if logs:
|
| 628 |
last_state = None
|
|
@@ -630,6 +639,7 @@ class VideoTrainerUI:
|
|
| 630 |
state_update = self.log_parser.parse_line(line)
|
| 631 |
if state_update:
|
| 632 |
last_state = state_update
|
|
|
|
| 633 |
|
| 634 |
if last_state:
|
| 635 |
ui_updates = self.update_training_ui(last_state)
|
|
@@ -648,6 +658,8 @@ class VideoTrainerUI:
|
|
| 648 |
"message": status
|
| 649 |
}
|
| 650 |
|
|
|
|
|
|
|
| 651 |
if is_completed:
|
| 652 |
button_updates = self.handle_training_complete()
|
| 653 |
return (
|
|
@@ -1129,9 +1141,7 @@ class VideoTrainerUI:
|
|
| 1129 |
],
|
| 1130 |
outputs=[status_box, log_box]
|
| 1131 |
).success(
|
| 1132 |
-
fn=lambda: self.update_training_buttons(
|
| 1133 |
-
"status": "training"
|
| 1134 |
-
}),
|
| 1135 |
outputs=[start_btn, stop_btn, pause_resume_btn]
|
| 1136 |
)
|
| 1137 |
|
|
|
|
| 63 |
"""Update UI components based on training state"""
|
| 64 |
updates = {}
|
| 65 |
|
| 66 |
+
print("update_training_ui: training_state = ", training_state)
|
| 67 |
+
|
| 68 |
# Update status box with high-level information
|
| 69 |
status_text = []
|
| 70 |
if training_state["status"] != "idle":
|
|
|
|
| 260 |
|
| 261 |
def update_training_buttons(self, training_state: Dict[str, Any]) -> Dict:
|
| 262 |
"""Update training control buttons based on state"""
|
| 263 |
+
#print("update_training_buttons: training_state = ", training_state)
|
| 264 |
is_training = training_state["status"] in ["training", "initializing"]
|
| 265 |
+
if training_state["message"] == "No training in progress":
|
| 266 |
+
is_training = False
|
| 267 |
is_paused = training_state["status"] == "paused"
|
| 268 |
is_completed = training_state["status"] in ["completed", "error", "stopped"]
|
| 269 |
+
#print(f"update_training_buttons: is_training = {is_training}, is_paused = {is_paused}, is_completed = {is_completed}")
|
| 270 |
return {
|
| 271 |
"start_btn": gr.Button(
|
| 272 |
interactive=not is_training and not is_paused,
|
|
|
|
| 294 |
})
|
| 295 |
|
| 296 |
def handle_pause_resume(self):
|
| 297 |
+
|
| 298 |
status = self.trainer.get_status()
|
| 299 |
+
print("handle_pause_resume: status = ", status)
|
| 300 |
+
if status["status"] == "paused":
|
| 301 |
result = self.trainer.resume_training()
|
| 302 |
new_state = {"status": "training"}
|
| 303 |
else:
|
|
|
|
| 630 |
|
| 631 |
status_update = status["message"]
|
| 632 |
|
| 633 |
+
# print(f"refresh_training_status_and_logs: ", status)
|
| 634 |
+
|
| 635 |
# Parse new log lines
|
| 636 |
if logs:
|
| 637 |
last_state = None
|
|
|
|
| 639 |
state_update = self.log_parser.parse_line(line)
|
| 640 |
if state_update:
|
| 641 |
last_state = state_update
|
| 642 |
+
print("last_state = ", last_state)
|
| 643 |
|
| 644 |
if last_state:
|
| 645 |
ui_updates = self.update_training_ui(last_state)
|
|
|
|
| 658 |
"message": status
|
| 659 |
}
|
| 660 |
|
| 661 |
+
#print("refresh_training_status: current_state = ", current_state)
|
| 662 |
+
|
| 663 |
if is_completed:
|
| 664 |
button_updates = self.handle_training_complete()
|
| 665 |
return (
|
|
|
|
| 1141 |
],
|
| 1142 |
outputs=[status_box, log_box]
|
| 1143 |
).success(
|
| 1144 |
+
fn=lambda: self.update_training_buttons(),
|
|
|
|
|
|
|
| 1145 |
outputs=[start_btn, stop_btn, pause_resume_btn]
|
| 1146 |
)
|
| 1147 |
|
training_log_parser.py
CHANGED
|
@@ -73,6 +73,7 @@ class TrainingLogParser:
|
|
| 73 |
if "Training steps:" in line:
|
| 74 |
# Set status to training if we see this
|
| 75 |
self.state.status = "training"
|
|
|
|
| 76 |
if not self.state.start_time:
|
| 77 |
self.state.start_time = datetime.now()
|
| 78 |
|
|
|
|
| 73 |
if "Training steps:" in line:
|
| 74 |
# Set status to training if we see this
|
| 75 |
self.state.status = "training"
|
| 76 |
+
print("setting status to 'training'")
|
| 77 |
if not self.state.start_time:
|
| 78 |
self.state.start_time = datetime.now()
|
| 79 |
|
training_service.py
CHANGED
|
@@ -29,7 +29,7 @@ logging.basicConfig(
|
|
| 29 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 30 |
handlers=[
|
| 31 |
logging.StreamHandler(sys.stdout),
|
| 32 |
-
logging.FileHandler('training_service.log')
|
| 33 |
]
|
| 34 |
)
|
| 35 |
logger = logging.getLogger(__name__)
|
|
@@ -65,7 +65,7 @@ class TrainingService:
|
|
| 65 |
|
| 66 |
def get_status(self) -> Dict:
|
| 67 |
"""Get current training status"""
|
| 68 |
-
default_status = {'
|
| 69 |
|
| 70 |
if not self.status_file.exists():
|
| 71 |
return default_status
|
|
@@ -73,6 +73,7 @@ class TrainingService:
|
|
| 73 |
try:
|
| 74 |
with open(self.status_file, 'r') as f:
|
| 75 |
status = json.load(f)
|
|
|
|
| 76 |
|
| 77 |
# Check if process is actually running
|
| 78 |
if self.pid_file.exists():
|
|
@@ -80,12 +81,12 @@ class TrainingService:
|
|
| 80 |
pid = int(f.read().strip())
|
| 81 |
if not psutil.pid_exists(pid):
|
| 82 |
# Process died unexpectedly
|
| 83 |
-
if status['
|
| 84 |
-
status['
|
| 85 |
status['message'] = 'Training process terminated unexpectedly'
|
| 86 |
self.append_log("Training process terminated unexpectedly")
|
| 87 |
else:
|
| 88 |
-
status['
|
| 89 |
status['message'] = 'Training process not found'
|
| 90 |
return status
|
| 91 |
|
|
@@ -432,7 +433,7 @@ class TrainingService:
|
|
| 432 |
def save_status(self, state: str, **kwargs) -> None:
|
| 433 |
"""Save current training status"""
|
| 434 |
status = {
|
| 435 |
-
'
|
| 436 |
'timestamp': datetime.now().isoformat(),
|
| 437 |
**kwargs
|
| 438 |
}
|
|
|
|
| 29 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 30 |
handlers=[
|
| 31 |
logging.StreamHandler(sys.stdout),
|
| 32 |
+
logging.FileHandler(str(OUTPUT_PATH / 'training_service.log'))
|
| 33 |
]
|
| 34 |
)
|
| 35 |
logger = logging.getLogger(__name__)
|
|
|
|
| 65 |
|
| 66 |
def get_status(self) -> Dict:
|
| 67 |
"""Get current training status"""
|
| 68 |
+
default_status = {'status': 'stopped', 'message': 'No training in progress'}
|
| 69 |
|
| 70 |
if not self.status_file.exists():
|
| 71 |
return default_status
|
|
|
|
| 73 |
try:
|
| 74 |
with open(self.status_file, 'r') as f:
|
| 75 |
status = json.load(f)
|
| 76 |
+
print("status found in the json:", status)
|
| 77 |
|
| 78 |
# Check if process is actually running
|
| 79 |
if self.pid_file.exists():
|
|
|
|
| 81 |
pid = int(f.read().strip())
|
| 82 |
if not psutil.pid_exists(pid):
|
| 83 |
# Process died unexpectedly
|
| 84 |
+
if status['status'] == 'running':
|
| 85 |
+
status['status'] = 'error'
|
| 86 |
status['message'] = 'Training process terminated unexpectedly'
|
| 87 |
self.append_log("Training process terminated unexpectedly")
|
| 88 |
else:
|
| 89 |
+
status['status'] = 'stopped'
|
| 90 |
status['message'] = 'Training process not found'
|
| 91 |
return status
|
| 92 |
|
|
|
|
| 433 |
def save_status(self, state: str, **kwargs) -> None:
|
| 434 |
"""Save current training status"""
|
| 435 |
status = {
|
| 436 |
+
'status': state,
|
| 437 |
'timestamp': datetime.now().isoformat(),
|
| 438 |
**kwargs
|
| 439 |
}
|