Try avoid hf hub git rate limits
- config.gin +2 -3
- config.json +1 -1
- small_nl24_pretrain.gin +2 -3
- start_train.sh +2 -1
- tasks.py +70 -37
- train.py +0 -689
- train/events.out.tfevents.1649073594.t1v-n-304587cf-w-0.1316481.0.v2 +0 -3
- train/events.out.tfevents.1649092520.t1v-n-304587cf-w-0.1399566.0.v2 +0 -3
- train/{events.out.tfevents.1649056216.t1v-n-304587cf-w-0.1239745.0.v2 → events.out.tfevents.1649705066.t1v-n-304587cf-w-0.2549834.0.v2} +2 -2
- training_eval/pretrain_finnish/events.out.tfevents.1649073594.t1v-n-304587cf-w-0.1316481.1.v2 +0 -3
- training_eval/pretrain_finnish/events.out.tfevents.1649092520.t1v-n-304587cf-w-0.1399566.1.v2 +0 -3
- training_eval/pretrain_finnish/{events.out.tfevents.1649056216.t1v-n-304587cf-w-0.1239745.1.v2 → events.out.tfevents.1649705066.t1v-n-304587cf-w-0.2549834.1.v2} +2 -2
config.gin
CHANGED

@@ -12,7 +12,7 @@ import tasks
 
 # Macros:
 # ==============================================================================
-BATCH_SIZE =
+BATCH_SIZE = 256
 DROPOUT_RATE = 0.0
 LABEL_SMOOTHING = 0.0
 LOSS_NORMALIZING_FACTOR = None
@@ -23,7 +23,7 @@ MODEL_DIR = '/researchdisk/t5x-small-nl24-finnish'
 OPTIMIZER = @adafactor.Adafactor()
 RANDOM_SEED = None
 SHUFFLE_TRAIN_EXAMPLES = True
-TASK_FEATURE_LENGTHS = {'inputs': 512, 'targets':
+TASK_FEATURE_LENGTHS = {'inputs': 512, 'targets': 512}
 TRAIN_STEPS = 500000
 USE_CACHED_TASKS = False
 USE_HARDWARE_RNG = False
@@ -123,7 +123,6 @@ network.T5Config.vocab_size = 32128
 train_script.train.checkpoint_cfg = @utils.CheckpointConfig()
 train_script.train.eval_period = 10000
 train_script.train.eval_steps = 20
-train_script.train.hub_model_id = 'Finnish-NLP/t5x-small-nl24-finnish'
 train_script.train.infer_eval_dataset_cfg = None
 train_script.train.model = %MODEL
 train_script.train.model_dir = %MODEL_DIR
config.json
CHANGED

@@ -7,7 +7,7 @@
   "d_kv": 64,
   "d_model": 512,
   "decoder_start_token_id": 0,
-  "dropout_rate": 0.
+  "dropout_rate": 0.1,
   "eos_token_id": 1,
   "feed_forward_proj": "gated-gelu",
   "initializer_factor": 1.0,
small_nl24_pretrain.gin
CHANGED

@@ -11,7 +11,6 @@ include 't5x/configs/runs/pretrain.gin'
 # ------------------- Training specification overrides --------------------------
 train_script.train:
   eval_period = 10000
-  hub_model_id = "Finnish-NLP/t5x-small-nl24-finnish"
 
 utils.SaveCheckpointConfig:
   period = 10000
@@ -19,7 +18,7 @@ utils.SaveCheckpointConfig:
 
 MIXTURE_OR_TASK_NAME = "pretrain_finnish"
 USE_CACHED_TASKS = False
-TASK_FEATURE_LENGTHS = {"inputs": 512, "targets":
+TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 512}
 TRAIN_STEPS = 500000
 DROPOUT_RATE = 0.0
-BATCH_SIZE =
+BATCH_SIZE = 256
start_train.sh
CHANGED

@@ -2,10 +2,11 @@
 unset LD_PRELOAD
 
 PROJECT_DIR="/researchdisk/t5x-small-nl24-finnish"
+T5X_DIR=${HOME}"/t5x"  # directory where the t5x is cloned.
 MODEL_DIR="/researchdisk/t5x-small-nl24-finnish"
 export PYTHONPATH=${PROJECT_DIR}
 
-python3 train.py \
+python3 ${T5X_DIR}/t5x/train.py \
   --gin_search_paths=${PROJECT_DIR} \
   --gin_file="small_nl24_pretrain.gin" \
   --gin.MODEL_DIR=\"${MODEL_DIR}\"
tasks.py
CHANGED

@@ -1,49 +1,82 @@
+# adapted from https://huggingface.co/pere/pk-nb-t5x/blob/main/tasks.py
+
 import functools
+
 import seqio
+import tensorflow as tf
+import t5.data
+from datasets import load_dataset, load_from_disk
+from t5.data import postprocessors
 from t5.data import preprocessors
+from t5.evaluation import metrics
+from seqio import FunctionDataSource, utils
 
+TaskRegistry = seqio.TaskRegistry
+
+vocabulary = seqio.SentencePieceVocabulary('spiece.model', extra_ids=0)
+
+DEFAULT_OUTPUT_FEATURES = {
+    "inputs": seqio.Feature(
+        vocabulary=vocabulary, add_eos=True,
+        required=False),
+    "targets": seqio.Feature(
+        vocabulary=vocabulary, add_eos=True)
 }
 
+
+def gen_dataset(split, shuffle=False, seed=None, column="text", dataset=None):
+    if shuffle:
+        if seed:
+            dataset = dataset.shuffle(seed=seed)
+        else:
+            dataset = dataset.shuffle()
+    while True:
+        for item in dataset[str(split)]:
+            yield item[column]
+
+
+def dataset_fn(split, shuffle_files, seed=None, dataset=None):
+    return tf.data.Dataset.from_generator(
+        functools.partial(gen_dataset, split, shuffle_files, seed, dataset=dataset),
+        output_signature=tf.TensorSpec(shape=(), dtype=tf.string, name=dataset_name)
+    )
+
+
+@utils.map_over_dataset
+def target_to_key(x, key_map, target_key):
+    """Assign the value from the dataset to target_key in key_map"""
+    return {**key_map, target_key: x}
+
+
+# Final pretraining task used in Raffel et al., 2019 adaptated to NCC
+dataset_name = "/researchdisk/lm_training_dataset_full"
+dataset_params = {"from_disk_path": dataset_name}
+
+if "from_disk_path" in dataset_params:
+    dataset = load_from_disk(dataset_params.get("from_disk_path"))
+else:
+    dataset = load_dataset(**dataset_params)
+
+dataset_shapes = {"train": dataset["train"].num_rows, "validation": dataset["validation"].num_rows}
+TaskRegistry.add(
+    "pretrain_finnish",
+    source=seqio.FunctionDataSource(
+        dataset_fn=functools.partial(dataset_fn, dataset=dataset),
+        splits=("train", "validation"),
+        caching_permitted=False,
+        num_input_examples=dataset_shapes,
+    ),
     preprocessors=[
         functools.partial(
-            field_names=["text"],
-            field_delim="\n"),
-        functools.partial(
-            preprocessors.rekey, key_map={
+            target_to_key, key_map={
                 "inputs": None,
-                "targets":
-            }),
+                "targets": None,
+            }, target_key="targets"),
         seqio.preprocessors.tokenize,
-        seqio.CacheDatasetPlaceholder(),
-        preprocessors.span_corruption,
+        # seqio.CacheDatasetPlaceholder(),
+        preprocessors.span_corruption,
         seqio.preprocessors.append_eos_after_trim,
     ],
-
-# dataset = seqio.get_mixture_or_task("pretrain_finnish").get_dataset(
-#     sequence_length={"inputs": 512, "targets": 114},
-#     split="train",
-#     shuffle=True,
-#     num_epochs=1,
-#     #shard_info=seqio.ShardInfo(index=0, num_shards=10),
-#     use_cached=False,
-#     seed=42
-# )
-
-# # Print the first 5 examples.
-# for _, ex in zip(range(5), dataset.as_numpy_iterator()):
-#     print(ex)
+    output_features={"targets": DEFAULT_OUTPUT_FEATURES["targets"]},
+    metric_fns=[metrics.accuracy]
+)
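For reference, the debugging snippet dropped from the end of tasks.py is still handy for sanity-checking the new task definition. A minimal sketch, assuming it is run from the repository root (so that spiece.model and /researchdisk/lm_training_dataset_full resolve) and using the new 512/512 feature lengths in place of the old targets value:

import seqio
import tasks  # noqa: F401 -- importing registers the "pretrain_finnish" task

# Build a small shuffled sample of the span-corruption pretraining data.
dataset = seqio.get_mixture_or_task("pretrain_finnish").get_dataset(
    sequence_length={"inputs": 512, "targets": 512},
    split="train",
    shuffle=True,
    num_epochs=1,
    use_cached=False,
    seed=42,
)

# Print the first 5 examples to verify tokenization and span corruption.
for _, ex in zip(range(5), dataset.as_numpy_iterator()):
    print(ex)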
train.py
DELETED

@@ -1,689 +0,0 @@
-# Copyright 2022 The T5X Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-r"""Script to pretrain or finetune in JAX using a SeqIO pipeline.
-
-"""
-import functools
-import itertools
-import math
-import os
-import time
-from typing import Callable, Iterator, Sequence, Mapping, Tuple, Type, Optional
-import subprocess
-
-# Set Linen to add profiling information when constructing Modules.
-# Must be set before flax imports.
-# pylint:disable=g-import-not-at-top
-os.environ['FLAX_PROFILE'] = 'true'
-# TODO(adarob): Re-enable once users are notified and tests are updated.
-os.environ['FLAX_LAZY_RNG'] = 'no'
-from absl import logging
-from clu import metric_writers
-import jax
-from jax import random
-from jax.experimental import multihost_utils
-import jax.numpy as jnp
-import numpy as np
-import seqio
-from t5x import models
-from t5x import partitioning
-from t5x import train_state as train_state_lib
-from t5x import trainer as trainer_lib
-from t5x import utils
-from t5x import checkpoint_importer
-LazyArray = checkpoint_importer.LazyArray
-import tensorflow as tf
-
-
-# Automatically search for gin files relative to the T5X package.
-_DEFAULT_GIN_SEARCH_PATHS = [
-    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-]
-PyTreeDef = type(jax.tree_structure(None))
-P = partitioning.PartitionSpec
-# Special key that used to distinguish train metrics.
-TRAIN_METRIC_KEY = 'train'
-# String keys that is acceptable from config.
-_ACTION_KEYS = frozenset(trainer_lib.ActionMode.__members__.keys())
-
-
-def run_actions(
-    mode: trainer_lib.ActionMode, actions: trainer_lib.ActionMapType,
-    train_state: train_state_lib.TrainState,
-    metrics_by_task: Mapping[str, trainer_lib.MetricValueMapType]) -> bool:
-  """Invokes all actions on the given mode on host 0, then broadcasts to all.
-
-  Args:
-    mode: The mode to run the actions. e.g., if mode is `train`, only actions
-      configured to run with `train` mode will be invoked.
-    actions: A mapping of actions that runs after train, eval or infer_eval, to
-      inspect the model and perform useful operations, e.g., early stopping.
-    train_state: The current train_state of the trainer.
-    metrics_by_task: A map of metrics keyed by task name.
-
-  Returns:
-    A bool indicating whether training should be halted.
-
-  Raises:
-    RuntimeError: When the metrics processed on host 0 is None.
-  """
-  stop_training = False
-  if jax.process_index() == 0:
-    if not metrics_by_task:
-      raise RuntimeError('Metric is unexpectedly empty on process 0')
-    for action in actions.get(mode, []):
-      stop_training |= action.run(train_state, metrics_by_task=metrics_by_task)
-  # Broadcast result from host 0 to others.
-  return bool(multihost_utils.broadcast_one_to_all(jnp.array(stop_training)))
-
-
-def train(
-    *,
-    model: models.BaseTransformerModel,
-    train_dataset_cfg: utils.DatasetConfig,
-    train_eval_dataset_cfg: Optional[utils.DatasetConfig],
-    infer_eval_dataset_cfg: Optional[utils.DatasetConfig],
-    checkpoint_cfg: utils.CheckpointConfig,
-    partitioner: partitioning.BasePartitioner,
-    trainer_cls: Type[trainer_lib.BaseTrainer],
-    model_dir: str,
-    total_steps: int,
-    eval_steps: int,
-    eval_period: int,
-    stats_period: Optional[int] = None,
-    random_seed: Optional[int],
-    use_hardware_rng: bool = False,
-    summarize_config_fn: Callable[[str, metric_writers.MetricWriter, int],
-                                  None],
-    inference_evaluator_cls: Type[seqio.Evaluator] = seqio.Evaluator,
-    get_dataset_fn: utils.GetDatasetCallable = utils.get_dataset,
-    concurrent_metrics: bool = True,
-    actions: Optional[Mapping[str, Sequence[trainer_lib.BaseAction]]] = None,
-    train_eval_get_dataset_fn: Optional[utils.GetDatasetCallable] = None,
-    run_eval_before_training: bool = False,
-    hub_model_id: str = None,
-) -> Tuple[int, train_state_lib.TrainState]:
-  """Train function.
-
-  Args:
-    model: The model object to use for training.
-    train_dataset_cfg: Specification for the dataset to train with.
-    train_eval_dataset_cfg: Specification for the dataset to evaluate with using
-      the train metrics and no inference (e.g., uses teacher forcing). If None,
-      train eval is disabled.
-    infer_eval_dataset_cfg: Specification for the dataset to evaluate with using
-      the inference metrics (e.g., uses sampled decoding). If None, inference
-      eval is disabled.
-    checkpoint_cfg: Specification for saving and restoring model parameters and
-      dataset state to/from checkpoints.
-    partitioner: Partitioner for model parameters and data across devices.
-    trainer_cls: An implementation of BaseTrainer.
-    model_dir: Path of directory to store checkpoints and metric summaries.
-    total_steps: The step number to stop training after. The number of actual
-      steps trained in this run will be this number minus the starting step from
-      the checkpoint.
-    eval_steps: The number of batches to process for each train-eval loop.
-    eval_period: The number of train steps between each evaluation (both
-      train-eval and infer-eval).
-    stats_period: The number of train steps between writing scalar stats. If
-      None, defaults to eval_period.
-    random_seed: A random seed to use for dropout and initialization. If None, a
-      fast, non-deterministic hardware-based RNG is used.
-    use_hardware_rng: Whether to force using the RngBitGenerator based hardware
-      rng, which takes seeds and acts similarly to software PRNG in that it
-      should be seed-deterministic. The new RngBitGenerator custom PRNG system
-      should be reproducible for a given sharding, but the numbers will change
-      for different shardings of the same model.
-    summarize_config_fn: A function that takes in the model directory, a
-      SummaryWriter, and the step number, and writes a summary of the
-    inference_evaluator_cls: seqio.Evaluator class to use for inference
-      evaluation, potentially with bound configuration args.
-    get_dataset_fn: The callable use to get the train and train-eval datasets
-      based on the DatasetConfig and shard information.
-    concurrent_metrics: If True, allow metrics computation and logging to
-      overlap with training. Will likely result in additional TPU memory usage.
-    actions: A mapping of actions that runs after train, eval or infer_eval, to
-      inspect the model and perform useful operations, e.g., early stopping. The
-      key must have a 1:1 mapping to ActionMode enum. For EVAL actions to
-      actually work, this requires `concurrent_metrics` to be turned off,
-      since chaining futures and mutating states concurrently might be
-      error-prone.
-    train_eval_get_dataset_fn: Optional callable use to get the train-eval
-      datasets based on the DatasetConfig and shard information. If missing, it
-      defaults to `get_dataset_fn`.
-    run_eval_before_training: If True, calculate training eval and inference
-      eval metrics before training begins.
-
-  Returns:
-    The tuple of (last_step, last_train_state).
-  """
-  logging.info('Process ID: %d', jax.process_index())
-  tf.io.gfile.makedirs(model_dir)
-
-  # Each "epoch" of the training loop should be the min of the eval period,
-  # checkpoint period or the full training.
-  # We compute here to ensure that the eval period and checkpoint period are
-  # divisible by this number, otherwise we fail.
-  eval_enabled = (train_eval_dataset_cfg or infer_eval_dataset_cfg)
-  eval_period = eval_period if eval_enabled else 0
-  checkpoint_period = checkpoint_cfg.save.period if checkpoint_cfg.save else 0
-  if eval_period or checkpoint_period:
-    steps_per_epoch = min(eval_period or np.inf, checkpoint_period or np.inf)
-  else:
-    steps_per_epoch = total_steps
-  stats_period = stats_period or steps_per_epoch
-  if (eval_period and eval_period % steps_per_epoch or
-      checkpoint_period and checkpoint_period % steps_per_epoch):
-    raise ValueError(
-        f'Checkpoint period ({checkpoint_period}) must evenly divide eval '
-        f'period ({eval_period}), or vice-versa.')
-
-  if use_hardware_rng or random_seed is None:
-    logging.info(
-        'Using fast RngBitGenerator PRNG for initialization and dropout.')
-
-    if random_seed is None:
-      random_seed = multihost_utils.broadcast_one_to_all(np.int32(time.time()))
-      logging.info('Random seed not provided, using RNG seed %s', random_seed)
-    else:
-      logging.warning(
-          'When using hardware RNG with a fixed seed, repeatability is only '
-          'guaranteed for fixed hardware and partitioning schemes and for a '
-          'fixed version of this code and its dependencies.')
-    utils.set_hardware_rng_ops()
-    rng = random.PRNGKey(random_seed)
-  else:
-    logging.info('Using seed for initialization and dropout RNG: %d',
-                 random_seed)
-    rng = random.PRNGKey(random_seed)
-
-  init_rng, trainer_rng = random.split(rng, 2)
-
-  # ---------------------------------------------------------------------------
-  # Initialize datasets
-  # ---------------------------------------------------------------------------
-
-  if (train_dataset_cfg.seed and
-      not (checkpoint_cfg.save or checkpoint_cfg.save.save_dataset)):
-    logging.warning(
-        'Providing a random seed for the train dataset with '
-        '`checkpoint_train_ds=False` is dangerous since each '
-        'preemption/restart will cause the dataset to deterministically replay '
-        'from the beginning.')
-
-  data_layout = partitioner.get_data_layout(train_dataset_cfg.batch_size)
-  ds_shard_id = data_layout.shard_id
-  num_ds_shards = data_layout.num_shards
-
-  def _verify_matching_vocabs(cfg: utils.DatasetConfig):
-    ds_vocabs = utils.get_vocabulary(cfg)
-    if (ds_vocabs[0] != model.input_vocabulary or
-        ds_vocabs[1] != model.output_vocabulary):
-      raise ValueError(f'Model and Task vocabularies do not match:\n'
-                       f'  task={cfg.mixture_or_task_name}\n'
-                       f'  ds_vocabs=({ds_vocabs[0]}, {ds_vocabs[1]})\n'
-                       f'  model.input_vocabulary={model.input_vocabulary}\n'
-                       f'  model.output_vocabulary={model.output_vocabulary}\n')
-
-  _verify_matching_vocabs(train_dataset_cfg)
-
-  train_ds = get_dataset_fn(train_dataset_cfg, ds_shard_id, num_ds_shards,
-                            model.FEATURE_CONVERTER_CLS)
-
-  if train_eval_dataset_cfg:
-    _verify_matching_vocabs(train_eval_dataset_cfg)
-    train_eval_datasets = utils.get_training_eval_datasets(
-        train_eval_dataset_cfg,
-        ds_shard_id,
-        num_ds_shards,
-        eval_steps,
-        model.FEATURE_CONVERTER_CLS,
-        get_dataset_fn=train_eval_get_dataset_fn if train_eval_get_dataset_fn
-        is not None else get_dataset_fn)  # type: Mapping[str, tf.data.Dataset]
-    if not train_eval_datasets:
-      logging.warning(
-          'No train_eval datasets loaded from config `train_eval_dataset_cfg`: '
-          '%s', train_eval_dataset_cfg)
-  else:
-    train_eval_datasets = {}
-
-  # Initialize optimizer, maybe from an existing checkpoint.
-  checkpointable_train_iter: tf.data.Iterator = iter(train_ds)  # pytype:disable=annotation-type-mismatch
-  train_iter: Iterator[trainer_lib.BatchType] = map(
-      lambda x: jax.tree_map(np.array, x), checkpointable_train_iter)
-
-  # The manner in which parameters are initialized follows this order of
-  # preference:
-  #  1. From a T5X checkpoint in `model_dir`, if one exists.
-  #  2. From a T5X or TF checkpoint specified by `cfg.path`, if set.
-  #  3. From scratch using `init_fn`.
-
-  # 1. From a T5X checkpoint in `model_dir`, if one exists.
-  if checkpoint_cfg.restore is not None:
-    state_transforms_for_restore = [
-        functools.partial(fn, is_resuming=True)
-        for fn in checkpoint_cfg.restore.state_transformation_fns
-    ]
-  else:
-    state_transforms_for_restore = []
-  restore_cfgs = [
-      utils.RestoreCheckpointConfig(
-          path=model_dir,
-          mode='latest',
-          dtype=checkpoint_cfg.save.dtype,
-          checkpointer_cls=checkpoint_cfg.save.checkpointer_cls,
-          # Restore dataset state if it is being saved.
-          restore_dataset=(checkpoint_cfg.save and
-                           checkpoint_cfg.save.save_dataset),
-          state_transformation_fns=state_transforms_for_restore)
-  ]
-  # 2. From a checkpoint specified by `checkpoint_cfg.restore.path`, if set.
-  if checkpoint_cfg.restore:
-    if checkpoint_cfg.restore.mode == 'all':
-      raise ValueError(
-          "Restore checkpoint mode 'all' is not supported in training.")
-
-    # TODO(dhgarrette): Split "restore" behavior into separate configurations
-    # for the initial restoration for a new run, vs resuming a stopped run.
-    if isinstance(checkpoint_cfg.restore.path, str):
-      restore_cfgs.append(checkpoint_cfg.restore)
-    elif not checkpoint_cfg.restore.path:
-      # `path` is an empty (non-`str`) sequence, so there is nothing to restore.
-      pass
-    else:
-      raise ValueError(
-          'Restore checkpoint config may only have a single path in training.')
-
-  # Need to use full batch size.
-  input_shapes = {
-      k: (data_layout.batch_size, *v.shape[1:])
-      for k, v in train_ds.element_spec.items()
-  }
-  input_types = {
-      k: v.dtype.as_numpy_dtype() for k, v in train_ds.element_spec.items()
-  }
-  init_or_restore_tick = time.time()
-  train_state_initializer = utils.TrainStateInitializer(
-      optimizer_def=model.optimizer_def,
-      init_fn=model.get_initial_variables,
-      input_shapes=input_shapes,
-      input_types=input_types,
-      partitioner=partitioner)
-  # 3. From scratch using `init_fn`.
-  train_state = train_state_initializer.from_checkpoint_or_scratch(
-      restore_cfgs, init_rng=init_rng, ds_iter=checkpointable_train_iter)
-  train_state_axes = train_state_initializer.train_state_axes
-  init_or_restore_secs = time.time() - init_or_restore_tick
-  logging.info('Initialize/restore complete (%.2f seconds).',
-               init_or_restore_secs)
-
-  # Log the variable shapes information and write to a file.
-  log_file = os.path.join(model_dir, 'model-info.txt')
-  utils.log_model_info(log_file,
-                       train_state_initializer.global_train_state_shape,
-                       partitioner)
-
-  if checkpoint_period:
-    checkpointer = checkpoint_cfg.save.checkpointer_cls(
-        train_state=train_state_initializer.global_train_state_shape,
-        partitioner=partitioner,
-        checkpoints_dir=model_dir,
-        dataset_iterator=(checkpointable_train_iter
-                          if checkpoint_cfg.save.save_dataset else None),
-        save_dtype=checkpoint_cfg.save.dtype,
-        keep=checkpoint_cfg.save.keep)
-
-
-  # Restore step from last checkpoint or set to 0 if training from scratch.
-  host_step = int(train_state.step)
-
-  # ---------------------------------------------------------------------------
-  # Trainer
-  # ---------------------------------------------------------------------------
-
-  trainer: trainer_lib.BaseTrainer = trainer_cls(
-      model=model,
-      train_state=train_state,
-      partitioner=partitioner,
-      train_state_axes=train_state_axes,
-      eval_names=train_eval_datasets.keys(),
-      summary_dir=model_dir,
-      rng=trainer_rng)
-  del train_state
-
-  train_metrics = trainer.train_metrics_manager
-  summarize_config_fn(model_dir, train_metrics.summary_writer, host_step)
-
-  train_metrics.write_scalar('timing/init_or_restore_seconds',
-                             init_or_restore_secs, host_step)
-
-  # ----------------------------------------------------------------------------
-  # SeqIO (inference-based) evaluation setup
-  # ----------------------------------------------------------------------------
-  # Init evaluator to set up cached datasets
-  evaluator = None
-  if infer_eval_dataset_cfg is not None:
-    _verify_matching_vocabs(infer_eval_dataset_cfg)
-    evaluator = inference_evaluator_cls(
-        log_dir=os.path.join(model_dir, 'inference_eval'),
-        mixture_or_task_name=infer_eval_dataset_cfg.mixture_or_task_name,
-        feature_converter=model.FEATURE_CONVERTER_CLS(pack=False),
-        eval_split=infer_eval_dataset_cfg.split,
-        use_cached=infer_eval_dataset_cfg.use_cached,
-        seed=infer_eval_dataset_cfg.seed,
-        sequence_length=infer_eval_dataset_cfg.task_feature_lengths,
-        use_memory_cache=infer_eval_dataset_cfg.use_memory_cache)
-    if not evaluator.eval_tasks:
-      # Skip evaluaton.
-      evaluator = None
-
-  if evaluator is not None:
-    predict_fn = utils.get_infer_fn(
-        infer_step=model.predict_batch,
-        batch_size=infer_eval_dataset_cfg.batch_size,
-        train_state_axes=train_state_axes,
-        partitioner=partitioner)
-
-    score_fn = utils.get_infer_fn(
-        infer_step=model.score_batch,
-        batch_size=infer_eval_dataset_cfg.batch_size,
-        train_state_axes=train_state_axes,
-        partitioner=partitioner)
-
-  if actions is None:
-    actions = {}
-
-  if set(actions.keys()).difference(_ACTION_KEYS):
-    raise ValueError(f'actions keys must be one of {_ACTION_KEYS}, but got : '
-                     f'{actions.keys()}')
-
-  # Transform the string key into proper ActionMode enum.
-  actions = {trainer_lib.ActionMode[k]: v for k, v in actions.items()}
-
-  if concurrent_metrics and actions.get(trainer_lib.ActionMode.INFER_EVAL,
-                                        None) is not None:
-    logging.warning('Actions for INFER_EVAL will not be triggered when async '
-                    'metrics computation is enabled')
-  if concurrent_metrics and actions.get(trainer_lib.ActionMode.TRAIN,
-                                        None) is not None:
-    logging.warning('Actions for TRAIN will not be triggered when async '
-                    'metrics computation is enabled')
-
-  # ----------------------------------------------------------------------------
-  # Setup Eval Utility Functions
-  # ----------------------------------------------------------------------------
-  def _run_training_eval(first_run: bool = False):
-    if first_run:
-      logging.info('Compiling training eval loop.')
-      trainer.compile_eval({
-          task: utils.get_zeros_batch_like_dataset(ds)
-          for task, ds in train_eval_datasets.items()
-      })
-    logging.info('Computing training evaluation metrics.')
-    eval_batch_iters = {
-        task: ds.as_numpy_iterator()
-        for task, ds in train_eval_datasets.items()
-    }
-    eval_summaries = trainer.eval(eval_batch_iters)
-    trainer.stop_training = run_actions(trainer_lib.ActionMode.TRAIN_EVAL,
-                                        actions, trainer.train_state,
-                                        eval_summaries)
-
-  def _run_inference_eval():
-    """Run prediction based inference eval."""
-    if evaluator is None:
-      return
-    logging.info('Running inference evaluation.')
-    evaluate_tick = time.time()
-    all_metrics, _, _ = evaluator.evaluate(
-        compute_metrics=jax.process_index() == 0,
-        step=host_step,
-        predict_fn=functools.partial(
-            predict_fn,
-            train_state=trainer.train_state,
-            rng=jax.random.PRNGKey(0)),
-        score_fn=functools.partial(score_fn, train_state=trainer.train_state))
-    if not concurrent_metrics:
-      # Ensure metrics are finished being computed.
-      all_metrics_done = all_metrics.result() or {}
-      trainer.stop_training = run_actions(trainer_lib.ActionMode.INFER_EVAL,
-                                          actions, trainer.train_state,
-                                          all_metrics_done)
-    train_metrics.write_scalar('timing/evaluate_seconds',
-                               time.time() - evaluate_tick, host_step)
-
-  # Optionally run teacher-forcing training eval and SeqIO inference-base eval
-  # before training. Useful for testing how much a model knows before any
-  # finetuning.
-  if run_eval_before_training:
-    if train_eval_datasets:
-      logging.info('Running training eval before training.')
-      _run_training_eval(first_run=True)
-    if evaluator is not None:
-      logging.info('Running inference eval before training.')
-      _run_inference_eval()
-
-  # ----------------------------------------------------------------------------
-  # Main training loop
-  # ----------------------------------------------------------------------------
-  logging.info('Starting training loop.')
-
-  first_step = host_step
-
-  if total_steps < first_step:
-    raise ValueError(
-        f'Unexpected total_steps ({total_steps}) < checkpoint step '
-        f' ({first_step}).')
-
-  logging.info('Starting main loop over steps %d-%d', first_step, total_steps)
-
-  steps_per_epoch = min(steps_per_epoch, total_steps)
-  first_epoch = first_step // steps_per_epoch
-  num_epochs = first_epoch + math.ceil(
-      (total_steps - first_step) / steps_per_epoch)
-  logging.info('Training with artificial "epochs" of %d steps.',
-               steps_per_epoch)
-
-  # Kickstart training dataset and compile train loop.
-  logging.info('Kickstarting train dataset prefetch.')
-  logging.flush()
-
-  ds_tick = time.time()
-  # Get first batch to warm up the dataset pipeline.
-  first_batch = next(train_iter)
-  # Prepend first batch back to iterator to be used by trainer.
-  train_iter = itertools.chain([first_batch], train_iter)
-  train_metrics.write_scalar('timing/dataset_warmup_seconds',
-                             time.time() - ds_tick, host_step)
-  logging.info('Compiling train loop.')
-  logging.flush()
-  trainer.compile_train(first_batch)
-
-  # Main Loop over "epochs".
-  for epoch in range(first_epoch, num_epochs):
-    final_epoch = epoch == num_epochs - 1
-    logging.info('Epoch %d of %d', epoch, num_epochs)
-
-    # `stop_training` is requested, break out the main loop immediately.
-    if trainer.stop_training:
-      break
-
-    logging.info('BEGIN Train loop.')
-    try:
-      # Until the last epoch, `num_steps = steps_per_epoch`
-      num_steps = min(total_steps - host_step, steps_per_epoch)
-      epoch_end_step = host_step + num_steps
-      logging.info('Training for %d steps.', num_steps)
-      while host_step < epoch_end_step:
-        if trainer.stop_training:
-          logging.info('Saving a checkpoint before early stopping...')
-          checkpointer.save(trainer.train_state,
-                            checkpoint_cfg.save.state_transformation_fns)
-
-          if hub_model_id:
-            # convert checkpoint to HF Flax model and push to hub
-            checkpoint_step = trainer.train_state.step
-            checkpoint_step = checkpoint_step.get() if isinstance(checkpoint_step, LazyArray) else checkpoint_step
-            checkpoint_step = int(checkpoint_step)  # Integer, to avoid side effects in the checkpoint path.
-            config_path = os.path.join(model_dir, 'config.json')
-            subprocess.run(["python3", "convert_t5x_checkpoint_to_flax.py", f"--t5x_checkpoint_path='checkpoint_{checkpoint_step}'/'", f'--config_name="{config_path}"', "--flax_dump_folder_path='./'"])
-            subprocess.run("git lfs prune --verify-remote", shell=True)
-            subprocess.run("git add .", shell=True)
-            subprocess.run(f'git commit -m "Saving weights and logs of step {checkpoint_step}"', shell=True)
-            subprocess.Popen("git push", shell=True)
-
-          logging.info('Stopping training loop early since `stop_training` is '
-                       'requested.')
-          break
-
-        inner_num_steps = min(epoch_end_step - host_step, stats_period)
-        train_summary = trainer.train(
-            train_iter, inner_num_steps, start_step=host_step)
-        if not concurrent_metrics:
-          # Note that we always pass the dictionary of `tasks` -> summary so
-          # that the actions can be performed without special casing. The only
-          # caveat is that train would need its own special `key` given no
-          # `task` will be applied.
-          trainer.stop_training = run_actions(
-              trainer_lib.ActionMode.TRAIN, actions, trainer.train_state,
-              {TRAIN_METRIC_KEY: train_summary.result()})
-
-        host_step += inner_num_steps
-      logging.info('END Train loop.')
-    except trainer_lib.PreemptionError as e:
-      logging.info('Saving emergency checkpoint.')
-      checkpointer.save(trainer.train_state,
-                        checkpoint_cfg.save.state_transformation_fns)
-      logging.info('Saving emergency checkpoint done.')
-      raise e
-
-    step_offset = host_step - first_step
-
-    is_eval_epoch = eval_period and (final_epoch or
-                                     step_offset % eval_period == 0)
-
-    # Training Evaluation (i.e., with teacher forcing).
-    if is_eval_epoch and train_eval_datasets:
-      # Maybe less if final step < period.
-      first_run = step_offset // eval_period <= 1
-      _run_training_eval(first_run and not run_eval_before_training)
-
-    # Maybe save a checkpoint.
-    if checkpoint_period and (final_epoch or
-                              step_offset % checkpoint_period == 0):
-      # Make sure last train step has completed before starting the clock.
-      train_summary.result()
-      logging.info('Saving checkpoint.')
-      checkpoint_tick = time.time()
-      checkpointer.save(trainer.train_state,
-                        checkpoint_cfg.save.state_transformation_fns)
-      checkpoint_tock = time.time()
-      train_metrics.write_scalar('timing/checkpoint_seconds',
-                                 checkpoint_tock - checkpoint_tick, host_step)
-
-      if hub_model_id:
-        # convert checkpoint to HF Flax model and push to hub
-        checkpoint_step = trainer.train_state.step
-        checkpoint_step = checkpoint_step.get() if isinstance(checkpoint_step, LazyArray) else checkpoint_step
-        checkpoint_step = int(checkpoint_step)  # Integer, to avoid side effects in the checkpoint path.
-        config_path = os.path.join(model_dir, 'config.json')
-        subprocess.run(["python3", "convert_t5x_checkpoint_to_flax.py", f"--t5x_checkpoint_path='checkpoint_{checkpoint_step}'/'", f'--config_name="{config_path}"', "--flax_dump_folder_path='./'"])
-        subprocess.run("git lfs prune --verify-remote", shell=True)
-        subprocess.run("git add .", shell=True)
-        subprocess.run(f'git commit -m "Saving weights and logs of step {checkpoint_step}"', shell=True)
-        subprocess.Popen("git push", shell=True)
-
-    # Inference Evaluation (i.e., with decoding or scoring).
-    if evaluator is not None:
-      _run_inference_eval()
-
-  # Wait until computations are done before exiting
-  logging.info('Finished.')
-  trainer.close()
-  if evaluator:
-    evaluator.close()
-  multihost_utils.sync_global_devices('complete')
-
-  return host_step, trainer.train_state
-
-
-if __name__ == '__main__':
-  # pylint: disable=g-import-not-at-top
-  from absl import app
-  from absl import flags
-  import gin
-  from t5x import gin_utils
-  # pylint: enable=g-import-not-at-top
-
-  FLAGS = flags.FLAGS
-
-  jax.config.parse_flags_with_absl()
-
-  flags.DEFINE_multi_string(
-      'gin_file',
-      default=None,
-      help='Path to gin configuration file. Multiple paths may be passed and '
-      'will be imported in the given order, with later configurations '
-      'overriding earlier ones.')
-
-  flags.DEFINE_multi_string(
-      'gin_bindings', default=[], help='Individual gin bindings.')
-
-  flags.DEFINE_list(
-      'gin_search_paths',
-      default=['.'],
-      help='Comma-separated list of gin config path prefixes to be prepended '
-      'to suffixes given via `--gin_file`. If a file appears in. Only the '
-      'first prefix that produces a valid path for each suffix will be '
-      'used.')
-
-  flags.DEFINE_string(
-      'tfds_data_dir', None,
-      'If set, this directory will be used to store datasets prepared by '
-      'TensorFlow Datasets that are not available in the public TFDS GCS '
-      'bucket. Note that this flag overrides the `tfds_data_dir` attribute of '
-      'all `Task`s.')
-
-  flags.DEFINE_list(
-      'seqio_additional_cache_dirs', [],
-      'Directories to search for cached Tasks in addition to defaults.')
-
-
-
-  def main(argv: Sequence[str]):
-    """Wrapper for pdb post mortems."""
-    _main(argv)
-
-  def _main(argv: Sequence[str]):
-    """True main function."""
-    if len(argv) > 1:
-      raise app.UsageError('Too many command-line arguments.')
-
-    if FLAGS.tfds_data_dir:
-      seqio.set_tfds_data_dir_override(FLAGS.tfds_data_dir)
-
-    seqio.add_global_cache_dirs(FLAGS.seqio_additional_cache_dirs)
-
-    # Create gin-configurable version of `train`.
-    train_using_gin = gin.configurable(train)
-
-    gin_utils.parse_gin_flags(
-        # User-provided gin paths take precedence if relative paths conflict.
-        FLAGS.gin_search_paths + _DEFAULT_GIN_SEARCH_PATHS,
-        FLAGS.gin_file,
-        FLAGS.gin_bindings)
-    train_using_gin()
-
-  gin_utils.run(main)
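With the forked train.py gone, the checkpoint-to-Flax conversion and git push it used to run after every save now have to happen outside the training loop. A rough standalone sketch of that same sequence, assuming convert_t5x_checkpoint_to_flax.py, config.json and a checkpoint_<step> directory are present in the model repository working copy (the step number below is only an example placeholder):

import subprocess

checkpoint_step = 100000  # example value; point at an existing checkpoint_<step> directory

# Convert the T5X checkpoint to HF Flax weights inside the working copy.
subprocess.run([
    "python3", "convert_t5x_checkpoint_to_flax.py",
    f"--t5x_checkpoint_path=checkpoint_{checkpoint_step}",
    "--config_name=config.json",
    "--flax_dump_folder_path=./",
], check=True)

# Commit and push once, instead of on every checkpoint during training.
subprocess.run("git lfs prune --verify-remote", shell=True, check=True)
subprocess.run("git add .", shell=True, check=True)
subprocess.run(f'git commit -m "Saving weights and logs of step {checkpoint_step}"', shell=True, check=True)
subprocess.run("git push", shell=True, check=True)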
train/events.out.tfevents.1649073594.t1v-n-304587cf-w-0.1316481.0.v2
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:224a0411c5fc4e0e882c7a647ff554b58fec3f79dc12f9809b26b3a319225c1d
-size 7585
train/events.out.tfevents.1649092520.t1v-n-304587cf-w-0.1399566.0.v2
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d5a8424af960443ad6fbb097216b8add4fb9af5298424f440afb56a27ee260b9
-size 16363
train/{events.out.tfevents.1649056216.t1v-n-304587cf-w-0.1239745.0.v2 → events.out.tfevents.1649705066.t1v-n-304587cf-w-0.2549834.0.v2}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:59fc20fda0f88a5e31f18b0ebc9497d4131b6458ac28d6d2a52875d1ba5c5b13
+size 10402
training_eval/pretrain_finnish/events.out.tfevents.1649073594.t1v-n-304587cf-w-0.1316481.1.v2
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7a4fbca2952ba8dbad98a5e752b6bcf0e80ea9d1376380bbbbd669b7fb0897e7
-size 1431
training_eval/pretrain_finnish/events.out.tfevents.1649092520.t1v-n-304587cf-w-0.1399566.1.v2
DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b3efa0a2b6af0ef032441aaf0e97ac69bab14c84b90bbbfa22001dc094926fb2
-size 9261
training_eval/pretrain_finnish/{events.out.tfevents.1649056216.t1v-n-304587cf-w-0.1239745.1.v2 → events.out.tfevents.1649705066.t1v-n-304587cf-w-0.2549834.1.v2}
RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6ec5c452edec8f5036cab8e5f3d67f492e55a15ff004d4ee5847b2d2cd56f2df
+size 4024