diff --git a/stanza/models/lemma_classifier/evaluate_models.py b/stanza/models/lemma_classifier/evaluate_models.py
index c2fca1809a..e5a8692fa3 100644
--- a/stanza/models/lemma_classifier/evaluate_models.py
+++ b/stanza/models/lemma_classifier/evaluate_models.py
@@ -154,7 +154,6 @@ def evaluate_model(model: nn.Module, eval_path: str, verbose: bool = True, is_tr
     # load in eval data
     text_batches, index_batches, label_batches, _, label_decoder = utils.load_dataset(eval_path, label_decoder=model.label_decoder)
-    # TODO fix this in the future
     text_batches, index_batches, label_batches = text_batches[: -1], index_batches[: -1], label_batches[: -1]
diff --git a/stanza/models/lemma_classifier/transformer_baseline/baseline_trainer.py b/stanza/models/lemma_classifier/transformer_baseline/baseline_trainer.py
index 2ed354593e..a7d51b75a7 100644
--- a/stanza/models/lemma_classifier/transformer_baseline/baseline_trainer.py
+++ b/stanza/models/lemma_classifier/transformer_baseline/baseline_trainer.py
@@ -66,6 +66,25 @@ def configure_weighted_loss(self, label_decoder: Mapping, counts: Mapping):
         logging.info(f"Using weights {weights} for weighted loss.")
         self.criterion = nn.BCEWithLogitsLoss(weight=weights)
 
+    def set_layer_learning_rates(self, transformer_lr: float, mlp_lr: float) -> torch.optim:
+        """
+        Sets learning rates for each layer of the model.
+        Currently, the model has the transformer layer and the MLP layer, so these are tweakable.
+
+        Returns (torch.optim): An Adam optimizer with the learning rates adjusted per layer.
+        """
+        transformer_params, mlp_params = [], []
+        for name, param in self.model.named_parameters():
+            if 'transformer' in name:
+                transformer_params.append(param)
+            elif 'mlp' in name:
+                mlp_params.append(param)
+        optimizer = optim.Adam([
+            {"params": transformer_params, "lr": transformer_lr},
+            {"params": mlp_params, "lr": mlp_lr}
+        ])
+        return optimizer
+
     def train(self, num_epochs: int, save_name: str, args: Mapping, eval_file: str, **kwargs):
         """
@@ -90,17 +109,17 @@ def train(self, num_epochs: int, save_name: str, args: Mapping, eval_file: str,
         self.output_dim = len(label_decoder)
         logging.info(f"Using label decoder : {label_decoder}")
-        # TODO: fix this to make it not disregard last batch, and instead pad it or some other idea
-        text_batches, position_batches, label_batches = text_batches[:-1], position_batches[:-1], label_batches[:-1]
+        # # TODO: fix this to make it not disregard last batch, and instead pad it or some other idea
+        # text_batches, position_batches, label_batches = text_batches[:-1], position_batches[:-1], label_batches[:-1]
 
-        # Move data to device
-        label_batches = torch.stack(label_batches).to(device)
-        position_batches = torch.stack(position_batches).to(device)
 
         assert len(text_batches) == len(position_batches) == len(label_batches), f"Input batch sizes did not match ({len(text_batches)}, {len(position_batches)}, {len(label_batches)})."
         self.model = LemmaClassifierWithTransformer(output_dim=self.output_dim, transformer_name=self.transformer_name, label_decoder=label_decoder)
-        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
+        self.optimizer = self.set_layer_learning_rates(transformer_lr=self.lr/2, mlp_lr=self.lr) # Adam optimizer
         self.model.to(device)
         self.model.transformer.to(device)
@@ -118,7 +137,8 @@ def train(self, num_epochs: int, save_name: str, args: Mapping, eval_file: str,
         for epoch in range(num_epochs):
             # go over entire dataset with each epoch
             for sentences, positions, labels in tqdm(zip(text_batches, position_batches, label_batches), total=len(text_batches)):
-
+                assert len(sentences) == len(positions) == len(labels), f"Input sentences, positions, and labels are of unequal length ({len(sentences), len(positions), len(labels)})"
+
                 self.optimizer.zero_grad()
                 outputs = self.model(positions, sentences)
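Note: the new `set_layer_learning_rates` helper relies on PyTorch's optimizer parameter groups, where `optim.Adam` accepts a list of dicts and applies a separate learning rate to each group. The sketch below shows the same pattern in isolation; `ToyClassifier` and `build_optimizer` are illustrative stand-ins, not part of this patch or of stanza.

```python
# Minimal sketch of per-layer learning rates via Adam parameter groups.
# ToyClassifier is a hypothetical stand-in for LemmaClassifierWithTransformer.
from torch import nn, optim

class ToyClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.transformer = nn.Linear(16, 16)  # placeholder for the real transformer
        self.mlp = nn.Sequential(nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 2))

    def forward(self, x):
        return self.mlp(self.transformer(x))

def build_optimizer(model: nn.Module, transformer_lr: float, mlp_lr: float) -> optim.Adam:
    # Split parameters by submodule name, mirroring set_layer_learning_rates.
    transformer_params, mlp_params = [], []
    for name, param in model.named_parameters():
        if 'transformer' in name:
            transformer_params.append(param)
        elif 'mlp' in name:
            mlp_params.append(param)
    # Each dict is its own parameter group with its own learning rate.
    return optim.Adam([
        {"params": transformer_params, "lr": transformer_lr},
        {"params": mlp_params, "lr": mlp_lr},
    ])

base_lr = 0.001
optimizer = build_optimizer(ToyClassifier(), transformer_lr=base_lr / 2, mlp_lr=base_lr)
print([group["lr"] for group in optimizer.param_groups])  # [0.0005, 0.001]
```

With `transformer_lr=self.lr/2` and `mlp_lr=self.lr`, the pretrained transformer is updated more conservatively than the freshly initialized MLP head, which is the intent of the `self.optimizer` change above.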