Commit 60f8a12 (1 parent: 5e56615)
Refactor: Fix B905 zip() strictness and resolve mypy error in evaluate.py
Files changed:
- scripts/eval_rouge.py +1 -1
- scripts/evaluate.py +4 -4
- scripts/preprocess_data.py +2 -2
- src/inference/pipeline.py +1 -1
- src/models/factory.py +6 -2
- src/training/metrics.py +2 -2
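For context: B905 is the Ruff lint rule that flags zip() calls without an explicit strict= argument (the flag exists since Python 3.10). This commit opts into strict=False everywhere, which keeps the old truncating behavior while satisfying the linter. A minimal standalone sketch of the difference, with toy data not taken from this repo:

    a = [1, 2, 3]
    b = ["x", "y"]

    # strict=False (the choice in this commit) silently truncates to the
    # shorter input, exactly like a bare zip(a, b).
    print(list(zip(a, b, strict=False)))  # [(1, 'x'), (2, 'y')]

    # strict=True raises instead of hiding a length mismatch.
    try:
        list(zip(a, b, strict=True))
    except ValueError as err:
        print(err)  # zip() argument 2 is shorter than argument 1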
scripts/eval_rouge.py

@@ -165,7 +165,7 @@ def main() -> None:
         references = [item[1] for item in batch]
         predictions = pipeline.summarize(documents, max_length=args.max_length)

-        for reference, prediction in zip(references, predictions):
+        for reference, prediction in zip(references, predictions, strict=False):
             scores = scorer.score(reference, prediction)
             for metric_name, score in scores.items():
                 score_store[metric_name]["precision"].append(score.precision)
scripts/evaluate.py

@@ -9,7 +9,7 @@ import argparse
 import json
 import sys
 from pathlib import Path
-from typing import List
+from typing import Any, List, cast

 import torch
 from sklearn.preprocessing import MultiLabelBinarizer

@@ -44,13 +44,13 @@ SPLIT_ALIASES = {
 }


-def _read_split(root: Path, split: str, loader) -> List:
+def _read_split(root: Path, split: str, loader) -> List[Any]:
     aliases = SPLIT_ALIASES.get(split, (split,))
     for alias in aliases:
         for ext in ("jsonl", "json"):
             candidate = root / f"{alias}.{ext}"
             if candidate.exists():
-                return loader(str(candidate))
+                return cast(List[Any], loader(str(candidate)))
     raise FileNotFoundError(f"Missing {split} split under {root}")


@@ -152,7 +152,7 @@ def main() -> None:
         inputs = [example.text for example in batch]
         predictions = pipeline.predict_emotions(inputs)
         target_matrix = emotion_binarizer.transform([list(example.emotions) for example in batch])
-        for pred, target_row in zip(predictions, target_matrix):
+        for pred, target_row in zip(predictions, target_matrix, strict=False):
             vector = torch.zeros(len(metadata.emotion), dtype=torch.float32)
             for label in pred.labels:
                 idx = label_to_index.get(label)
scripts/preprocess_data.py

@@ -282,8 +282,8 @@ def preprocess_topic(
         random_state=seed,
         stratify=topics,
     )
-    train_records = list(zip(train_texts, train_topics))
-    val_rows = list(zip(val_texts, val_topics))
+    train_records = list(zip(train_texts, train_topics, strict=False))
+    val_rows = list(zip(val_texts, val_topics, strict=False))

     def to_records(pairs: Sequence[Tuple[str, str]]) -> Iterator[Dict[str, object]]:
         for text, topic in pairs:
src/inference/pipeline.py

@@ -132,7 +132,7 @@ class InferencePipeline:
         for row in probs.cpu():
             pairs = [
                 (label, score)
-                for label, score in zip(self.emotion_labels, row.tolist())
+                for label, score in zip(self.emotion_labels, row.tolist(), strict=False)
                 if score >= decision_threshold
             ]
             labels = [label for label, _ in pairs]
src/models/factory.py

@@ -81,7 +81,9 @@ def _load_pretrained_weights(
     # Skip positional encoding - BART uses learned positions, I use sinusoidal
     # implementation will work fine with sinusoidal encodings

-    for _i, (custom_layer, bart_layer) in enumerate(zip(encoder.layers, bart.encoder.layers)):
+    for _i, (custom_layer, bart_layer) in enumerate(
+        zip(encoder.layers, bart.encoder.layers, strict=False)
+    ):
         # Self-attention
         custom_layer.self_attn.W_Q.weight.data.copy_(bart_layer.self_attn.q_proj.weight.data)
         custom_layer.self_attn.W_Q.bias.data.copy_(bart_layer.self_attn.q_proj.bias.data)

@@ -115,7 +117,9 @@ def _load_pretrained_weights(
     decoder.embedding.weight.data.copy_(bart.decoder.embed_tokens.weight.data)
     # Skip positional encoding - BART uses learned positions, we use sinusoidal

-    for _i, (custom_layer, bart_layer) in enumerate(zip(decoder.layers, bart.decoder.layers)):
+    for _i, (custom_layer, bart_layer) in enumerate(
+        zip(decoder.layers, bart.decoder.layers, strict=False)
+    ):
         # Self-attention
         custom_layer.self_attn.W_Q.weight.data.copy_(bart_layer.self_attn.q_proj.weight.data)
         custom_layer.self_attn.W_Q.bias.data.copy_(bart_layer.self_attn.q_proj.bias.data)
src/training/metrics.py

@@ -28,7 +28,7 @@ def rouge_like(predictions: Sequence[str], references: Sequence[str]) -> float:
     if not predictions or not references:
         return 0.0
     scores = []
-    for pred, ref in zip(predictions, references):
+    for pred, ref in zip(predictions, references, strict=False):
         pred_tokens = pred.split()
         ref_tokens = ref.split()
         if not ref_tokens:

@@ -46,7 +46,7 @@ def calculate_bleu(predictions: Sequence[str], references: Sequence[str]) -> float:

     smoother = SmoothingFunction().method1
     scores = []
-    for pred, ref in zip(predictions, references):
+    for pred, ref in zip(predictions, references, strict=False):
         pred_tokens = pred.split()
         ref_tokens = [ref.split()]  # BLEU expects list of references
         scores.append(sentence_bleu(ref_tokens, pred_tokens, smoothing_function=smoother))