In this notebook, we will download a model, dataset, and metric from the Hugging Face Hub and generate an interactive HTML model card using the Intel AI Safety Model Card Generator tool.
1. Download and Import Dependencies
[ ]:
!pip install evaluate datasets transformers[torch] scikit-learn
# ModelCardGen (imported below) comes from Intel's intel-xai-tools; this notebook assumes it is already installed.
[ ]:
from intel_ai_safety.model_card_gen.model_card_gen import ModelCardGen
from datasets import load_dataset
import evaluate
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
from collections import Counter
from functools import reduce
import json
import numpy as np
2. Download Dataset from Hugging Face Datasets
[ ]:
raw_dataset = load_dataset("hatexplain")
he_dataset = raw_dataset.map(lambda e: {'text': " ".join(e['post_tokens'])})
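We can peek at the first test record to confirm that the post_tokens were joined into a single text field:
[ ]:
# Inspect the first test example: the token list vs. the joined text string
example = he_dataset['test'][0]
print(example['post_tokens'][:5])
print(example['text'][:100])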
3. Transform Dataset
[3]:
def get_common_targets(elm):
    """
    Merge the annotated targets from each annotator into a single
    list, keeping only targets that annotators agree on (i.e. targets
    named by more than one annotator).
    """
    targets = elm['annotators']['target']
    counts = reduce(lambda x, y: Counter(x) + Counter(y), targets)
    return {'target': [target for target, count in counts.items() if count > 1]}

he_dataset = he_dataset.map(get_common_targets)
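To see the agreement rule in action, here is a toy element (hypothetical annotations, not from the dataset): 'African' is named by more than one annotator, so it is kept, while 'Women' is not.
[ ]:
# Toy check: only targets named by 2+ annotators survive
toy = {'annotators': {'target': [['African'], ['African', 'Women'], ['African']]}}
print(get_common_targets(toy))  # -> {'target': ['African']}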
[4]:
def get_top_communities(targets, top=10):
    """
    Return the `top` most frequently targeted identity groups.
    """
    target_counts = reduce(lambda x, y: Counter(x) + Counter(y), targets)
    top_targets, _ = zip(*target_counts.most_common(top))
    return set(top_targets)

TOP = get_top_communities(he_dataset['test']['target'])

def filter_top_target(elm):
    """
    Keep only the identity groups targeted in each item that are
    among the top 10 most common identity groups.
    """
    targets = set(elm['target']) & TOP
    # Return a sorted list rather than a set so the column serializes cleanly
    return {'target': sorted(targets)}

he_dataset = he_dataset.map(filter_top_target)
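After filtering, a quick sanity check shows how many test items still carry at least one top-10 target:
[ ]:
# Count test items that retain at least one top-10 target after filtering
n_with_target = sum(1 for t in he_dataset['test']['target'] if len(t) > 0)
print(f"{n_with_target} of {len(he_dataset['test'])} test items have a top-10 target")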
[5]:
def get_label(elm):
    """
    This function derives a ground-truth label from the
    annotators' labels by majority vote.
    """
    labels = elm['annotators']['label']
    max_label = max(labels, key=labels.count)
    return {'label': max_label}

he_dataset = he_dataset.map(get_label)
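For example, with the hypothetical annotator labels [0, 0, 2], the majority vote returns 0:
[ ]:
# Toy check of the majority vote among three annotators
print(get_label({'annotators': {'label': [0, 0, 2]}}))  # -> {'label': 0}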
4. Download Model and Process Outputs
[ ]:
import torch
from torch.nn.functional import softmax

he_dataset.set_format("pt", columns=["post_tokens"], output_all_columns=True)

tokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain")
model = AutoModelForSequenceClassification.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain")

def process(examples):
    # Tokenize (truncating anything beyond the model's maximum length),
    # run the model, and return softmax probabilities over the three classes
    bert_tokens = tokenizer(examples['text'], return_tensors="pt", truncation=True)
    with torch.no_grad():
        output = model(**bert_tokens)
    return {"output": softmax(output['logits'], dim=-1).flatten()}

test_ds = he_dataset['test'].map(process)
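As a quick sanity check, we can classify a single string the same way; the index-to-name mapping below follows the class_label_map this notebook assumes later (0: Hate, 1: Offensive, 2: Normal):
[ ]:
# Score one example text and pair each class name with its probability
labels = ["Hate", "Offensive", "Normal"]
tokens = tokenizer("i love the world", return_tensors="pt", truncation=True)
with torch.no_grad():
    probs = softmax(model(**tokens)["logits"], dim=-1).flatten()
print({name: round(float(p), 3) for name, p in zip(labels, probs)})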
5. Get Bias Metric from Hugging Face
[ ]:
metric = evaluate.load('Intel/bias_auc')
print(metric)
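Before running the metric on the full test split, a toy call (hypothetical numbers) shows the expected input shapes; the argument names and the [P(positive), 1 - P(positive)] output convention mirror the real call in the next section:
[ ]:
# Toy batch: four items, one subgroup ("Women"), binary labels
metric.add_batch(target=[["Women"], ["Women"], [], []],     # identity groups per item
                 label=[1, 0, 1, 0],                         # binary ground-truth labels
                 output=[[0.9, 0.1], [0.2, 0.8], [0.7, 0.3], [0.3, 0.7]])
print(metric.compute(subgroups={"Women"}))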
6. Run Bias Metric and Transform Output to Get Metrics by Group for Model Card
The Model Card Generator takes two pandas DataFrames as input. We will first create a metrics_by_group DataFrame from the Bias AUC metric loaded above. Bias AUC follows Borkan et al. (2019) and reports three AUCs per identity group: Subgroup (restricted to items mentioning the group), BPSN (background-positive, subgroup-negative), and BNSP (background-negative, subgroup-positive).
[10]:
unique_subgroups = set(group for group_list in test_ds['target'] for group in group_list) - {'Disability'}
target_groups = test_ds['target']
y_pred_prob = test_ds['output']
true_labels = test_ds['label']

class_label_map = {0: "Hate",
                   1: "Offensive",
                   2: "Normal"}
num_of_classes = len(y_pred_prob[0])

metrics_by_group = pd.DataFrame()
for class_label in range(num_of_classes):
    # Treat the current class as the positive class (one-vs-rest)
    binary_class_labels = [1 if label == class_label else 0 for label in true_labels]
    class_probs = [[prob[class_label], 1 - prob[class_label]] for prob in y_pred_prob]
    metric.add_batch(target=target_groups,
                     label=binary_class_labels,
                     output=class_probs)
    metric_output = metric.compute(subgroups=unique_subgroups)
    metrics_by_group_per_class = (pd.DataFrame.from_dict(metric_output)
                                  .T
                                  .reset_index()
                                  .rename({'index': 'group'}, axis=1))
    metrics_by_group_per_class['feature'] = 'target'
    metrics_by_group_per_class['label'] = class_label_map[class_label]
    metrics_by_group = pd.concat([metrics_by_group, metrics_by_group_per_class], ignore_index=True)
[11]:
metrics_by_group
[11]:
|    | group | Subgroup | BPSN | BNSP | feature | label |
|---|---|---|---|---|---|---|
| 0 | None | 0.663978 | 0.051755 | 0.797641 | target | Hate |
| 1 | Islam | 0.252081 | 0.225308 | 0.068953 | target | Hate |
| 2 | Homosexual | 0.166520 | 0.104956 | 0.114978 | target | Hate |
| 3 | Other | 0.064327 | 0.036619 | 0.117924 | target | Hate |
| 4 | African | 0.179228 | 0.276113 | 0.049488 | target | Hate |
| 5 | Caucasian | 0.119734 | 0.035024 | 0.201707 | target | Hate |
| 6 | Women | 0.091592 | 0.052775 | 0.134179 | target | Hate |
| 7 | Jewish | 0.158551 | 0.284709 | 0.042195 | target | Hate |
| 8 | Arab | 0.193939 | 0.204050 | 0.072467 | target | Hate |
| 9 | Refugee | 0.217247 | 0.050079 | 0.265590 | target | Hate |
| 10 | Overall generalized mean | 0.097524 | 0.047229 | 0.060728 | target | Hate |
| 11 | None | 0.195018 | 0.278376 | 0.107284 | target | Offensive |
| 12 | Islam | 0.134600 | 0.093231 | 0.177565 | target | Offensive |
| 13 | Homosexual | 0.154586 | 0.129348 | 0.145039 | target | Offensive |
| 14 | Other | 0.169524 | 0.136259 | 0.171880 | target | Offensive |
| 15 | African | 0.201166 | 0.063841 | 0.406097 | target | Offensive |
| 16 | Caucasian | 0.223684 | 0.337492 | 0.056882 | target | Offensive |
| 17 | Women | 0.223734 | 0.199604 | 0.116902 | target | Offensive |
| 18 | Jewish | 0.141978 | 0.049844 | 0.322896 | target | Offensive |
| 19 | Arab | 0.273810 | 0.079282 | 0.335899 | target | Offensive |
| 20 | Refugee | 0.175708 | 0.234858 | 0.083398 | target | Offensive |
| 21 | Overall generalized mean | 0.167782 | 0.073353 | 0.086443 | target | Offensive |
| 22 | None | 0.143346 | 0.273709 | 0.112244 | target | Normal |
| 23 | Islam | 0.383023 | 0.170385 | 0.438840 | target | Normal |
| 24 | Homosexual | 0.314105 | 0.345418 | 0.183157 | target | Normal |
| 25 | Other | 0.130495 | 0.322379 | 0.087658 | target | Normal |
| 26 | African | 0.233568 | 0.118453 | 0.403769 | target | Normal |
| 27 | Caucasian | 0.251852 | 0.210714 | 0.265974 | target | Normal |
| 28 | Women | 0.284795 | 0.335885 | 0.176257 | target | Normal |
| 29 | Jewish | 0.236979 | 0.144365 | 0.362873 | target | Normal |
| 30 | Arab | 0.220513 | 0.200502 | 0.256000 | target | Normal |
| 31 | Refugee | 0.355429 | 0.354636 | 0.207917 | target | Normal |
| 32 | Overall generalized mean | 0.182211 | 0.168860 | 0.130460 | target | Normal |
7. Transform Output for Metrics by Threshold for Model Card
Now, we will create a metrics_by_threshold DataFrame containing performance metrics at each decision threshold.
[14]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
import pandas as pd

thetas = np.linspace(0, 1, 1001)
# Convert the per-example probability lists to an array so we can slice by class
y_pred_prob = np.array(test_ds['output'])
true_labels = test_ds['label']
num_of_classes = len(y_pred_prob[0])

metrics_by_threshold = pd.DataFrame()
class_label_index_map = {0: "Hate",
                         1: "Offensive",
                         2: "Normal"}
for class_label in range(num_of_classes):
    class_y_pred_prob = y_pred_prob[:, class_label]
    binary_class_labels = [1 if label == class_label else 0 for label in true_labels]
    metrics_dict_per_class = {
        'threshold': thetas,
        'precision': [precision_score(binary_class_labels, class_y_pred_prob > theta, zero_division=0) for theta in thetas],
        'recall': [recall_score(binary_class_labels, class_y_pred_prob > theta, zero_division=0) for theta in thetas],
        'f1': [f1_score(binary_class_labels, class_y_pred_prob > theta, zero_division=0) for theta in thetas],
        'accuracy': [accuracy_score(binary_class_labels, class_y_pred_prob > theta) for theta in thetas],
        'label': [class_label_index_map[class_label]] * len(thetas)
    }
    metrics_by_threshold = pd.concat([metrics_by_threshold, pd.DataFrame.from_dict(metrics_dict_per_class)], ignore_index=True)
[15]:
metrics_by_threshold
[15]:
|    | threshold | precision | recall | f1 | accuracy | label |
|---|---|---|---|---|---|---|
| 0 | 0.000 | 0.308732 | 1.0 | 0.471803 | 0.308732 | Hate |
| 1 | 0.001 | 0.308732 | 1.0 | 0.471803 | 0.308732 | Hate |
| 2 | 0.002 | 0.308732 | 1.0 | 0.471803 | 0.308732 | Hate |
| 3 | 0.003 | 0.308732 | 1.0 | 0.471803 | 0.308732 | Hate |
| 4 | 0.004 | 0.308732 | 1.0 | 0.471803 | 0.308732 | Hate |
| ... | ... | ... | ... | ... | ... | ... |
| 2998 | 0.996 | 0.000000 | 0.0 | 0.000000 | 0.715177 | Normal |
| 2999 | 0.997 | 0.000000 | 0.0 | 0.000000 | 0.715177 | Normal |
| 3000 | 0.998 | 0.000000 | 0.0 | 0.000000 | 0.715177 | Normal |
| 3001 | 0.999 | 0.000000 | 0.0 | 0.000000 | 0.715177 | Normal |
| 3002 | 1.000 | 0.000000 | 0.0 | 0.000000 | 0.715177 | Normal |
3003 rows × 6 columns
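One common use of metrics_by_threshold is choosing an operating point. As a small sketch over the DataFrame built above, we can pick the threshold that maximizes F1 for each label:
[ ]:
# For each label, select the row with the highest F1 score
best_f1 = metrics_by_threshold.loc[
    metrics_by_threshold.groupby('label')['f1'].idxmax(),
    ['label', 'threshold', 'precision', 'recall', 'f1']
]
print(best_f1.to_string(index=False))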
[20]:
metrics_by_threshold.to_csv('multiclass_metrics_by_threshold.csv', index=False)
metrics_by_group.to_csv('multiclass_metrics_by_group.csv', index=False)
8. Build Model Card
Pass the two DataFrames, along with a model card dictionary, to the ModelCardGen.generate class method to build the model card.
[16]:
mc = {
"schema_version": "0.0.1",
"model_details": {
"name": "Explainable Hate Speech Detection",
"version": {
"name": "0.1",
"date": "2020"
},
"graphics": {},
"citations": [
{
"citation": '''@article{mathew2020hatexplain,
title={HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection},
author={Mathew, Binny and Saha, Punyajoy and Yimam, Seid Muhie and Biemann, Chris and Goyal, Pawan and Mukherjee, Animesh},
journal={arXiv preprint arXiv:2012.10289},
year={2020}}'''
},
],
"overview": 'The model is used for classifying a text as Hatespeech, Offensive, or Normal. The model is trained using data from Gab and Twitter and Human Rationales were included as part of the training data to boost the performance. The dataset and models are available here: https://github.com/punyajoy/HateXplain',
}
}
[17]:
mcg = ModelCardGen.generate(metrics_by_group=metrics_by_group, metrics_by_threshold=metrics_by_threshold, model_card=mc)
mcg
[17]:
Model Details
Overview
The model is used for classifying a text as Hatespeech, Offensive, or Normal. The model is trained using data from Gab and Twitter and Human Rationales were included as part of the training data to boost the performance. The dataset and models are available here: https://github.com/punyajoy/HateXplain
Model Performance
Overall Accuracy/Precision/Recall/F1 - Label : (Hate)
Overall Accuracy/Precision/Recall/F1 - Label : (Normal)
Overall Accuracy/Precision/Recall/F1 - Label : (Offensive)
Version
name: 0.1
date: 2020
Citations
- @article{mathew2020hatexplain, title={HateXplain: A Benchmark Dataset for Explainable Hate Speech Detection}, author={Mathew, Binny and Saha, Punyajoy and Yimam, Seid Muhie and Biemann, Chris and Goyal, Pawan and Mukherjee, Animesh}, journal={arXiv preprint arXiv:2012.10289}, year={2020}}