fusion_bench.tasks

Image Classification Tasks

fusion_bench.tasks.clip_classification

CLIPTemplateFactory

A factory class for creating CLIP dataset templates.

This class provides methods to retrieve class names and templates for various datasets, register new datasets, and get a list of all available datasets. It uses a mapping from dataset names to their respective module paths or detailed information, facilitating dynamic import and usage of dataset-specific class names and templates.

Attributes:

  • _dataset_mapping (dict) –

    A mapping from dataset names to their respective module paths or detailed information including module path, class names, and templates

Methods:

  • get_classnames_and_templates(dataset_name: str)

    Retrieves class names and templates for the specified dataset.

  • register_dataset(dataset_name: str, *, dataset_info: Dict[str, Any] = None, classnames: List[str] = None, templates: List[Callable] = None)

    Registers a new dataset with its associated information.

  • get_available_datasets()

    Returns a list of all available dataset names.
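
A minimal usage sketch (the dataset names come from the mapping in the source below):

from fusion_bench.tasks.clip_classification import CLIPTemplateFactory

# List every dataset name known to the factory.
print(CLIPTemplateFactory.get_available_datasets())

# Retrieve class names and prompt-template callables for a dataset.
classnames, templates = CLIPTemplateFactory.get_classnames_and_templates("mnist")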

Source code in fusion_bench/tasks/clip_classification/__init__.py
class CLIPTemplateFactory:
    """
    A factory class for creating CLIP dataset templates.

    This class provides methods to retrieve class names and templates for various datasets,
    register new datasets, and get a list of all available datasets. It uses a mapping
    from dataset names to their respective module paths or detailed information, facilitating
    dynamic import and usage of dataset-specific class names and templates.

    Attributes:
        _dataset_mapping (dict): A mapping from dataset names to their respective module paths
        or detailed information including module path, class names, and templates.

    Methods:
        get_classnames_and_templates(dataset_name: str): Retrieves class names and templates for the specified dataset.
        register_dataset(dataset_name: str, dataset_info: Dict[str, Any] = None, classnames: List[str] = None, templates: List[Callable] = None): Registers a new dataset with its associated information.
        get_available_datasets(): Returns a list of all available dataset names.
    """

    _dataset_mapping = {
        "mnist": ".mnist",
        "stanford-cars": ".stanford_cars",
        "stanford_cars": ".stanford_cars",
        "tanganke/stanford_cars": ".stanford_cars",
        "gtsrb": ".gtsrb",
        "tanganke/gtsrb": ".gtsrb",
        "resisc45": ".resisc45",
        "tanganke/resisc45": ".resisc45",
        "dtd": ".dtd",
        "tanganke/dtd": ".dtd",
        "eurosat": ".eurosat",
        "tanganke/eurosat": ".eurosat",
        "sun397": ".sun397",
        "tanganke/sun397": ".sun397",
        "cifar10": ".cifar10",
        "svhn": ".svhn",
        "cifar100": {
            "module": ".cifar100",
            "classnames": "fine_label",
            "templates": "templates",
        },
        "nateraw/rendered-sst2": ".rendered_sst2",
        "rendered-sst2": ".rendered_sst2",
        "tanganke/stl10": ".stl10",
        "stl10": ".stl10",
        "dpdl-benchmark/oxford_flowers102": ".flower102",
        "oxford_flowers102": ".flower102",
        "timm/oxford-iiit-pet": ".oxford_iiit_pet",
        "oxford-iiit-pet": ".oxford_iiit_pet",
        "imagenet": ".imagenet",
        "tiny-imagenet": ".tiny_imagenet",
        "pcam": ".pcam",
        "fer2013": ".fer2013",
        "emnist_mnist": ".emnist_mnist",
        "emnist_letters": ".emnist_letters",
        "kmnist": ".kmnist",
        "food101": ".food101",
        "fashion_mnist": ".fashion_mnist",
        "cub-200-2011": ".cub_200_2011",
        "mango-leaf-disease": ".mango_leaf_disease",
    }

    @staticmethod
    def get_classnames_and_templates(dataset_name: str):
        """
        Retrieves class names and templates for the specified dataset.

        This method looks up the dataset information in the internal mapping and dynamically imports
        the class names and templates from the specified module. It supports both simple string mappings
        and detailed dictionary mappings for datasets.

        Args:
            dataset_name (str): The name of the dataset for which to retrieve class names and templates.

        Returns:
            Tuple[List[str], List[Callable]]: A tuple containing a list of class names and a list of template callables.

        Raises:
            ValueError: If the dataset_name is not found in the internal mapping.
        """
        if dataset_name not in CLIPTemplateFactory._dataset_mapping:
            raise ValueError(
                f"Unknown dataset {dataset_name}, available datasets: {list(CLIPTemplateFactory._dataset_mapping.keys())}. You can register a new dataset using the `CLIPTemplateFactory.register_dataset()` method."
            )

        dataset_info = CLIPTemplateFactory._dataset_mapping[dataset_name]
        # convert dataset_info to dict format: { 'module': str, 'classnames': str, 'templates': str }
        if isinstance(dataset_info, str):
            dataset_info = _check_module_name(dataset_info)
            dataset_info = {
                "module": dataset_info,
                "classnames": "classnames",
                "templates": "templates",
            }
        elif isinstance(dataset_info, dict):
            if "module" in dataset_info:
                dataset_info["module"] = _check_module_name(dataset_info["module"])

        # import classnames and templates from the specified module
        # convert to dict format: { 'labels': List[str], 'templates': List[Callable] }
        if "module" in dataset_info:
            module = importlib.import_module(dataset_info["module"])
            classnames = getattr(module, dataset_info["classnames"])
            templates = getattr(module, dataset_info["templates"])
        else:
            classnames = dataset_info["classnames"]
            templates = dataset_info["templates"]

        return classnames, templates

    @staticmethod
    def register_dataset(
        dataset_name: str,
        *,
        dataset_info: Dict[str, Any] = None,
        classnames: List[str] = None,
        templates: List[Callable] = None,
    ):
        """
        Registers a new dataset with its associated information.

        This method allows for the dynamic addition of datasets to the internal mapping. It supports
        registration through either a detailed dictionary (`dataset_info`) or separate lists of class names
        and templates. If a dataset with the same name already exists, it will be overwritten.

        The expected format and contents of `dataset_info` can vary depending on the needs of the dataset being registered, but typically, it includes the following keys:

        - "module": A string specifying the module path where the dataset's related classes and functions are located. This is used for dynamic import of the dataset's class names and templates.
        - "classnames": This key is expected to hold the name of the attribute or variable in the specified module that contains a list of class names relevant to the dataset. These class names are used to label data points in the dataset.
        - "templates": Similar to "classnames", this key should specify the name of the attribute or variable in the module that contains a list of template callables. These templates are functions or methods that define how data points should be processed or transformed.

        Args:
            dataset_name (str): The name of the dataset to register.
            dataset_info (Dict[str, Any], optional): A dictionary containing the dataset information, including module path, class names, and templates. Defaults to None.
            classnames (List[str], optional): A list of class names for the dataset. Required if `dataset_info` is not provided. Defaults to None.
            templates (List[Callable], optional): A list of template callables for the dataset. Required if `dataset_info` is not provided. Defaults to None.

        Raises:
            AssertionError: If `dataset_info` is not provided and `classnames` or `templates` is missing.
        """
        assert dataset_info is not None or (
            classnames is not None and templates is not None
        ), "You must provide either `dataset_info` or both `classnames` and `templates`."

        if dataset_name in CLIPTemplateFactory._dataset_mapping:
            warnings.warn(
                f"Dataset {dataset_name} is already registered, overwriting the existing dataset information."
            )
        if dataset_info is None:
            dataset_info = {"classnames": classnames, "temolates": templates}
        CLIPTemplateFactory._dataset_mapping[dataset_name] = dataset_info

    @staticmethod
    def get_available_datasets():
        """
        Get a list of all available dataset names.

        Returns:
            List[str]: A list of dataset names.
        """
        return list(CLIPTemplateFactory._dataset_mapping.keys())
get_available_datasets() staticmethod

Get a list of all available dataset names.

Returns:

  • List[str]: A list of dataset names.

Source code in fusion_bench/tasks/clip_classification/__init__.py
@staticmethod
def get_available_datasets():
    """
    Get a list of all available dataset names.

    Returns:
        List[str]: A list of dataset names.
    """
    return list(CLIPTemplateFactory._dataset_mapping.keys())
get_classnames_and_templates(dataset_name) staticmethod

Retrieves class names and templates for the specified dataset.

This method looks up the dataset information in the internal mapping and dynamically imports the class names and templates from the specified module. It supports both simple string mappings and detailed dictionary mappings for datasets.

Parameters:

  • dataset_name (str) –

    The name of the dataset for which to retrieve class names and templates.

Returns:

  • Tuple[List[str], List[Callable]]: A tuple containing a list of class names and a list of template callables.

Raises:

  • ValueError

    If the dataset_name is not found in the internal mapping.

Source code in fusion_bench/tasks/clip_classification/__init__.py
@staticmethod
def get_classnames_and_templates(dataset_name: str):
    """
    Retrieves class names and templates for the specified dataset.

    This method looks up the dataset information in the internal mapping and dynamically imports
    the class names and templates from the specified module. It supports both simple string mappings
    and detailed dictionary mappings for datasets.

    Args:
        dataset_name (str): The name of the dataset for which to retrieve class names and templates.

    Returns:
        Tuple[List[str], List[Callable]]: A tuple containing a list of class names and a list of template callables.

    Raises:
        ValueError: If the dataset_name is not found in the internal mapping.
    """
    if dataset_name not in CLIPTemplateFactory._dataset_mapping:
        raise ValueError(
            f"Unknown dataset {dataset_name}, available datasets: {list(CLIPTemplateFactory._dataset_mapping.keys())}. You can register a new dataset using the `CLIPTemplateFactory.register_dataset()` method."
        )

    dataset_info = CLIPTemplateFactory._dataset_mapping[dataset_name]
    # convert dataset_info to dict format: { 'module': str, 'classnames': str, 'templates': str }
    if isinstance(dataset_info, str):
        dataset_info = _check_module_name(dataset_info)
        dataset_info = {
            "module": dataset_info,
            "classnames": "classnames",
            "templates": "templates",
        }
    elif isinstance(dataset_info, dict):
        if "module" in dataset_info:
            dataset_info["module"] = _check_module_name(dataset_info["module"])

    # import classnames and templates from the specified module
    # convert to dict format: { 'labels': List[str], 'templates': List[Callable] }
    if "module" in dataset_info:
        module = importlib.import_module(dataset_info["module"])
        classnames = getattr(module, dataset_info["classnames"])
        templates = getattr(module, dataset_info["templates"])
    else:
        classnames = dataset_info["classnames"]
        templates = dataset_info["templates"]

    return classnames, templates
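
The returned templates are callables, so a typical next step is to render one prompt per (template, class) pair. That each template maps a class name to a prompt string is an assumption about the dataset modules, which are not shown on this page:

classnames, templates = CLIPTemplateFactory.get_classnames_and_templates("cifar10")

# Assumes template(classname) -> str, e.g. "a photo of a {classname}."
prompts = [template(classname) for classname in classnames for template in templates]
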
register_dataset(dataset_name, *, dataset_info=None, classnames=None, templates=None) staticmethod

Registers a new dataset with its associated information.

This method allows for the dynamic addition of datasets to the internal mapping. It supports registration through either a detailed dictionary (dataset_info) or separate lists of class names and templates. If a dataset with the same name already exists, it will be overwritten.

The expected format and contents of dataset_info can vary depending on the needs of the dataset being registered, but typically, it includes the following keys:

  • "module": A string specifying the module path where the dataset's related classes and functions are located. This is used for dynamic import of the dataset's class names and templates.
  • "classnames": This key is expected to hold the name of the attribute or variable in the specified module that contains a list of class names relevant to the dataset. These class names are used to label data points in the dataset.
  • "templates": Similar to "classnames", this key should specify the name of the attribute or variable in the module that contains a list of template callables. These templates are functions or methods that define how data points should be processed or transformed.

Parameters:

  • dataset_name (str) –

    The name of the dataset to register.

  • dataset_info (Dict[str, Any], default: None ) –

    A dictionary containing the dataset information, including module path, class names, and templates. Defaults to None.

  • classnames (List[str], default: None ) –

    A list of class names for the dataset. Required if dataset_info is not provided. Defaults to None.

  • templates (List[Callable], default: None ) –

    A list of template callables for the dataset. Required if dataset_info is not provided. Defaults to None.

Raises:

  • AssertionError

    If dataset_info is not provided and classnames or templates is missing.

Source code in fusion_bench/tasks/clip_classification/__init__.py
@staticmethod
def register_dataset(
    dataset_name: str,
    *,
    dataset_info: Dict[str, Any] = None,
    classnames: List[str] = None,
    templates: List[Callable] = None,
):
    """
    Registers a new dataset with its associated information.

    This method allows for the dynamic addition of datasets to the internal mapping. It supports
    registration through either a detailed dictionary (`dataset_info`) or separate lists of class names
    and templates. If a dataset with the same name already exists, it will be overwritten.

    The expected format and contents of `dataset_info` can vary depending on the needs of the dataset being registered, but typically, it includes the following keys:

    - "module": A string specifying the module path where the dataset's related classes and functions are located. This is used for dynamic import of the dataset's class names and templates.
    - "classnames": This key is expected to hold the name of the attribute or variable in the specified module that contains a list of class names relevant to the dataset. These class names are used to label data points in the dataset.
    - "templates": Similar to "classnames", this key should specify the name of the attribute or variable in the module that contains a list of template callables. These templates are functions or methods that define how data points should be processed or transformed.

    Args:
        dataset_name (str): The name of the dataset to register.
        dataset_info (Dict[str, Any], optional): A dictionary containing the dataset information, including module path, class names, and templates. Defaults to None.
        classnames (List[str], optional): A list of class names for the dataset. Required if `dataset_info` is not provided. Defaults to None.
        templates (List[Callable], optional): A list of template callables for the dataset. Required if `dataset_info` is not provided. Defaults to None.

    Raises:
        AssertionError: If `dataset_info` is not provided and `classnames` or `templates` is missing.
    """
    assert dataset_info is not None or (
        classnames is not None and templates is not None
    ), "You must provide either `dataset_info` or both `classnames` and `templates`."

    if dataset_name in CLIPTemplateFactory._dataset_mapping:
        warnings.warn(
            f"Dataset {dataset_name} is already registered, overwriting the existing dataset information."
        )
    if dataset_info is None:
        dataset_info = {"classnames": classnames, "temolates": templates}
    CLIPTemplateFactory._dataset_mapping[dataset_name] = dataset_info
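
Both registration styles supported by the code above, with placeholder names and a toy template:

# Register with explicit lists; these are stored directly in the mapping.
CLIPTemplateFactory.register_dataset(
    "my-dataset",
    classnames=["cat", "dog"],
    templates=[lambda c: f"a photo of a {c}."],
)

# Register by pointing at a module that defines `classnames` and `templates`.
CLIPTemplateFactory.register_dataset(
    "my-other-dataset",
    dataset_info={
        "module": ".my_module",  # hypothetical module under fusion_bench.tasks.clip_classification
        "classnames": "classnames",
        "templates": "templates",
    },
)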

get_classnames_and_templates(dataset_name)

A module-level convenience wrapper around CLIPTemplateFactory.get_classnames_and_templates(dataset_name).

Source code in fusion_bench/tasks/clip_classification/__init__.py
def get_classnames_and_templates(dataset_name: str):
    return CLIPTemplateFactory.get_classnames_and_templates(dataset_name)

Flan-T5 Text Generation Tasks

fusion_bench.tasks.flan_t5_text_generation.glue_preprocessors

CoLA_Preprocessor

Bases: DatasetPreprocessor

dataset URL: https://huggingface.co/datasets/glue/viewer/cola

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
class CoLA_Preprocessor(DatasetPreprocessor):
    """
    dataset URL: https://huggingface.co/datasets/glue/viewer/cola
    """

    def preprocess(self, sentence: str, label: int):
        assert isinstance(sentence, str)
        assert isinstance(label, int)
        input_text = self.template["input_text"].format(sentence=sentence)
        if label in [0, 1]:
            target_text = self.template["target_text"][str(label)]
        else:
            target_text = ""
        return input_text, target_text

    def __call__(self, example: Dict[str, Any]):
        """
        Preprocess the CoLA dataset into a text-to-text format.
        """
        if isinstance(example["sentence"], str):
            # not batched
            input_text, target_text = self.preprocess(
                example["sentence"], example["label"]
            )
        else:
            # batched
            input_text, target_text = [], []
            for sentence, label in zip(example["sentence"], example["label"]):
                _input_text, _target_text = self.preprocess(sentence, label)
                input_text.append(_input_text)
                target_text.append(_target_text)

        return preprocess(
            tokenizer=self.tokenizer,
            input_text=input_text,
            target_text=target_text,
            tokenizer_kwawgs=self.tokenizer_kwargs,
        )
__call__(example)

Preprocess the CoLA dataset into a text-to-text format.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
def __call__(self, example: Dict[str, Any]):
    """
    Preprocess the CoLA dataset into a text-to-text format.
    """
    if isinstance(example["sentence"], str):
        # not batched
        input_text, target_text = self.preprocess(
            example["sentence"], example["label"]
        )
    else:
        # batched
        input_text, target_text = [], []
        for sentence, label in zip(example["sentence"], example["label"]):
            _input_text, _target_text = self.preprocess(sentence, label)
            input_text.append(_input_text)
            target_text.append(_target_text)

    return preprocess(
        tokenizer=self.tokenizer,
        input_text=input_text,
        target_text=target_text,
        tokenizer_kwawgs=self.tokenizer_kwargs,
    )
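
Because `__call__` handles both single examples and batches, an instance can be passed directly to `datasets.map`. The constructor arguments below mirror the attributes the source uses (`self.tokenizer`, `self.template`), but the exact `DatasetPreprocessor` signature is not shown on this page, so treat this as a sketch:

from datasets import load_dataset
from transformers import AutoTokenizer

from fusion_bench.tasks.flan_t5_text_generation.glue_prompt_templates import cola

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# Hypothetical constructor call; the real DatasetPreprocessor signature may differ.
preprocessor = CoLA_Preprocessor(tokenizer=tokenizer, template=cola)

dataset = load_dataset("glue", "cola")
tokenized = dataset["train"].map(preprocessor, batched=True)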

MNLI_Preprocessor

Bases: DatasetPreprocessor

dataset URL: https://huggingface.co/datasets/glue/viewer/mnli/

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
class MNLI_Preprocessor(DatasetPreprocessor):
    """
    dataset URL: https://huggingface.co/datasets/glue/viewer/mnli/
    """

    def preprocess(self, hypothesis, premise, label):
        assert isinstance(hypothesis, str)
        assert isinstance(premise, str)
        assert isinstance(label, int)
        input_text = self.template["input_text"].format(
            hypothesis=hypothesis, premise=premise
        )
        if label in [0, 1, 2]:
            target_text = self.template["target_text"][str(label)]
        else:
            target_text = ""
        return input_text, target_text

    def __call__(self, example):
        """
        Preprocess the MNLI dataset into a text-to-text format.
        """
        if isinstance(example["hypothesis"], str):
            # not batched
            input_text, target_text = self.preprocess(
                example["hypothesis"], example["premise"], example["label"]
            )
        else:
            # batched
            input_text, target_text = [], []
            for hypothesis, premise, label in zip(
                example["hypothesis"], example["premise"], example["label"]
            ):
                _input_text, _target_text = self.preprocess(hypothesis, premise, label)
                input_text.append(_input_text)
                target_text.append(_target_text)

        return preprocess(
            tokenizer=self.tokenizer,
            input_text=input_text,
            target_text=target_text,
            tokenizer_kwawgs=self.tokenizer_kwargs,
        )
__call__(example)

Preprocess the MNLI dataset into a text-to-text format.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
def __call__(self, example):
    """
    Preprocess the MNLI dataset into a text-to-text format.
    """
    if isinstance(example["hypothesis"], str):
        # not batched
        input_text, target_text = self.preprocess(
            example["hypothesis"], example["premise"], example["label"]
        )
    else:
        # batched
        input_text, target_text = [], []
        for hypothesis, premise, label in zip(
            example["hypothesis"], example["premise"], example["label"]
        ):
            _input_text, _target_text = self.preprocess(hypothesis, premise, label)
            input_text.append(_input_text)
            target_text.append(_target_text)

    return preprocess(
        tokenizer=self.tokenizer,
        input_text=input_text,
        target_text=target_text,
        tokenizer_kwawgs=self.tokenizer_kwargs,
    )

MRPC_Preprocessor

Bases: DatasetPreprocessor

dataset URL: https://huggingface.co/datasets/glue/viewer/mrpc

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
class MRPC_Preprocessor(DatasetPreprocessor):
    """
    dataset URL: https://huggingface.co/datasets/glue/viewer/mrpc
    """

    def preprocess(self, sentence1: str, sentence2: str, label: int):
        assert isinstance(sentence1, str)
        assert isinstance(sentence2, str)
        assert isinstance(label, int)
        input_text = self.template["input_text"].format(
            sentence1=sentence1, sentence2=sentence2
        )
        if label in [0, 1]:
            target_text = self.template["target_text"][str(label)]
        else:
            target_text = ""
        return input_text, target_text

    def __call__(self, example):
        """
        Preprocess the MRPC dataset into a text-to-text format.
        """
        if isinstance(example["sentence1"], str):
            # not batched
            input_text, target_text = self.preprocess(
                example["sentence1"], example["sentence2"], example["label"]
            )
        else:
            # batched
            input_text, target_text = [], []
            for sentence1, sentence2, label in zip(
                example["sentence1"], example["sentence2"], example["label"]
            ):
                _input_text, _target_text = self.preprocess(sentence1, sentence2, label)
                input_text.append(_input_text)
                target_text.append(_target_text)

        return preprocess(
            tokenizer=self.tokenizer,
            input_text=input_text,
            target_text=target_text,
            tokenizer_kwawgs=self.tokenizer_kwargs,
        )
__call__(example)

Preprocess the MRPC dataset into a text-to-text format.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
def __call__(self, example):
    """
    Preprocess the MRPC dataset into a text-to-text format.
    """
    if isinstance(example["sentence1"], str):
        # not batched
        input_text, target_text = self.preprocess(
            example["sentence1"], example["sentence2"], example["label"]
        )
    else:
        # batched
        input_text, target_text = [], []
        for sentence1, sentence2, label in zip(
            example["sentence1"], example["sentence2"], example["label"]
        ):
            _input_text, _target_text = self.preprocess(sentence1, sentence2, label)
            input_text.append(_input_text)
            target_text.append(_target_text)

    return preprocess(
        tokenizer=self.tokenizer,
        input_text=input_text,
        target_text=target_text,
        tokenizer_kwawgs=self.tokenizer_kwargs,
    )

QNLI_Preprocessor

Bases: DatasetPreprocessor

dataset URL: https://huggingface.co/datasets/glue/viewer/qnli

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
class QNLI_Preprocessor(DatasetPreprocessor):
    """
    dataset URL: https://huggingface.co/datasets/glue/viewer/qnli
    """

    def preprocess(self, question: str, sentence: str, label: int):
        assert isinstance(question, str)
        assert isinstance(sentence, str)
        assert isinstance(label, int)
        input_text = self.template["input_text"].format(
            question=question, sentence=sentence
        )
        if label in [0, 1]:
            target_text = self.template["target_text"][str(label)]
        else:
            target_text = ""
        return input_text, target_text

    def __call__(self, example):
        """
        Preprocess the QNLI dataset into a text-to-text format.
        """
        if isinstance(example["question"], str):
            # not batched
            input_text, target_text = self.preprocess(
                example["question"], example["sentence"], example["label"]
            )
        else:
            # batched
            input_text, target_text = [], []
            for question, sentence, label in zip(
                example["question"], example["sentence"], example["label"]
            ):
                _input_text, _target_text = self.preprocess(question, sentence, label)
                input_text.append(_input_text)
                target_text.append(_target_text)

        return preprocess(
            tokenizer=self.tokenizer,
            input_text=input_text,
            target_text=target_text,
            tokenizer_kwawgs=self.tokenizer_kwargs,
        )
__call__(example)

Preprocess the QNLI dataset into a text-to-text format.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
def __call__(self, example):
    """
    Preprocess the QNLI dataset into a text-to-text format.
    """
    if isinstance(example["question"], str):
        # not batched
        input_text, target_text = self.preprocess(
            example["question"], example["sentence"], example["label"]
        )
    else:
        # batched
        input_text, target_text = [], []
        for question, sentence, label in zip(
            example["question"], example["sentence"], example["label"]
        ):
            _input_text, _target_text = self.preprocess(question, sentence, label)
            input_text.append(_input_text)
            target_text.append(_target_text)

    return preprocess(
        tokenizer=self.tokenizer,
        input_text=input_text,
        target_text=target_text,
        tokenizer_kwawgs=self.tokenizer_kwargs,
    )

QQP_Preprocessor

Bases: DatasetPreprocessor

dataset URL: https://huggingface.co/datasets/glue/viewer/qqp

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
class QQP_Preprocessor(DatasetPreprocessor):
    """
    dataset URL: https://huggingface.co/datasets/glue/viewer/qqp
    """

    def preprocess(self, question1, question2, label):
        assert isinstance(
            question1, str
        ), f"question1 must be a string, got {type(question1)}, question1={question1}"
        assert isinstance(
            question2, str
        ), f"question2 must be a string, got {type(question2)}, question2={question2}"
        assert isinstance(
            label, int
        ), f"label must be an int, got {type(label)}, label={label}"
        input_text: str = self.template["input_text"].format(
            question1=question1, question2=question2
        )
        if label in [0, 1]:
            target_text: str = self.template["target_text"][str(label)]
        else:
            target_text = ""
        return input_text, target_text

    def __call__(self, example):
        """
        Preprocess the QQP dataset into a text-to-text format.
        """
        if isinstance(example["question1"], str):
            # not batched
            input_text, target_text = self.preprocess(
                example["question1"], example["question2"], example["label"]
            )
        else:
            # batched
            input_text, target_text = [], []
            for question1, question2, label in zip(
                example["question1"], example["question2"], example["label"]
            ):
                _input_text, _target_text = self.preprocess(question1, question2, label)
                input_text.append(_input_text)
                target_text.append(_target_text)

        return preprocess(
            tokenizer=self.tokenizer,
            input_text=input_text,
            target_text=target_text,
            tokenizer_kwawgs=self.tokenizer_kwargs,
        )
__call__(example)

Preprocess the QQP dataset into a text-to-text format.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
def __call__(self, example):
    """
    Preprocess the QQP dataset into a text-to-text format.
    """
    if isinstance(example["question1"], str):
        # not batched
        input_text, target_text = self.preprocess(
            example["question1"], example["question2"], example["label"]
        )
    else:
        # batched
        input_text, target_text = [], []
        for question1, question2, label in zip(
            example["question1"], example["question2"], example["label"]
        ):
            _input_text, _target_text = self.preprocess(question1, question2, label)
            input_text.append(_input_text)
            target_text.append(_target_text)

    return preprocess(
        tokenizer=self.tokenizer,
        input_text=input_text,
        target_text=target_text,
        tokenizer_kwawgs=self.tokenizer_kwargs,
    )

RTE_Preprocessor

Bases: DatasetPreprocessor

dataset URL: https://huggingface.co/datasets/glue/viewer/rte

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
class RTE_Preprocessor(DatasetPreprocessor):
    """
    dataset URL: https://huggingface.co/datasets/glue/viewer/rte
    """

    def preprocess(self, sentence1, sentence2, label):
        assert isinstance(sentence1, str)
        assert isinstance(sentence2, str)
        assert isinstance(label, int)

        input_text: str = self.template["input_text"].format(
            sentence1=sentence1, sentence2=sentence2
        )
        if label in [0, 1]:
            target_text: str = self.template["target_text"][str(label)]
        else:
            target_text = ""
        return input_text, target_text

    def __call__(self, example):
        """
        Preprocess the RTE dataset into a text-to-text format.
        """
        if isinstance(example["sentence1"], str):
            # not batched
            input_text, target_text = self.preprocess(
                example["sentence1"], example["sentence2"], example["label"]
            )
        else:
            # batched
            input_text, target_text = [], []
            for sentence1, sentence2, label in zip(
                example["sentence1"], example["sentence2"], example["label"]
            ):
                _input_text, _target_text = self.preprocess(sentence1, sentence2, label)
                input_text.append(_input_text)
                target_text.append(_target_text)

        return preprocess(
            tokenizer=self.tokenizer,
            input_text=input_text,
            target_text=target_text,
            tokenizer_kwawgs=self.tokenizer_kwargs,
        )
__call__(example)

Preprocess the RTE dataset into a text-to-text format.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
def __call__(self, example):
    """
    Preprocess the RTE dataset into a text-to-text format.
    """
    if isinstance(example["sentence1"], str):
        # not batched
        input_text, target_text = self.preprocess(
            example["sentence1"], example["sentence2"], example["label"]
        )
    else:
        # batched
        input_text, target_text = [], []
        for sentence1, sentence2, label in zip(
            example["sentence1"], example["sentence2"], example["label"]
        ):
            _input_text, _target_text = self.preprocess(sentence1, sentence2, label)
            input_text.append(_input_text)
            target_text.append(_target_text)

    return preprocess(
        tokenizer=self.tokenizer,
        input_text=input_text,
        target_text=target_text,
        tokenizer_kwawgs=self.tokenizer_kwargs,
    )

SST2_Preprocessor

Bases: DatasetPreprocessor

dataset URL: https://huggingface.co/datasets/glue/viewer/sst2

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
class SST2_Preprocessor(DatasetPreprocessor):
    """
    dataset URL: https://huggingface.co/datasets/glue/viewer/sst2
    """

    def preprocess(self, sentence: str, label: int):
        assert isinstance(
            sentence, str
        ), f"sentence must be a string, got {type(sentence)}, sentence={sentence}"
        assert isinstance(
            label, int
        ), f"label must be an integer, got {type(label)}, label={label}"
        input_text = self.template["input_text"].format(sentence=sentence)
        if label in [0, 1]:
            target_text = self.template["target_text"][str(label)]
        else:
            target_text = ""
        return input_text, target_text

    def __call__(self, example):
        """
        Preprocess the SST2 dataset into a text-to-text format.
        """
        if isinstance(example["sentence"], str):
            # not batched
            input_text, target_text = self.preprocess(
                example["sentence"], example["label"]
            )
        else:
            # batched
            input_text, target_text = [], []
            for sentence, label in zip(example["sentence"], example["label"]):
                _input_text, _target_text = self.preprocess(sentence, label)
                input_text.append(_input_text)
                target_text.append(_target_text)

        return preprocess(
            tokenizer=self.tokenizer,
            input_text=input_text,
            target_text=target_text,
            tokenizer_kwawgs=self.tokenizer_kwargs,
        )
__call__(example)

Preprocess the SST2 dataset into a text-to-text format.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
def __call__(self, example):
    """
    Preprocess the SST2 dataset into a text-to-text format.
    """
    if isinstance(example["sentence"], str):
        # not batched
        input_text, target_text = self.preprocess(
            example["sentence"], example["label"]
        )
    else:
        # batched
        input_text, target_text = [], []
        for sentence, label in zip(example["sentence"], example["label"]):
            _input_text, _target_text = self.preprocess(sentence, label)
            input_text.append(_input_text)
            target_text.append(_target_text)

    return preprocess(
        tokenizer=self.tokenizer,
        input_text=input_text,
        target_text=target_text,
        tokenizer_kwawgs=self.tokenizer_kwargs,
    )

STSB_Preprocessor

Bases: DatasetPreprocessor

dataset URL: https://huggingface.co/datasets/glue/viewer/stsb

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
class STSB_Preprocessor(DatasetPreprocessor):
    """
    dataset URL: https://huggingface.co/datasets/glue/viewer/stsb
    """

    def preprocess(self, sentence1, sentence2, label):
        assert isinstance(
            sentence1, str
        ), f"sentence1 must be a string, got {type(sentence1)}, sentence1={sentence1}"
        assert isinstance(
            sentence2, str
        ), f"sentence2 must be a string, got {type(sentence2)}, sentence2={sentence2}"
        assert isinstance(
            label, (float, int)
        ), f"label must be a float or an integer, got {type(label)}, label={label}"
        input_text = self.template["input_text"].format(
            sentence1=sentence1, sentence2=sentence2
        )
        target_text = self.template["target_text"].format(label)
        return input_text, target_text

    def __call__(self, example):
        """
        Preprocess the STSB dataset into a text-to-text format.
        """
        if isinstance(example["sentence1"], str):
            # not batched
            input_text, target_text = self.preprocess(
                example["sentence1"], example["sentence2"], example["label"]
            )
        else:
            # batched
            input_text, target_text = [], []
            for sentence1, sentence2, label in zip(
                example["sentence1"], example["sentence2"], example["label"]
            ):
                _input_text, _target_text = self.preprocess(sentence1, sentence2, label)
                input_text.append(_input_text)
                target_text.append(_target_text)

        return preprocess(
            tokenizer=self.tokenizer,
            input_text=input_text,
            target_text=target_text,
            tokenizer_kwawgs=self.tokenizer_kwargs,
        )
__call__(example)

Preprocess the STSB dataset into a text-to-text format.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_preprocessors.py
def __call__(self, example):
    """
    Preprocess the STSB dataset into a text-to-text format.
    """
    if isinstance(example["sentence1"], str):
        # not batched
        input_text, target_text = self.preprocess(
            example["sentence1"], example["sentence2"], example["label"]
        )
    else:
        # batched
        input_text, target_text = [], []
        for sentence1, sentence2, label in zip(
            example["sentence1"], example["sentence2"], example["label"]
        ):
            _input_text, _target_text = self.preprocess(sentence1, sentence2, label)
            input_text.append(_input_text)
            target_text.append(_target_text)

    return preprocess(
        tokenizer=self.tokenizer,
        input_text=input_text,
        target_text=target_text,
        tokenizer_kwawgs=self.tokenizer_kwargs,
    )

fusion_bench.tasks.flan_t5_text_generation.glue_load_dataset

load_glue_dataset(name, tokenizer, cache_dir='outputs/cache', split=None)

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_load_dataset.py
def load_glue_dataset(
    name,
    tokenizer,
    cache_dir: Optional[str] = "outputs/cache",
    split: Optional[str] = None,
):
    with timeit_context(f"Loading {name} dataset"):
        if cache_dir is not None:
            if not os.path.exists(cache_dir):
                os.makedirs(cache_dir)
            cache_path = os.path.join(
                cache_dir, "flan-t5", f"_load_{name}_dataset_cached"
            )
            if os.path.exists(cache_path):
                dataset = load_from_disk(cache_path)
            else:
                dataset = _load_glue_dataset(name, tokenizer)
                log.info(f"Saving {name} dataset to {cache_path}")
                dataset.save_to_disk(cache_path)
        else:
            dataset = _load_glue_dataset(name, tokenizer)

    if split is not None:
        return dataset[split]
    else:
        return dataset
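
Usage follows directly from the signature; the first call tokenizes the dataset and caches it under `outputs/cache/flan-t5/`, and later calls load it from disk. The checkpoint name is illustrative, and split names follow the underlying GLUE configs (e.g. "train", "validation"):

from transformers import AutoTokenizer
from fusion_bench.tasks.flan_t5_text_generation.glue_load_dataset import load_glue_dataset

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
val_dataset = load_glue_dataset("sst2", tokenizer, split="validation")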

fusion_bench.tasks.flan_t5_text_generation.glue_evaluation

evaluate_accuracy(model, val_loader, tokenizer)

This function evaluates the accuracy of a language model on a validation set.

Parameters:

  • model (Module) –

    The language model to be evaluated.

  • val_loader (DataLoader) –

    The DataLoader object containing the validation data.

  • tokenizer (Tokenizer) –

    The tokenizer object used for tokenizing text.

Returns:

  • float

    The accuracy of the model on the validation set.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_evaluation.py
def evaluate_accuracy(model, val_loader: DataLoader, tokenizer):
    """
    This function evaluates the accuracy of a language model on a validation set.

    Parameters:
        model (nn.Module): The language model to be evaluated.
        val_loader (DataLoader): The DataLoader object containing the validation data.
        tokenizer (Tokenizer): The tokenizer object used for tokenizing text.

    Returns:
        float: The accuracy of the model on the validation set.
    """
    from tqdm import tqdm

    correct = 0
    total = 0

    model = model.eval()
    for batch_idx, batch in enumerate(
        tqdm(
            val_loader, desc="Evaluate Exact Accuracy", leave=False, dynamic_ncols=True
        )
    ):
        with torch.no_grad():
            outputs = model.generate(batch["input_ids"], max_length=10)
            output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            labels = [
                remove_special_tokens(tokenizer, label_token)
                for label_token in batch["labels"]
            ]
            labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            # compare output_text and labels
            for i, j in zip(output_text, labels):
                if i == j:
                    correct += 1
                total += 1

    # return accuracy
    return correct / total
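
A sketch of wiring the evaluation together; it assumes the loader yields batches with padded "input_ids" and "labels" tensors on the model's device, and `tokenized_val_dataset` is a hypothetical dataset produced by the preprocessors above:

from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
collator = DataCollatorForSeq2Seq(tokenizer, model=model)  # pads input_ids and labels
val_loader = DataLoader(tokenized_val_dataset, batch_size=32, collate_fn=collator)

accuracy = evaluate_accuracy(model, val_loader, tokenizer)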

evaluate_spearman_rho(model, val_loader, tokenizer)

This function evaluates the Spearman's rank correlation coefficient (rho) between the model's predictions and the actual labels on a validation set.

Parameters:

  • model (Module) –

    The language model to be evaluated.

  • val_loader (DataLoader) –

    The DataLoader object containing the validation data.

  • tokenizer (Tokenizer) –

    The tokenizer object used for tokenizing text.

Returns:

  • float

    The Spearman's rho between the model's predictions and the actual labels.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_evaluation.py
def evaluate_spearman_rho(model, val_loader: DataLoader, tokenizer):
    """
    This function evaluates the Spearman's rank correlation coefficient (rho) between the model's predictions and the actual labels on a validation set.

    Parameters:
        model (nn.Module): The language model to be evaluated.
        val_loader (DataLoader): The DataLoader object containing the validation data.
        tokenizer (Tokenizer): The tokenizer object used for tokenizing text.

    Returns:
        float: The Spearman's rho between the model's predictions and the actual labels.
    """
    from tqdm import tqdm

    model = model.eval()
    all_preds: List[str] = []
    all_labels: List[str] = []
    for batch_idx, batch in enumerate(
        tqdm(val_loader, desc="Evaluate Spearman Rho", leave=False, dynamic_ncols=True)
    ):
        with torch.no_grad():
            outputs = model.generate(batch["input_ids"], max_length=10)
            output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            labels = [
                remove_special_tokens(tokenizer, label_token)
                for label_token in batch["labels"]
            ]
            labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            all_preds.extend(output_text)
            all_labels.extend(labels)

    # calculate spearman's rho
    # 1. convert string list `all_preds` and `all_labels` to numpy array
    # 2. compute spearman's rho
    from scipy.stats import spearmanr

    def parse_float(s: str):
        try:
            return float(s)
        except Exception:
            return 0.0

    all_preds = np.array([parse_float(pred) for pred in all_preds])
    all_labels = np.array([parse_float(label) for label in all_labels])
    rho = spearmanr(all_preds, all_labels)[0]
    return rho
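
For reference, `scipy.stats.spearmanr` returns the Pearson correlation of the rank-transformed values; for n observations with no ties this reduces to

\rho = 1 - \frac{6 \sum_{i=1}^{n} d_i^2}{n(n^2 - 1)}

where d_i is the difference between the rank of the i-th prediction and the rank of the i-th label.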

remove_special_tokens(tokenizer, token_list)

This function removes special tokens from a list of tokens. It also stops processing when it encounters a token with a value of -100.

Parameters:

  • tokenizer (Tokenizer) –

    The tokenizer object used for tokenizing text.

  • token_list (list) –

    The list of tokens to be processed.

Returns:

  • list

    The list of tokens after removing special tokens.

Source code in fusion_bench/tasks/flan_t5_text_generation/glue_evaluation.py
def remove_special_tokens(tokenizer, token_list: list):
    """
    This function removes special tokens from a list of tokens. It also stops processing
    when it encounters a token with a value of -100.

    Parameters:
        tokenizer (Tokenizer): The tokenizer object used for tokenizing text.
        token_list (list): The list of tokens to be processed.

    Returns:
        list: The list of tokens after removing special tokens.
    """
    ret = []
    for token in token_list:
        if token not in tokenizer.all_special_ids and token > 0:
            ret.append(token)
        if token == -100:
            break
    return ret
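
A concrete illustration of the control flow; the token ids and the contents of `tokenizer.all_special_ids` are made up for the example:

# Suppose tokenizer.all_special_ids == [0, 1] (pad and eos for T5-style tokenizers).
remove_special_tokens(tokenizer, [465, 2163, 1, -100, -100])
# -> [465, 2163]: the eos id (1) is filtered out, and the loop stops at the first -100.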

fusion_bench.tasks.flan_t5_text_generation.glue_prompt_templates

glue_prompt_templates = {'cola': cola, 'mnli': mnli, 'mrpc': mrpc, 'qnli': qnli, 'qqp': qqp, 'rte': rte, 'stsb': stsb, 'sst2': sst2} module-attribute

cola = {'description': 'template used by GLUE-CoLA', 'input_text': 'Indicate if the following sentence is grammatically correct or not: "{sentence}". Answer \'acceptable\' or \'unacceptable\'.', 'target_text': {'0': 'unacceptable', '1': 'acceptable'}} module-attribute

mnli = {'input_text': "Does the premise: '{premise}' logically imply, contradict, or is neutral to the hypothesis: '{hypothesis}'? Answer with 'entailment', 'contradiction', or 'neutral'.", 'target_text': {'0': 'entailment', '1': 'neutral', '2': 'contradiction'}} module-attribute

mrpc = {'input_text': "Are the following sentences '{sentence1}' and '{sentence2}' conveying the same meaning? Answer with 'yes' or 'no'.", 'target_text': {'0': 'no', '1': 'yes'}} module-attribute

qnli = {'input_text': "Given the context: '{sentence}', does the question '{question}' have an answer based on the information provided? Answer with 'yes' or 'no'.", 'target_text': {'0': 'yes', '1': 'no'}} module-attribute

qqp = {'input_text': "Do the questions '{question1}' and '{question2}' have the same intent? Answer with 'yes' or 'no'.", 'target_text': {'0': 'no', '1': 'yes'}} module-attribute

rte = {'description': 'Template used by GLUE-RTE', 'input_text': "Does the text: '{sentence1}' entail that '{sentence2}' is true? Provide 'yes' or 'no'.", 'target_text': {'0': 'yes', '1': 'no'}} module-attribute

stsb = {'input_text': "Consider the sentences '{sentence1}' and '{sentence2}'. On a scale from 1 (completely different) to 5 (completely similar), rate the similarity.", 'target_text': '{:.1f}'} module-attribute

sst2 = {'input_text': "Given the sentence '{sentence}', determine the sentiment. Is it positive or negative?", 'target_text': {'0': 'negative', '1': 'positive'}} module-attribute
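
These templates plug into the preprocessors above via `str.format` on `input_text` and a string-keyed label lookup on `target_text`, e.g. for SST-2:

from fusion_bench.tasks.flan_t5_text_generation.glue_prompt_templates import sst2

input_text = sst2["input_text"].format(sentence="A touching and well-acted film.")
target_text = sst2["target_text"]["1"]  # "positive"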