Add prototorch/datasets

2020-04-14 19:47:34 +02:00
parent 4158586cb9
commit a22c752342
3 changed files with 181 additions and 0 deletions
--- a/prototorch/datasets/init.py
+++ b/prototorch/datasets/init.py
--- a/prototorch/datasets/abstract.py
+++ b/prototorch/datasets/abstract.py
@@ -0,0 +1,87 @@
 """ProtoTorch abstract datasets
 Based on `torchvision.VisionDataset` and `torchvision.MNIST`
 For the original code, see:
 https://github.com/pytorch/vision/blob/master/torchvision/datasets/vision.py
 https://github.com/pytorch/vision/blob/master/torchvision/datasets/mnist.py
 """
 import os
 import torch
 class Dataset(torch.utils.data.Dataset):
    """Abstract dataset class to be inherited"""
    _repr_indent = 2
    def __init__(self, root):
        if isinstance(root, torch._six.string_classes):
            root = os.path.expanduser(root)
        self.root = root
    def __getitem__(self, index):
        raise NotImplementedError
    def __len__(self):
        raise NotImplementedError
 class ProtoDataset(Dataset):
    """Abstract dataset class to be inherited"""
    training_file = 'training.pt'
    test_file = 'test.pt'
    def __init__(self, root, train=True, download=True, verbose=True):
        super().__init__(root)
        self.train = train  # training set or test set
        self.verbose = verbose
        if download:
            self.download()
        if not self._check_exists():
            raise RuntimeError('Dataset not found. '
                               'You can use download=True to download it')
        data_file = self.training_file if self.train else self.test_file
        self.data, self.targets = torch.load(
            os.path.join(self.processed_folder, data_file))
    @property
    def raw_folder(self):
        return os.path.join(self.root, self.__class__.__name__, 'raw')
    @property
    def processed_folder(self):
        return os.path.join(self.root, self.__class__.__name__, 'processed')
    @property
    def class_to_idx(self):
        return {_class: i for i, _class in enumerate(self.classes)}
    def _check_exists(self):
        return (os.path.exists(
            os.path.join(self.processed_folder, self.training_file))
                and os.path.exists(
                    os.path.join(self.processed_folder, self.test_file)))
    def __repr__(self):
        head = 'Dataset ' + self.__class__.__name__
        body = ['Number of datapoints: {}'.format(self.__len__())]
        if self.root is not None:
            body.append('Root location: {}'.format(self.root))
        body += self.extra_repr().splitlines()
        lines = [head] + [' ' * self._repr_indent + line for line in body]
        return '\n'.join(lines)
    def extra_repr(self):
        return f"Split: {'Train' if self.train is True else 'Test'}"
    def __len__(self):
        return len(self.data)
    def download(self):
        raise NotImplementedError
--- a/prototorch/datasets/tecator.py
+++ b/prototorch/datasets/tecator.py
@@ -0,0 +1,94 @@
 """Tecator dataset for classification
 URL:
    http://lib.stat.cmu.edu/datasets/tecator
 LICENCE / TERMS / COPYRIGHT:
    This is the Tecator data set: The task is to predict the fat content
    of a meat sample on the basis of its near infrared absorbance spectrum.
    -------------------------------------------------------------------------
    1. Statement of permission from Tecator (the original data source)
    These data are recorded on a Tecator Infratec Food and Feed Analyzer
    working in the wavelength range 850 - 1050 nm by the Near Infrared
    Transmission (NIT) principle. Each sample contains finely chopped pure
    meat with different moisture, fat and protein contents.
    If results from these data are used in a publication we want you to
    mention the instrument and company name (Tecator) in the publication.
    In addition, please send a preprint of your article to
        Karin Thente, Tecator AB,
        Box 70, S-263 21 Hoganas, Sweden
    The data are available in the public domain with no responsability from
    the original data source. The data can be redistributed as long as this
    permission note is attached.
    For more information about the instrument - call Perstorp Analytical's
    representative in your area.
 Description:
    For each meat sample the data consists of a 100 channel spectrum of
    absorbances and the contents of moisture (water), fat and protein.
    The absorbance is -log10 of the transmittance
    measured by the spectrometer. The three contents, measured in percent,
    are determined by analytic chemistry.
 """
 import os
 import numpy as np
 import torch
 from torchvision.datasets.utils import download_file_from_google_drive
 from prototorch.datasets.abstract import ProtoDataset
 class Tecator(ProtoDataset):
    """Tecator dataset for classification"""
    resources = [('1MMuUK8V41IgNpnPDbg3E-QAL6wlErTk0',
                  'ba5607c580d0f91bb27dc29d13c2f8df')]
    classes = ['0 - low_fat', '1 - high_fat']
    def __getitem__(self, index):
        img, target = self.data[index], int(self.targets[index])
        return img, target
    def download(self):
        """Download the data if it doesn't exist in already."""
        if self._check_exists():
            return
        if self.verbose:
            print('Making directories...')
        os.makedirs(self.raw_folder, exist_ok=True)
        os.makedirs(self.processed_folder, exist_ok=True)
        if self.verbose:
            print('Downloading...')
        for fileid, md5 in self.resources:
            filename = 'tecator.npz'
            download_file_from_google_drive(fileid,
                                            root=self.raw_folder,
                                            filename=filename,
                                            md5=md5)
        if self.verbose:
            print('Processing...')
        with np.load(os.path.join(self.raw_folder, 'tecator.npz'),
                     allow_pickle=False) as f:
            x_train, y_train = f['x_train'], f['y_train']
            x_test, y_test = f['x_test'], f['y_test']
        training_set = [torch.as_tensor(x_train), torch.as_tensor(y_train)]
        test_set = [torch.as_tensor(x_test), torch.as_tensor(y_test)]
        with open(os.path.join(self.processed_folder, self.training_file),
                  'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.processed_folder, self.test_file),
                  'wb') as f:
            torch.save(test_set, f)
        if self.verbose:
            print('Done!')