103 lines
3.6 KiB
Python
103 lines
3.6 KiB
Python
"""Tecator dataset for classification.
|
|
|
|
URL:
|
|
http://lib.stat.cmu.edu/datasets/tecator
|
|
|
|
LICENCE / TERMS / COPYRIGHT:
|
|
This is the Tecator data set: The task is to predict the fat content
|
|
of a meat sample on the basis of its near infrared absorbance spectrum.
|
|
-------------------------------------------------------------------------
|
|
1. Statement of permission from Tecator (the original data source)
|
|
|
|
These data are recorded on a Tecator Infratec Food and Feed Analyzer
|
|
working in the wavelength range 850 - 1050 nm by the Near Infrared
|
|
Transmission (NIT) principle. Each sample contains finely chopped pure
|
|
meat with different moisture, fat and protein contents.
|
|
|
|
If results from these data are used in a publication we want you to
|
|
mention the instrument and company name (Tecator) in the publication.
|
|
In addition, please send a preprint of your article to
|
|
|
|
Karin Thente, Tecator AB,
|
|
Box 70, S-263 21 Hoganas, Sweden
|
|
|
|
The data are available in the public domain with no responsability from
|
|
the original data source. The data can be redistributed as long as this
|
|
permission note is attached.
|
|
|
|
For more information about the instrument - call Perstorp Analytical's
|
|
representative in your area.
|
|
|
|
Description:
|
|
For each meat sample the data consists of a 100 channel spectrum of
|
|
absorbances and the contents of moisture (water), fat and protein.
|
|
The absorbance is -log10 of the transmittance
|
|
measured by the spectrometer. The three contents, measured in percent,
|
|
are determined by analytic chemistry.
|
|
"""
|
|
|
|
import os
|
|
|
|
import numpy as np
|
|
import torch
|
|
from torchvision.datasets.utils import download_file_from_google_drive
|
|
|
|
from prototorch.datasets.abstract import ProtoDataset
|
|
|
|
|
|
class Tecator(ProtoDataset):
    """Tecator spectra packaged as a two-class dataset.

    The class labels are listed in ``classes`` (low fat vs. high fat).
    Storage locations, file names and the ``data`` / ``targets`` /
    ``verbose`` attributes are expected to come from ``ProtoDataset``
    (not visible here -- confirm against the base class).
    """

    # One entry per remote file: (google_storage_id, md5hash)
    resources = [
        ('1MMuUK8V41IgNpnPDbg3E-QAL6wlErTk0',
         'ba5607c580d0f91bb27dc29d13c2f8df'),
    ]
    classes = ['0 - low_fat', '1 - high_fat']

    def __getitem__(self, index):
        """Return ``(sample, label)`` for ``index``; the label is cast to int."""
        sample = self.data[index]
        label = int(self.targets[index])
        return sample, label

    def download(self):
        """Download the raw archive and write the processed train/test files.

        Returns immediately when ``self._check_exists()`` is truthy.
        Otherwise the raw and processed folders are created, every entry of
        ``resources`` is fetched from Google Drive into the raw folder as
        ``tecator.npz``, and the arrays ``x_train`` / ``y_train`` /
        ``x_test`` / ``y_test`` stored in that archive are converted to
        tensors and saved with ``torch.save`` as ``[inputs, targets]`` lists
        under ``self.training_file`` and ``self.test_file``.
        """
        if self._check_exists():
            return

        def announce(message):
            # Progress output is only emitted in verbose mode.
            if self.verbose:
                print(message)

        def as_tensor_pair(inputs, labels):
            # Inputs are stored as float32; labels keep the dtype numpy gave them.
            return [
                torch.tensor(inputs, dtype=torch.float32),
                torch.tensor(labels),
            ]

        announce('Making directories...')
        for folder in (self.raw_folder, self.processed_folder):
            os.makedirs(folder, exist_ok=True)

        announce('Downloading...')
        archive_name = 'tecator.npz'
        for drive_id, checksum in self.resources:
            download_file_from_google_drive(drive_id,
                                            root=self.raw_folder,
                                            filename=archive_name,
                                            md5=checksum)

        announce('Processing...')
        archive_path = os.path.join(self.raw_folder, 'tecator.npz')
        with np.load(archive_path, allow_pickle=False) as archive:
            x_train = archive['x_train']
            y_train = archive['y_train']
            x_test = archive['x_test']
            y_test = archive['y_test']

        training_set = as_tensor_pair(x_train, y_train)
        test_set = as_tensor_pair(x_test, y_test)

        # Training split is written first, then the test split.
        outputs = (
            (self.training_file, training_set),
            (self.test_file, test_set),
        )
        for target_name, payload in outputs:
            target_path = os.path.join(self.processed_folder, target_name)
            with open(target_path, 'wb') as handle:
                torch.save(payload, handle)

        announce('Done!')
|