107 lines
3.7 KiB
Python
107 lines
3.7 KiB
Python
"""Tecator dataset for classification.
|
|
|
|
URL:
|
|
http://lib.stat.cmu.edu/datasets/tecator
|
|
|
|
LICENCE / TERMS / COPYRIGHT:
|
|
This is the Tecator data set: The task is to predict the fat content
|
|
of a meat sample on the basis of its near infrared absorbance spectrum.
|
|
-------------------------------------------------------------------------
|
|
1. Statement of permission from Tecator (the original data source)
|
|
|
|
These data are recorded on a Tecator Infratec Food and Feed Analyzer
|
|
working in the wavelength range 850 - 1050 nm by the Near Infrared
|
|
Transmission (NIT) principle. Each sample contains finely chopped pure
|
|
meat with different moisture, fat and protein contents.
|
|
|
|
If results from these data are used in a publication we want you to
|
|
mention the instrument and company name (Tecator) in the publication.
|
|
In addition, please send a preprint of your article to
|
|
|
|
Karin Thente, Tecator AB,
|
|
Box 70, S-263 21 Hoganas, Sweden
|
|
|
|
The data are available in the public domain with no responsability from
|
|
the original data source. The data can be redistributed as long as this
|
|
permission note is attached.
|
|
|
|
For more information about the instrument - call Perstorp Analytical's
|
|
representative in your area.
|
|
|
|
Description:
|
|
For each meat sample the data consists of a 100 channel spectrum of
|
|
absorbances and the contents of moisture (water), fat and protein.
|
|
The absorbance is -log10 of the transmittance
|
|
measured by the spectrometer. The three contents, measured in percent,
|
|
are determined by analytic chemistry.
|
|
"""
|
|
|
|
import os
|
|
|
|
import numpy as np
|
|
import torch
|
|
from torchvision.datasets.utils import download_file_from_google_drive
|
|
|
|
from prototorch.datasets.abstract import ProtoDataset
|
|
|
|
|
|
class Tecator(ProtoDataset):
|
|
"""
|
|
`Tecator Dataset <http://lib.stat.cmu.edu/datasets/tecator>`__
|
|
for classification.
|
|
"""
|
|
|
|
_resources = [
|
|
("1P9WIYnyxFPh6f1vqAbnKfK8oYmUgyV83",
|
|
"ba5607c580d0f91bb27dc29d13c2f8df"),
|
|
] # (google_storage_id, md5hash)
|
|
classes = ["0 - low_fat", "1 - high_fat"]
|
|
|
|
def __getitem__(self, index):
|
|
img, target = self.data[index], int(self.targets[index])
|
|
return img, target
|
|
|
|
def _download(self):
|
|
"""Download the data if it doesn't exist in already."""
|
|
if self._check_exists():
|
|
return
|
|
|
|
if self.verbose:
|
|
print("Making directories...")
|
|
os.makedirs(self.raw_folder, exist_ok=True)
|
|
os.makedirs(self.processed_folder, exist_ok=True)
|
|
|
|
if self.verbose:
|
|
print("Downloading...")
|
|
for fileid, md5 in self._resources:
|
|
filename = "tecator.npz"
|
|
download_file_from_google_drive(fileid,
|
|
root=self.raw_folder,
|
|
filename=filename,
|
|
md5=md5)
|
|
|
|
if self.verbose:
|
|
print("Processing...")
|
|
with np.load(os.path.join(self.raw_folder, "tecator.npz"),
|
|
allow_pickle=False) as f:
|
|
x_train, y_train = f["x_train"], f["y_train"]
|
|
x_test, y_test = f["x_test"], f["y_test"]
|
|
training_set = [
|
|
torch.tensor(x_train, dtype=torch.float32),
|
|
torch.tensor(y_train),
|
|
]
|
|
test_set = [
|
|
torch.tensor(x_test, dtype=torch.float32),
|
|
torch.tensor(y_test),
|
|
]
|
|
|
|
with open(os.path.join(self.processed_folder, self.training_file),
|
|
"wb") as f:
|
|
torch.save(training_set, f)
|
|
with open(os.path.join(self.processed_folder, self.test_file),
|
|
"wb") as f:
|
|
torch.save(test_set, f)
|
|
|
|
if self.verbose:
|
|
print("Done!")
|