Skip to content

Feature extraction

FeatureExtractor

Source code in autorad/feature_extraction/extractor.py
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
class FeatureExtractor:
    """
    Extract features for every case of an ImageDataset.

    The extraction parameter file is resolved once at construction time.
    Each call to `run` extracts the features, records the configuration
    that was used as an MLflow run and tags the result with that run's ID.
    """

    def __init__(
        self,
        dataset: ImageDataset,
        feature_set: str = "pyradiomics",
        extraction_params: PathLike = "CT_Baessler.yaml",
        n_jobs: int | None = None,
    ):
        """
        Args:
            dataset: ImageDataset containing image paths, mask paths, and IDs
            feature_set: library to use features from (for now only pyradiomics)
            extraction_params: path to the YAML file containing the extraction
                parameters, or a string containing the name of the file in the
                default extraction parameter directory (config.PARAM_DIR)
            n_jobs: number of parallel jobs to run (passed through
                utils.set_n_jobs)
        Raises:
            ValueError: if the extraction parameter file cannot be found or
                `feature_set` is not supported.
        """
        self.dataset = dataset
        self.feature_set = feature_set
        self.extraction_params = self._get_extraction_param_path(
            extraction_params
        )
        log.info(f"Using extraction params from {self.extraction_params}")
        self.n_jobs = utils.set_n_jobs(n_jobs)
        self._initialize_extractor()

    def _get_extraction_param_path(self, extraction_params: PathLike) -> str:
        """
        Resolve `extraction_params` to the path of an existing file.

        The value is first tried as a path on its own, then as a file name
        inside config.PARAM_DIR.

        Raises:
            ValueError: if neither candidate is an existing file.
        """
        candidates = (
            Path(extraction_params),
            Path(config.PARAM_DIR) / str(extraction_params),
        )
        for candidate in candidates:
            if candidate.is_file():
                return str(candidate)
        raise ValueError(
            f"Extraction parameter file {extraction_params} not found."
        )

    def run(
        self, keep_metadata: bool = True, mask_label: int | None = None
    ) -> pd.DataFrame:
        """
        Run feature extraction.
        Args:
            keep_metadata: merge extracted features with data from the
                ImageDataset.df.
            mask_label: label in the mask to extract features from.
                For default value of None, the `label` value from extraction
                param file is used. Set this when you have multiple labels in your mask
        Returns:
            DataFrame containing extracted features
        Raises:
            ValueError: if no features could be extracted, or if merging the
                features with the dataset metadata fails.
        """
        log.info("Extracting features")
        if self.n_jobs is None or self.n_jobs == 1:
            feature_df = self.get_features(mask_label=mask_label)
        else:
            feature_df = self.get_features_parallel(mask_label=mask_label)
        if feature_df.empty:
            raise ValueError(
                "No features extracted. Check the logs and your dataset."
            )

        ID_colname = self.dataset.ID_colname
        # move ID column to front
        feature_df = feature_df.set_index(ID_colname).reset_index()

        run_id = self.save_config(mask_label=mask_label)

        # add ID for this extraction run
        feature_df.insert(1, "extraction_ID", run_id)

        if keep_metadata:
            # Add all columns from ImageDataset.df
            try:
                feature_df = self.dataset.df.merge(
                    feature_df,
                    on=ID_colname,
                )
            except ValueError as err:
                # Chain explicitly so the pandas error stays visible as the cause.
                raise ValueError(
                    "Error concatenating features and metadata."
                ) from err
        return feature_df

    def save_config(self, mask_label: int | None):
        """
        Log the configuration of this extraction as an MLflow artifact.

        Args:
            mask_label: when not None, replaces the `label` entry of the
                loaded extraction parameters in the logged configuration.
        Returns:
            ID of the MLflow run the configuration was logged under.
        """
        extraction_param_dict = io.load_yaml(self.extraction_params)
        if mask_label is not None:
            extraction_param_dict["label"] = mask_label
        run_config = {
            "feature_set": self.feature_set,
            "extraction_params": extraction_param_dict,
        }

        mlflow.set_tracking_uri("file://" + config.MODEL_REGISTRY)
        mlflow.set_experiment("feature_extraction")
        with mlflow.start_run() as run:
            mlflow_utils.log_dict_as_artifact(run_config, "extraction_config")

        return run.info.run_id

    def _initialize_extractor(self):
        """
        Create `self.extractor` for the configured feature set.

        Returns:
            self
        Raises:
            ValueError: if `self.feature_set` is not "pyradiomics".
        """
        if self.feature_set == "pyradiomics":
            self.extractor = PyRadiomicsExtractorWrapper(
                str(self.extraction_params)
            )
        else:
            raise ValueError("Feature set not supported")
        log.info(f"Initialized extractor {self.feature_set}")
        return self

    def get_features_for_single_case(
        self,
        image_path: PathLike,
        mask_path: PathLike,
        ID: str | None = None,
        mask_label: int | None = None,
    ) -> dict | None:
        """
        Extract features for one image/mask pair.

        Args:
            image_path: path to the image file
            mask_path: path to the mask file
            ID: when not None, stored in the result under the dataset's
                ID column name
            mask_label: forwarded to the extractor as `label`
        Returns:
            dict with extracted features, or None when the image or mask
            file does not exist or the extractor raised an exception (the
            problem is logged and the case is skipped).
        """
        image_path = Path(image_path)
        mask_path = Path(mask_path)

        if not image_path.exists():
            log.warning(
                f"Image not found. Skipping case... (path={image_path})"
            )
            return None
        if not mask_path.exists():
            log.warning(f"Mask not found. Skipping case... (path={mask_path})")
            return None
        try:
            feature_dict = self.extractor.execute(
                image_path, mask_path, label=mask_label
            )
        except Exception as e:
            # Best effort: one failing case must not abort the whole run.
            error_msg = f"Error extracting features for image, mask pair: {image_path}, {mask_path}"
            log.error(error_msg)
            log.error(f"Original error: {e}")
            return None

        if ID is not None:
            feature_dict[self.dataset.ID_colname] = ID

        return feature_dict

    @utils.time_it
    def get_features(self, mask_label=None) -> pd.DataFrame:
        """
        Get features for all cases, one case after another.

        Cases for which no features could be extracted are left out.
        """
        lst_of_feature_dicts = [
            self.get_features_for_single_case(
                image_path, mask_path, id_, mask_label=mask_label
            )
            # materialised as a list so the progress bar has a known length
            for image_path, mask_path, id_ in tqdm(
                list(
                    zip(
                        self.dataset.image_paths,
                        self.dataset.mask_paths,
                        self.dataset.ids,
                    )
                )
            )
        ]
        lst_of_feature_dicts = [
            feature_dict
            for feature_dict in lst_of_feature_dicts
            if feature_dict is not None
        ]
        feature_df = pd.DataFrame(lst_of_feature_dicts)
        return feature_df

    @utils.time_it
    def get_features_parallel(self, mask_label=None) -> pd.DataFrame:
        """
        Get features for all cases using `self.n_jobs` parallel jobs.

        Each case is passed to `get_features_for_single_case` as keyword
        arguments. Cases for which it returned None are left out.
        """
        lst_of_feature_dicts = pqdm(
            (
                {
                    "image_path": vals[0],
                    "mask_path": vals[1],
                    "ID": vals[2],
                    "mask_label": mask_label,
                }
                for vals in zip(
                    self.dataset.image_paths,
                    self.dataset.mask_paths,
                    self.dataset.ids,
                )
            ),
            self.get_features_for_single_case,
            n_jobs=self.n_jobs,
            argument_type="kwargs",
        )
        lst_of_feature_dicts = [
            feature_dict
            for feature_dict in lst_of_feature_dicts
            if feature_dict is not None
        ]
        feature_df = pd.DataFrame(lst_of_feature_dicts)
        return feature_df

    def get_pyradiomics_feature_names(self) -> list[str]:
        """
        List the feature names of all pyradiomics feature classes.

        Returns:
            names in the form "<feature class>_<feature name>"
        """
        feature_classes = featureextractor.getFeatureClasses()
        return [
            f"{klass}_{name}"
            for klass, klass_obj in feature_classes.items()
            for name in klass_obj.getFeatureNames()
        ]

__init__(dataset, feature_set='pyradiomics', extraction_params='CT_Baessler.yaml', n_jobs=None)

Parameters:

Name Type Description Default
dataset ImageDataset

ImageDataset containing image paths, mask paths, and IDs

required
feature_set str

library to use features from (for now only pyradiomics)

'pyradiomics'
extraction_params PathLike

path to the YAML file containing the extraction parameters, or a string containing the name of the file in the default extraction parameter directory (autorad.config.pyradiomics_params)

'CT_Baessler.yaml'
n_jobs int | None

number of parallel jobs to run

None

Returns:

Type Description

None

Source code in autorad/feature_extraction/extractor.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def __init__(
    self,
    dataset: ImageDataset,
    feature_set: str = "pyradiomics",
    extraction_params: PathLike = "CT_Baessler.yaml",
    n_jobs: int | None = None,
):
    """
    Store the configuration and build the underlying extractor.

    Args:
        dataset: ImageDataset holding the image paths, mask paths and IDs
        feature_set: name of the library providing the features (only
            pyradiomics for now)
        extraction_params: path to the extraction parameter file (a YAML
            file by default), or the name of a file located in the default
            extraction parameter directory
        n_jobs: number of parallel jobs to run
    Returns:
        None
    """
    self.dataset, self.feature_set = dataset, feature_set

    # Resolve the parameter file first; the extractor is built from it below.
    resolved_params = self._get_extraction_param_path(extraction_params)
    self.extraction_params = resolved_params
    log.info(f"Using extraction params from {self.extraction_params}")

    self.n_jobs = utils.set_n_jobs(n_jobs)
    self._initialize_extractor()

get_features(mask_label=None)

Get features for all cases.

Source code in autorad/feature_extraction/extractor.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
@utils.time_it
def get_features(self, mask_label=None) -> pd.DataFrame:
    """
    Extract features for every case of the dataset, one after another.

    Each (image path, mask path, ID) triple is handed to
    `get_features_for_single_case`; cases for which that returns None are
    left out of the resulting DataFrame.
    """
    cases = list(
        zip(
            self.dataset.image_paths,
            self.dataset.mask_paths,
            self.dataset.ids,
        )
    )
    extracted = []
    for img, msk, case_id in tqdm(cases):
        case_features = self.get_features_for_single_case(
            img, msk, case_id, mask_label=mask_label
        )
        if case_features is not None:
            extracted.append(case_features)
    return pd.DataFrame(extracted)

get_features_for_single_case(image_path, mask_path, ID=None, mask_label=None)

Returns:

Name Type Description
feature_series dict | None

dict with extracted features

Source code in autorad/feature_extraction/extractor.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def get_features_for_single_case(
    self,
    image_path: PathLike,
    mask_path: PathLike,
    ID: str | None = None,
    mask_label: int | None = None,
) -> dict | None:
    """
    Returns:
        feature_series: dict with extracted features
    """
    image_path = Path(image_path)
    mask_path = Path(mask_path)

    if not image_path.exists():
        log.warning(
            f"Image not found. Skipping case... (path={image_path}"
        )
        return None
    if not mask_path.exists():
        log.warning(f"Mask not found. Skipping case... (path={mask_path}")
        return None
    try:
        feature_dict = self.extractor.execute(
            image_path, mask_path, label=mask_label
        )
    except Exception as e:
        error_msg = f"Error extracting features for image, mask pair: {image_path}, {mask_path}"
        log.error(error_msg)
        log.error(f"Original error: {e}")
        return None

    if ID is not None:
        feature_dict[self.dataset.ID_colname] = ID

    return feature_dict

run(keep_metadata=True, mask_label=None)

Run feature extraction.

Parameters:

Name Type Description Default
keep_metadata

merge extracted features with data from the ImageDataset.df.

True
mask_label int | None

label in the mask to extract features from. For default value of None, the label value from extraction param file is used. Set this when you have multiple labels in your mask

None

Returns:

Type Description
pd.DataFrame

DataFrame containing extracted features

Source code in autorad/feature_extraction/extractor.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def run(
    self, keep_metadata: bool = True, mask_label: int | None = None
) -> pd.DataFrame:
    """
    Run feature extraction.
    Args:
        keep_metadata: merge extracted features with data from the
            ImageDataset.df.
        mask_label: label in the mask to extract features from.
            For default value of None, the `label` value from extraction
            param file is used. Set this when you have multiple labels in your mask
    Returns:
        DataFrame containing extracted features
    Raises:
        ValueError: if no features could be extracted, or if merging the
            features with the dataset metadata fails.
    """
    log.info("Extracting features")
    if self.n_jobs is None or self.n_jobs == 1:
        feature_df = self.get_features(mask_label=mask_label)
    else:
        feature_df = self.get_features_parallel(mask_label=mask_label)
    if feature_df.empty:
        raise ValueError(
            "No features extracted. Check the logs and your dataset."
        )

    ID_colname = self.dataset.ID_colname
    # move ID column to front
    feature_df = feature_df.set_index(ID_colname).reset_index()

    run_id = self.save_config(mask_label=mask_label)

    # add ID for this extraction run
    feature_df.insert(1, "extraction_ID", run_id)

    if keep_metadata:
        # Add all columns from ImageDataset.df
        try:
            feature_df = self.dataset.df.merge(
                feature_df,
                on=ID_colname,
            )
        except ValueError as err:
            # Chain explicitly so the pandas error stays visible as the cause.
            raise ValueError(
                "Error concatenating features and metadata."
            ) from err
    return feature_df

PyRadiomicsExtractorWrapper

Bases: featureextractor.RadiomicsFeatureExtractor

Wrapper that filters out extracted metadata

Source code in autorad/feature_extraction/extractor.py
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
class PyRadiomicsExtractorWrapper(featureextractor.RadiomicsFeatureExtractor):
    """
    RadiomicsFeatureExtractor that takes file paths as input and leaves the
    extracted metadata ("diagnostic" entries) out of its result.
    """

    def __init__(self, extraction_params: PathLike, *args, **kwargs):
        # The parent constructor is given the parameter file as a plain string.
        params_file = str(extraction_params)
        super().__init__(params_file, *args, **kwargs)

    def execute(
        self,
        image_path: PathLike,
        mask_path: PathLike,
        label: int | None = None,
    ) -> dict:
        """
        Read the image and the segmentation, then extract the features.

        Returns:
            dict of the extracted entries whose name does not contain
            "diagnostic"
        """
        image = io.read_image_sitk(Path(image_path))
        segmentation = io.read_segmentation_sitk(Path(mask_path), label=label)
        raw_features = dict(super().execute(image, segmentation, label=label))
        return {
            name: value
            for name, value in raw_features.items()
            if "diagnostic" not in name
        }