Source code for activation_extractor.extractors.intermediateExtractor
import numpy as np
import os, shutil, sys
from collections import OrderedDict
import torch
from activation_extractor.extractors.intermediateExtractorBase import IntermediateExtractorBase
from activation_extractor.model_functions.embedding_to_numpy import embedding_to_numpy
[docs]
class IntermediateExtractor(IntermediateExtractorBase):
"""
Extends the functionality of the ``IntermediateExtractorBase`` to automatically save activations.
"""
#Save outputs ---------------------------------------------------------------------------
[docs]
def gpu_to_cpu(self):
    """
    Convert the stored intermediate activations with ``embedding_to_numpy``.

    For every layer name in ``self.layer_list`` that has an entry in
    ``self.intermediate_outputs``, the entry is replaced in place by
    ``embedding_to_numpy(entry)`` (the helper is expected to return a numpy
    array living on CPU).

    The conversion is best-effort:

    * layers without a stored activation are skipped silently;
    * a layer whose conversion fails keeps its original value, and the
      failure is reported on ``stderr`` together with the layer name.
    """
    for name in self.layer_list:
        #only the lookup is guarded here, so that a KeyError raised inside
        #the conversion itself is reported below instead of being hidden
        try:
            activation = self.intermediate_outputs[name]
        except KeyError:
            #no activation stored for this layer : nothing to convert
            continue
        try:
            self.intermediate_outputs[name] = embedding_to_numpy(activation)
        except Exception as e:
            #keep going with the other layers, but say which one failed
            print(f"Could not convert activations of layer {name!r}: {e}",
                  file=sys.stderr)
[docs]
def emb_reformatting(self, outputs,
emb_format='full',
sequence_axis=1,
custom_position=None,
):
"""
Takes an output to save and reformats it according to ``emb_format`` :
* full: nothing
* mean: mean along sequence axis
* LT: last token in sequence
* FT: first token in sequence
* custom: custom token position in sequence
"""
#check the format
if emb_format not in ['full','mean','LT', 'FT','custom']:
raise ValueError(f"format must be 'full','mean' or 'LT', 'FT', 'custom'; got {emb_format} instead.")
token_position_dict = {
"FT":0,
"LT":-1,
"custom":custom_position,
#"full":None, "mean":None,
}
match emb_format:
case 'mean':
outputs = np.mean(outputs, axis=sequence_axis)
case 'LT' | 'FT' | 'custom':
token_position = token_position_dict[emb_format]
#get slicer for the right axis ===============
#Create a list of slice(None) for each dimension
slicer = [slice(None)] * outputs.ndim
#select last token or First token on the sequence axis index
slicer[sequence_axis] = token_position
#format slicer
slicer = tuple(slicer)
#slice array selecting
outputs = outputs[slicer]
case 'full':
pass
return outputs
[docs]
def save_outputs(self, output_folder, output_id, reset=False, move_to_cpu=True,
                 save_method="numpy_compressed", emb_formats=('LT', 'FT'),
                 sequence_axis=1, custom_position=None):
    """
    Save the intermediate activation dictionary to the output folder.

    Every entry of ``self.intermediate_outputs`` is written once per
    requested embedding format to
    ``{output_folder}/{emb_format}/{output_id}/{name}.npz`` (or ``.npy``).

    Parameters:
        output_folder: root folder of the saved activations.
        output_id: name of the sub-folder created under each format folder.
        reset: if True, ``output_folder`` is deleted before saving.
        move_to_cpu: if True, ``self.gpu_to_cpu()`` is called before saving.
        save_method: "numpy_compressed" (``np.savez_compressed``, .npz) or
            "numpy" (``np.save``, .npy).
        emb_formats: embedding formats to save, each one accepted by
            ``emb_reformatting`` (full, mean, LT, FT, custom). A single
            format given as a string is accepted as well.
        sequence_axis: sequence length axis to take the mean or the token from.
        custom_position: token position used by the 'custom' format.

    Raises:
        ValueError: if ``save_method`` is unknown (raised before anything is
            deleted or created), or from ``emb_reformatting`` for an invalid
            embedding format.
    """
    #different saving functions, with the file extension they produce
    savers = {
        "numpy_compressed": (np.savez_compressed, "npz"),
        "numpy": (np.save, "npy"),
    }
    #fail before touching the filesystem : an unknown method used to create
    #(or reset) the folders and then save nothing, without any error
    if save_method not in savers:
        raise ValueError(f"save_method must be one of {sorted(savers)}; got {save_method!r} instead.")
    save_array, extension = savers[save_method]

    #a bare string would otherwise be iterated character by character
    if isinstance(emb_formats, str):
        emb_formats = [emb_formats]
    else:
        emb_formats = list(emb_formats)

    #make output folders
    #NOTE(review): mode 0o777 makes these folders world-writable; kept as is
    #since other users/processes may rely on it -- confirm it is intended.
    #The intermediate {emb_format} folder keeps its default permissions.
    if reset:
        shutil.rmtree(output_folder, ignore_errors=True)
    os.makedirs(output_folder, exist_ok=True)
    os.chmod(output_folder, mode=0o777)
    format_folders = {}
    for emb_format in emb_formats:
        format_folder = f"{output_folder}/{emb_format}/{output_id}"
        os.makedirs(format_folder, exist_ok=True)
        os.chmod(format_folder, mode=0o777)
        format_folders[emb_format] = format_folder

    #move from gpu to cpu
    if move_to_cpu:
        self.gpu_to_cpu()

    #save each layer, once per embedding format
    for name, outputs in self.intermediate_outputs.items():
        for emb_format, format_folder in format_folders.items():
            outputs_formatted = self.emb_reformatting(outputs=outputs,
                                                      emb_format=emb_format,
                                                      sequence_axis=sequence_axis,
                                                      custom_position=custom_position,
                                                      )
            save_array(f"{format_folder}/{name}.{extension}", outputs_formatted)