-
Notifications
You must be signed in to change notification settings - Fork 0
/
reuse_methods.py
263 lines (228 loc) · 12.5 KB
/
reuse_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
from works import Work
from license_parser import Parser
from collections import defaultdict
import logging
from copy import deepcopy
""" <reuse_methods.py>
This file contains the definitions of the reuse methods.
Note that, by default, the works passed to these methods are not released with the new work.
Therefore, the new (work, usage) tuple will be added to the new work's auxworks rather than its subworks,
except when a reuse method inevitably incorporates the original works (e.g. amalgamate).
For combination, the original works will be added to mixworks.
You can manually combine these works for future license analysis based on a preset open policy, or analyze them one by one.
"""
# Module-level license parser, built from "licenses_description.yml" when this module is imported.
# NOTE(review): no function visible in this file references it - presumably used by importers; confirm.
default_parser = Parser("licenses_description.yml")
"""
Use a work (model/data/algorithm) or a list of works ([model, data, algorithm]).
NOTE: You can input a list of works; the first work computes the target output used to construct the new work,
and the other works are bundled with the new work as a combination, which will be released together.
For example, the model output (aka prediction), the algorithm (like SGD) and the data (input samples) form the bundle, while
the model, algorithm and data will be the auxworks of the prediction work.
You can also use output_as to specify the output type and form (internal mismatches may exist and are ignored).
A single work as input means only the resulting output will be released, i.e. the predictions in this case.
The aux works will not be released with this work.
"""
def use(works, aux_works=None, output_as:Work=None, license_name:str=None) -> Work:
    """Create the work that results from using `works`.

    Args:
        works: A single Work or a list of Works. A single Work is wrapped
            into a one-element list.
        aux_works: Optional single Work or list of Works recorded as
            auxworks of the result. May be None.
        output_as: Work whose type and form the result takes. Defaults to a
            dummy Work of type 'data' and form 'raw'.
        license_name: Optional license name forwarded to new_reused_work().

    Returns:
        The new Work, named with the 'U' specifier. Every entry produced by
        reuse_method_spread(w, 'use') for the input works is appended to its
        subworks; the entries for the aux works are appended to its auxworks.
    """
    if isinstance(works, Work):
        works = [works]  # Convert to list
    if isinstance(aux_works, Work):
        aux_works = [aux_works]
    if output_as is None:
        output_as = Work('dummy', 'data', 'raw')  # Default output as raw data
    # aux_works defaults to None; normalise once so the loop and the log line are both safe.
    aux_list = aux_works if aux_works else []

    new_work = new_reused_work(works, 'U', license_name, output_as)
    for w in works:
        # The input work (or each of its mixworks) is recorded with the 'use' method.
        new_work.subworks += reuse_method_spread(w, 'use')
    for aw in aux_list:
        new_work.auxworks += reuse_method_spread(aw, 'use')

    work_names = ','.join([w.name for w in works])
    aux_names = ','.join([aw.name for aw in aux_list])
    logging.debug(f"Use {work_names} with auxworks {aux_names}")
    return new_work
# Copy a work, this copy method will be applied to all mixworks, but the subworks will be skipped.
def naive_copy(work:Work, license_name:str=None) -> Work:
    """Create a copy-derived work from `work`.

    The result is built by new_reused_work() with the 'CP' specifier and the
    optional `license_name`. Its mixworks are replaced by the output of
    reuse_method_spread(work, 'copy'); subworks are not touched here.

    Returns:
        The newly created Work.
    """
    duplicate = new_reused_work([work], 'CP', license_name)
    # The copy is treated as a new combination: only mixworks are carried over.
    copied_members = reuse_method_spread(work, 'copy')
    duplicate.mixworks = copied_members
    logging.debug(f"Copy {work.name}")
    return duplicate
"""
Combine multiple works to construct a new work.
"""
def combine(works:list, license_name:str=None) -> Work:
    """Bundle several works into one new work.

    Args:
        works: List of Works to combine; at least two are required.
        license_name: Optional license name forwarded to new_reused_work().

    Returns:
        The new Work (specifier 'C') whose mixworks gain one
        (work, method) tuple per input, or None (after a warning) when
        fewer than two works are given. The method is 'combine_mix' when
        the new work's type is 'mix', otherwise 'combine'.
    """
    if len(works) <= 1:
        logging.warning("Not enough works to combine")
        return None

    # The license name is not decided here unless license_name is given.
    bundle = new_reused_work(works, 'C', license_name)
    # 'combine_mix' marks a combination of mixed types; the distinction is
    # useful for copyleft data licenses.
    method = 'combine_mix' if bundle.type == 'mix' else 'combine'
    for member in works:
        # Only the direct inputs are recorded; nested mixworks are not expanded.
        bundle.mixworks.append((member, method))

    joined_names = ','.join([member.name for member in works])
    logging.debug(f"Combine {joined_names}")
    return bundle
def amalgamate(works:list, license_name:str=None) -> Work:
    """Amalgamate several raw works of one type into a new work.

    Args:
        works: List of Works; at least two, all sharing the same type and
            all in 'raw' form.
        license_name: Optional license name forwarded to new_reused_work().

    Returns:
        The new Work (specifier 'A') whose subworks are set to
        (work, 'amalgamate') for every input, or None when a precondition
        fails (a warning or error is logged in that case).
    """
    if len(works) <= 1:
        logging.warning("Not enough works to amalgamate")
        return None

    reference_type = works[0].type
    if not all(item.type == reference_type for item in works):
        logging.error("Different types of works used for amalgamation")
        return None

    if not all(item.form == 'raw' for item in works):
        logging.error("The amalgamation of works only support raw form")
        return None

    merged = new_reused_work(works, 'A', license_name)
    # The inputs become subworks of the result.
    # TODO: How about work type = mix? How about spread to mixworks?
    merged.subworks = [(item, 'amalgamate') for item in works]
    return merged
# Distill knowledge from old models to new models
def distill(works, dest_work:Work=None, aux_works=None, license_name:str=None):
    """Distill knowledge from `works` into a model work.

    Args:
        works: A single Work or a list of Works to distill from.
        dest_work: Optional destination Work; its type must be 'model'.
        aux_works: Optional single Work or list of Works that are also
            recorded as auxworks with the 'distill' method.
        license_name: Forwarded to new_reused_work() only when `dest_work`
            is not given.

    Returns:
        The resulting Work, or None (after an error is logged) when
        `dest_work` is not a model, or when no `dest_work` is given and none
        of `works` is of type 'model'.
    """
    works = [works] if isinstance(works, Work) else works
    aux_works = [aux_works] if isinstance(aux_works, Work) else aux_works

    if dest_work:
        # Distillation targets an existing model.
        if dest_work.type != 'model':
            logging.error("The type of destination work of distillation must be a model")
            return
        student = deepcopy(dest_work)
        student.name = get_new_work_name(works+[dest_work], 'D')  # e.g. D_work1_work2
        student.subworks += reuse_method_spread(dest_work, 'train')
        student.license_name = 'TBD'  # Reset the license name of the result
    else:
        if not any(source.type == 'model' for source in works):
            logging.error("At least one of the works used for distillation needs to be 'model' type")
            return
        # Without a destination the result is assumed to be a raw-form model.
        placeholder_model = Work('dummy', 'model', 'raw')
        student = new_reused_work(works, 'D', license_name, placeholder_model)

    # Accumulate with '+=': a copied dest_work may already carry auxworks.
    for source in works:
        student.auxworks += reuse_method_spread(source, 'distill')
    if aux_works:
        for helper in aux_works:
            student.auxworks += reuse_method_spread(helper, 'distill')
    return student
'''
Similar to use(); the difference is that in use() all input works are considered released with the result, whereas
in generate() none of the input works are released with the result. If you just want to get the inference result of a model, please call generate() rather than use().
NOTE: Most licenses have no restrictions on the output of models or algorithms.
'''
def generate(works, aux_works=None, output_as:Work=None, license_name:str=None) -> Work:
    """Create the work generated by `works` without bundling the inputs.

    Args:
        works: A single Work or a list of Works. A single Work is wrapped
            into a one-element list.
        aux_works: Optional single Work or list of Works. May be None.
        output_as: Work whose type and form the result takes. Defaults to a
            dummy Work of type 'data' and form 'raw'.
        license_name: Optional license name forwarded to new_reused_work().

    Returns:
        The new Work, named with the 'G' specifier. The entries produced by
        reuse_method_spread(..., 'generate') for both `works` and
        `aux_works` are appended to its auxworks; subworks are not touched.
    """
    if isinstance(works, Work):
        works = [works]  # Convert to list
    if isinstance(aux_works, Work):
        aux_works = [aux_works]
    if output_as is None:
        output_as = Work('dummy', 'data', 'raw')  # Default output as raw data
    # aux_works defaults to None; normalise once so the loop and the log line are both safe.
    aux_list = aux_works if aux_works else []

    new_work = new_reused_work(works, 'G', license_name, output_as)
    for w in works:
        # The input work (or each of its mixworks) is recorded with the 'generate' method.
        new_work.auxworks += reuse_method_spread(w, 'generate')
    for aw in aux_list:
        new_work.auxworks += reuse_method_spread(aw, 'generate')

    work_names = ','.join([w.name for w in works])
    aux_names = ','.join([aw.name for aw in aux_list])
    logging.debug(f"Generate {work_names} with auxworks {aux_names}")
    return new_work
# Embed works (corpus, image or other data samples) using aux_works (model or algorithm)
def embed(works, aux_works=None, output_as:Work=None, license_name:str=None) -> Work:
    """Create the work obtained by embedding `works`.

    Args:
        works: A single Work or a list of Works to embed.
        aux_works: Optional single Work or list of Works (e.g. a feature
            extractor or translator) recorded as auxworks with method 'use'.
        output_as: Work whose type and form the result takes. Defaults to a
            dummy Work of type 'data' and form 'raw'.
        license_name: Optional license name forwarded to new_reused_work().

    Returns:
        The new Work, named with the 'E' specifier. The entries from
        reuse_method_spread(w, 'embed') are appended to its subworks.
    """
    works = [works] if isinstance(works, Work) else works
    aux_works = [aux_works] if isinstance(aux_works, Work) else aux_works
    target = Work('dummy', 'data', 'raw') if output_as is None else output_as

    embedded = new_reused_work(works, 'E', license_name, target)
    for source in works:
        embedded.subworks += reuse_method_spread(source, 'embed')
    if aux_works:
        for helper in aux_works:
            embedded.auxworks += reuse_method_spread(helper, 'use')
    return embedded
# NOTE: To prevent loop, destination model should not be in works.
# You can just leave dest_work=None to create a new work
def train(works, dest_work:Work=None, aux_works=None, license_name:str=None) -> Work:
    """Create the model work that results from training on `works`.

    Args:
        works: A single Work or a list of Works. Works of type 'model' are
            recorded as subworks of the result; all others as auxworks.
        dest_work: Optional destination Work that the training is applied
            to; its type must be 'model'. To prevent a loop it should not
            also appear in `works`.
        aux_works: Optional single Work or list of Works recorded as
            auxworks with the 'train' method.
        license_name: Forwarded to new_reused_work() only when `dest_work`
            is not given; with a `dest_work` the license name is reset to
            'TBD'.

    Returns:
        The resulting Work, or None (after an error is logged) when
        `dest_work` is given but is not a model.
    """
    if isinstance(works, Work):
        works = [works]  # Convert to list
    if isinstance(aux_works, Work):
        aux_works = [aux_works]

    if dest_work:
        if dest_work.type != 'model':
            logging.error("The type of destination work of train must be a model")
            return None
        # Copy the destination exactly once and rename the copy afterwards.
        # A second deepcopy after the rename would discard the new name.
        new_work = deepcopy(dest_work)
        new_work.name = get_new_work_name(works+[dest_work], 'T')  # e.g. T_work1_work2
        new_work.subworks += reuse_method_spread(dest_work, 'modify')  # dest_work becomes a subwork of the result
        new_work.license_name = 'TBD'  # Reset the license name of the new work
    else:
        # Without a destination the result is assumed to be a raw-form model.
        dummy_model_works = Work('dummy', 'model', 'raw')
        new_work = new_reused_work(works, 'T', license_name, dummy_model_works)

    for w in works:
        if w.type == 'model':
            # Input models are treated as initial models and become subworks.
            # Accumulate with '+=': a copied dest_work may already carry subworks.
            new_work.subworks += reuse_method_spread(w, 'train')
        else:
            # Other inputs are auxworks of the new work.
            new_work.auxworks += reuse_method_spread(w, 'train')
    if aux_works:
        for aw in aux_works:
            new_work.auxworks += reuse_method_spread(aw, 'train')
    return new_work
# Finetune is a kind of train
def finetune(work, data, aux_works=None, license_name:str=None):
    """Fine-tune `work` on `data`; a thin wrapper around train().

    Args:
        work: The Work (or list of Works) passed to train() as `works`.
        data: The Work to fine-tune on; it is placed first in the aux works
            handed to train().
        aux_works: Optional extra aux works: None, a single Work, or an
            iterable of Works. The caller's object is never mutated.
        license_name: Optional license name forwarded to train().

    Returns:
        Whatever train() returns for these arguments.
    """
    if not aux_works:
        extra = []
    elif isinstance(aux_works, Work):
        # train() accepts a single Work as aux_works, so accept it here too
        # instead of failing on list + Work concatenation.
        extra = [aux_works]
    else:
        extra = list(aux_works)
    return train(work, aux_works=[data] + extra, license_name=license_name)
def group_by_work_type(works:list):
    """Group the positions of `works` by each work's `type` attribute.

    Returns:
        A 4-tuple of index lists, in the order
        ('model', 'data', 'algorithm', 'mix'). Works of any other type are
        not reported; a type with no works yields an empty list.
    """
    positions = {}
    for position, item in enumerate(works):
        positions.setdefault(item.type, []).append(position)
    return (
        positions.get('model', []),
        positions.get('data', []),
        positions.get('algorithm', []),
        positions.get('mix', []),
    )
def get_new_work_name(works:list, specifier:str) -> str:
    """Build a name of the form '<specifier>_<name1>_<name2>...'.

    A falsy `specifier` (None or '') is replaced by 'UNK', giving for
    example 'UNK_model1_model2'.
    """
    prefix = specifier if specifier else 'UNK'
    parts = [prefix]
    parts.extend(item.name for item in works)
    return '_'.join(parts)  # e.g. C_model1_model2
# If all works have same work type, return this type, otherwise, return 'mix'
def get_new_work_type(works:list) -> str:
    """Derive the type of a work built from `works`.

    A work whose type is 'mix' contributes the types of the works listed in
    its `mixworks`; any other work contributes its own type.

    Returns:
        The single shared type when exactly one distinct type was
        collected, otherwise 'mix' (this includes an empty `works`).
    """
    collected_types = set()
    for w in works:
        if w.type == 'mix':
            # mixworks entries are (work, reuse_method) tuples, so the type
            # must be read from the work itself; the second element is a
            # method name such as 'combine', not a work type.
            for sub_work, _method in w.mixworks:
                collected_types.add(sub_work.type)
        else:
            collected_types.add(w.type)
    if len(collected_types) == 1:
        # Works that all share one type yield a new work of that type.
        return next(iter(collected_types))
    return 'mix'
# Deal with the new work form, compatibility: raw > binary > saas
def get_new_work_form(works:list) -> str:
    """Derive the form of a work built from `works`.

    The candidate forms are checked in the order 'saas', 'binary', 'raw',
    and the first one present among the inputs is returned. So a single
    'saas' input makes the result 'saas', and the result is 'raw' only when
    no input is 'saas' or 'binary'.

    Raises:
        ValueError: If `works` is empty or none of the works has one of the
            three known forms.
    """
    present_forms = {w.form for w in works}
    for comp_form in ('saas', 'binary', 'raw'):  # The order matters
        if comp_form in present_forms:
            return comp_form
    # Fail with an explicit message rather than an unbound local variable.
    raise ValueError(
        "Cannot determine the form of the new work: expected at least one "
        "work with form 'saas', 'binary' or 'raw'"
    )
# Create new work. NOTE: The license name will not be set here, remain 'TBD', call analysis to determine
# You can specify which licenses you want to grant to this work and the type, form of new work
def new_reused_work(works:list, specifier:str=None, assign_license_name:str=None, output_as:Work=None) -> Work:
    """Construct the Work that results from reusing `works`.

    Args:
        works: The reused Works; used for the name and, when `output_as`
            is not given, for the derived type and form.
        specifier: Name prefix passed to get_new_work_name().
        assign_license_name: If truthy, assigned to the result through
            Work.assign_license(); otherwise no license is assigned here.
        output_as: If truthy, its type and form are used for the result
            instead of the values derived from `works`.

    Returns:
        The newly created Work.
    """
    name = get_new_work_name(works, specifier)
    if output_as:
        work_type = output_as.type
    else:
        work_type = get_new_work_type(works)
    if output_as:
        work_form = output_as.form
    else:
        work_form = get_new_work_form(works)

    created = Work(name, work_type, work_form)
    if assign_license_name:
        created.assign_license(assign_license_name)
    return created
# Deal with the spread of reusing method of mixworks, this work or the mixworks of this work will be returned
def reuse_method_spread(work:Work, method:str) -> list:
    """Pair `work`, or the mixworks it contains, with a reuse `method`.

    Returns:
        A list of (work, method) tuples: one per item returned by
        work.find_mixworks() when work.is_include_mixworks() is true,
        otherwise the single tuple (work, method).
    """
    if not work.is_include_mixworks():
        return [(work, method)]
    # Spread the method to every mixwork instead of the container itself.
    return [(member, method) for member in work.find_mixworks()]