Construction Notebook for:
Wolfram AudioIdentify V1 Trained on AudioSet Data

NetModel Access

This Notebook

NetModel["Wolfram AudioIdentify V1 Trained on AudioSet Data","ConstructionNotebook"]

Untrained Net

NetModel["Wolfram AudioIdentify V1 Trained on AudioSet Data","UninitializedEvaluationNet"]

Trained Net

NetModel["Wolfram AudioIdentify V1 Trained on AudioSet Data"]

Net Construction

Encoder

In[]:=

(* encoder: audio front end for the network.
   1. `enc` computes a 64-filter mel spectrogram (16 kHz sample rate, 400-sample
      Hann window, 160-sample offset, 20-8000 Hz, no normalization or
      augmentation, full-length output).
   2. The outer "Function" encoder runs in batched mode: for each spectrogram in
      the batch it pads spectrograms shorter than 96 frames up to 96 frames, then
      cuts them into windows of 96 frames taken every 64 frames
      (Partition[..., 96, 64]), giving an array of size n x 96 x 64. *)
encoder = With[
  {enc = NetEncoder[{"AudioMelSpectrogram",
      "SampleRate" -> 16000,
      "Normalization" -> None,
      "WindowSize" -> 400,
      "Offset" -> 160,
      "WindowFunction" -> HannWindow,
      "MinimumFrequency" -> 20,
      "MaximumFrequency" -> 8000,
      "NumberOfFilters" -> 64,
      "Augmentation" -> None,
      "TargetLength" -> All}]},
  NetEncoder[{"Function",
    Function[feat,
      Partition[
        If[Length[feat] < 96,
          (* NOTE(review): ArrayPad documents the padding scheme as a positional
             third argument; confirm the Padding -> "Fixed" form behaves as intended. *)
          ArrayPad[feat, {{0, 96 - Length[feat]}}, Padding -> "Fixed"],
          feat],
        96, 64]] /@ Normal[enc[#1]] &,
    {"Varying", 96, 64},
    "Batched" -> True}]]

Out[]=

NetEncoder



Type:	Function
Output:	array (size: n ×96×64)



Decoder

In[]:=

(* decoder: class decoder over the label list `entities`.
   NOTE(review): `entities` is not defined in the visible notebook text; it is
   presumably the collapsed list of 527 class labels (the output summary below
   reports an input vector of size 527) - confirm before evaluating. *)
decoder = NetDecoder[{"Class", entities}]



Out[]=

NetDecoder



Type:	Class
Input:	vector (size: 527)



Internal Functions

In[]:=

(* mobileunit: association of three named layers, convolution -> batch norm -> ReLU6.
   prefix    - string appended to the layer names ("conv"<>prefix, "conv"<>prefix<>"_bn", "relu"<>prefix)
   nchannels - number of output channels of the convolution
   kernel, stride, pad - kernel size, "Stride" and "PaddingSize" of the convolution
   ngroup    - value of the convolution's "ChannelGroups" option
   The activation clips its input to the interval [0, 6]. *)
mobileunit[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  "conv" <> prefix -> ConvolutionLayer[nchannels, kernel,
    "Stride" -> stride, "PaddingSize" -> pad, "ChannelGroups" -> ngroup],
  "conv" <> prefix <> "_bn" -> BatchNormalizationLayer[],
  "relu" <> prefix -> ElementwiseLayer[(Min[Max[0, #], 6] &)]
|>

In[]:=

(* mobileunit2: like mobileunit but without the activation - an association of
   a convolution ("conv"<>prefix) followed by batch normalization
   ("conv"<>prefix<>"_bn").
   nchannels, kernel, stride, pad, ngroup are passed to ConvolutionLayer as the
   output channel count, kernel size, "Stride", "PaddingSize" and "ChannelGroups". *)
mobileunit2[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  "conv" <> prefix -> ConvolutionLayer[nchannels, kernel,
    "Stride" -> stride, "PaddingSize" -> pad, "ChannelGroups" -> ngroup],
  "conv" <> prefix <> "_bn" -> BatchNormalizationLayer[]
|>

In[]:=

(* mobileunit3: convolution -> batch norm -> ReLU6 with padding derived from the kernel.
   Same layer names and options as mobileunit, except that "PaddingSize" is
   computed from `kernel` instead of being taken from `pad`:
     {{Ceiling[k1/2] - 2, Ceiling[k2/2]}, {Ceiling[k1/2] - 2, Ceiling[k2/2]}}
   which is {{0, 2}, {0, 2}} for a {3, 3} kernel.
   `kernel` must be a list with at least two elements (it is indexed with [[1]] and [[2]]).
   `pad` is accepted so the signature matches the other units, but it is not used. *)
mobileunit3[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  "conv" <> prefix -> ConvolutionLayer[nchannels, kernel,
    "Stride" -> stride,
    "PaddingSize" -> {
      {Ceiling[kernel[[1]]/2] - 2, Ceiling[kernel[[2]]/2]},
      {Ceiling[kernel[[1]]/2] - 2, Ceiling[kernel[[2]]/2]}},
    "ChannelGroups" -> ngroup],
  "conv" <> prefix <> "_bn" -> BatchNormalizationLayer[],
  "relu" <> prefix -> ElementwiseLayer[(Min[Max[0, #], 6] &)]
|>

In[]:=

(* invresunit: single-entry association prefix -> NetChain with three stages:
     prefix<>"_expand" : 1x1 convolution to `ngroup` channels, batch norm, ReLU6
     prefix<>"_dwise"  : `kernel` convolution with `ngroup` channels split into
                         `ngroup` channel groups, using `stride` and `pad`
     prefix<>"_linear" : 1x1 convolution to `nchannels` channels, batch norm, no activation *)
invresunit[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  prefix -> NetChain[
    Join[
      mobileunit[prefix <> "_expand", ngroup, 1, 1, 0, 1],
      mobileunit[prefix <> "_dwise", ngroup, kernel, stride, pad, ngroup],
      mobileunit2[prefix <> "_linear", nchannels, 1, 1, 0, 1]]]
|>

In[]:=

(* invresunit2: like invresunit but without the expansion stage.
   Returns prefix -> NetChain made of:
     prefix<>"_dwise"  : `kernel` convolution with `ngroup` channels in `ngroup`
                         channel groups, batch norm, ReLU6
     prefix<>"_linear" : 1x1 convolution to `nchannels` channels, batch norm, no activation *)
invresunit2[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  prefix -> NetChain@Join[
    mobileunit[prefix <> "_dwise", ngroup, kernel, stride, pad, ngroup],
    mobileunit2[prefix <> "_linear", nchannels, 1, 1, 0, 1]]
|>

In[]:=

(* invresunit3: like invresunit, but the "_dwise" stage is built with mobileunit3,
   which derives its padding from `kernel` and ignores `pad`.
   Returns prefix -> NetChain of the "_expand", "_dwise" and "_linear" stages. *)
invresunit3[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  prefix -> NetChain@Join[
    mobileunit[prefix <> "_expand", ngroup, 1, 1, 0, 1],
    mobileunit3[prefix <> "_dwise", ngroup, kernel, stride, pad, ngroup],
    mobileunit2[prefix <> "_linear", nchannels, 1, 1, 0, 1]]
|>

In[]:=

(* mobilenetblock: residual variant of invresunit.
   Builds the same "_expand" / "_dwise" / "_linear" layers, wires them in sequence
   inside a NetGraph, and adds the graph input to the output of the last layer
   through a ThreadingLayer[Plus] named "sum_"<>prefix.
   Returns the single-entry association prefix -> NetGraph. *)
mobilenetblock[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := Module[{layers},
  layers = Join[
    mobileunit[prefix <> "_expand", ngroup, 1, 1, 0, 1],
    mobileunit[prefix <> "_dwise", ngroup, kernel, stride, pad, ngroup],
    mobileunit2[prefix <> "_linear", nchannels, 1, 1, 0, 1]];
  <|prefix -> NetGraph[
    Append[layers, "sum_" <> prefix -> ThreadingLayer[Plus]],
    Join[
      (* Fold turns the key list into the nested rule k1 -> k2 -> ... -> kN,
         i.e. a sequential path through all layers. *)
      {Fold[Rule[#2, #1] &, Reverse@Keys[layers]]},
      {
        Last[Keys[layers]] -> "sum_" <> prefix,
        (* Skip connection from the block input to the sum. *)
        NetPort["Input"] -> "sum_" <> prefix
      }]]|>]

In[]:=

(* genAudioIdentifyNet: assembles the core classification network as a NetChain.
   c1..c9 - output channel counts: c1 for stage "1", c2..c8 as the `nchannels`
            of the following units, c9 for the 1x1 unit "6_4"
   g1..g7 - values passed as `ngroup` to the units (expanded width and number
            of channel groups of their "_dwise" convolutions)
   p      - first dimension of the "pool6" mean-pooling kernel {p, 2}
   dim    - first dimension of the input, which has shape {dim, 64}
   The chain reshapes the input to {1, 96, 64}, applies Log[# + .001], runs the
   stack of units, mean-pools, maps to 527 channels with a 1x1 convolution,
   flattens to a 527-vector and applies LogisticSigmoid.
   NOTE: "reshape1" is fixed at {1, 96, 64}, so the input must hold 96*64
   elements regardless of `dim`. *)
genAudioIdentifyNet[c1_, c2_, c3_, c4_, c5_, c6_, c7_, c8_, c9_, g1_, g2_, g3_, g4_, g5_, g6_, g7_, p_, dim_] := NetChain[
  Join[
    <|
      "reshape1" -> ReshapeLayer[{1, 96, 64}],
      "log" -> ElementwiseLayer[Log[# + .001] &],
      "1" -> NetChain@mobileunit3["1", c1, {3, 3}, {2, 2}, {0, 0}, 1]
    |>,
    invresunit2["2_1", c2, {3, 3}, {1, 1}, {1, 1}, g1],
    invresunit3["2_2", c3, {3, 3}, {2, 2}, {0, 0}, g2],
    mobilenetblock["3_1", c3, {3, 3}, {1, 1}, {1, 1}, g3],
    invresunit3["3_2", c4, {3, 3}, {2, 2}, {0, 0}, g3],
    mobilenetblock["4_1", c4, {3, 3}, {1, 1}, {1, 1}, g4],
    mobilenetblock["4_2", c4, {3, 3}, {1, 1}, {1, 1}, g4],
    invresunit3["4_3", c5, {3, 3}, {2, 2}, {0, 0}, g4],
    mobilenetblock["4_4", c5, {3, 3}, {1, 1}, {1, 1}, g5],
    mobilenetblock["4_5", c5, {3, 3}, {1, 1}, {1, 1}, g5],
    mobilenetblock["4_6", c5, {3, 3}, {1, 1}, {1, 1}, g5],
    invresunit["4_7", c6, {3, 3}, {1, 1}, {1, 1}, g5],
    mobilenetblock["5_1", c6, {3, 3}, {1, 1}, {1, 1}, g6],
    mobilenetblock["5_2", c6, {3, 3}, {1, 1}, {1, 1}, g6],
    invresunit3["5_3", c7, {3, 3}, {2, 2}, {0, 0}, g6],
    mobilenetblock["6_1", c7, {3, 3}, {1, 1}, {1, 1}, g7],
    mobilenetblock["6_2", c7, {3, 3}, {1, 1}, {1, 1}, g7],
    invresunit["6_3", c8, {3, 3}, {1, 1}, {1, 1}, g7],
    <|
      "6_4" -> NetChain@mobileunit["6_4", c9, {1, 1}, {1, 1}, {0, 0}, 1],
      "pool6" -> PoolingLayer[{p, 2}, {1, 1}, "Function" -> Mean],
      "fc7" -> ConvolutionLayer[527, {1, 1}],
      "reshape2" -> ReshapeLayer[{527}],
      "logistic" -> LogisticSigmoid
    |>
  ],
  "Input" -> {dim, 64}
]

In[]:=

(* coreNetLarge: the wider configuration of the core network.
   The arguments are grouped in the order genAudioIdentifyNet expects them:
   c1..c9, then g1..g7, then p and dim. *)
coreNetLarge = With[
  {
    channels = {40, 24, 32, 40, 80, 128, 208, 416, 1664},  (* c1..c9 *)
    groups = {40, 144, 192, 240, 480, 768, 1248},          (* g1..g7 *)
    poolAndDim = {3, 96}                                   (* p, dim *)
  },
  genAudioIdentifyNet @@ Join[channels, groups, poolAndDim]
]

Out[]=

NetChain



uninitialized

	Input	matrix (size: 96×64)
reshape1	ReshapeLayer	array (size: 1×96×64)
log	Log[0.001+ x ]	array (size: 1×96×64)
1	NetChain (3 nodes)	array (size: 40×48×32)
2_1	NetChain (5 nodes)	array (size: 24×48×32)
2_2	NetChain (8 nodes)	array (size: 32×24×16)
3_1	NetGraph (9 nodes)	array (size: 32×24×16)
3_2	NetChain (8 nodes)	array (size: 40×12×8)
4_1	NetGraph (9 nodes)	array (size: 40×12×8)
4_2	NetGraph (9 nodes)	array (size: 40×12×8)
4_3	NetChain (8 nodes)	array (size: 80×6×4)
4_4	NetGraph (9 nodes)	array (size: 80×6×4)
4_5	NetGraph (9 nodes)	array (size: 80×6×4)
4_6	NetGraph (9 nodes)	array (size: 80×6×4)
4_7	NetChain (8 nodes)	array (size: 128×6×4)
5_1	NetGraph (9 nodes)	array (size: 128×6×4)
5_2	NetGraph (9 nodes)	array (size: 128×6×4)
5_3	NetChain (8 nodes)	array (size: 208×3×2)
6_1	NetGraph (9 nodes)	array (size: 208×3×2)
6_2	NetGraph (9 nodes)	array (size: 208×3×2)
6_3	NetChain (8 nodes)	array (size: 416×3×2)
6_4	NetChain (3 nodes)	array (size: 1664×3×2)
pool6	PoolingLayer	array (size: 1664×1×1)
fc7	ConvolutionLayer	array (size: 527×1×1)
reshape2	ReshapeLayer	vector (size: 527)
logistic	LogisticSigmoid	vector (size: 527)
	Output	vector (size: 527)



In[]:=

(* coreNetSmall: the narrower configuration of the core network.
   The arguments are grouped in the order genAudioIdentifyNet expects them:
   c1..c9, then g1..g7, then p and dim. *)
coreNetSmall = With[
  {
    channels = {32, 16, 24, 32, 64, 96, 160, 320, 1280},  (* c1..c9 *)
    groups = {32, 96, 144, 192, 384, 576, 960},           (* g1..g7 *)
    poolAndDim = {3, 96}                                  (* p, dim *)
  },
  genAudioIdentifyNet @@ Join[channels, groups, poolAndDim]
]

Out[]=

NetChain



uninitialized

	Input	matrix (size: 96×64)
reshape1	ReshapeLayer	array (size: 1×96×64)
log	Log[0.001+ x ]	array (size: 1×96×64)
1	NetChain (3 nodes)	array (size: 32×48×32)
2_1	NetChain (5 nodes)	array (size: 16×48×32)
2_2	NetChain (8 nodes)	array (size: 24×24×16)
3_1	NetGraph (9 nodes)	array (size: 24×24×16)
3_2	NetChain (8 nodes)	array (size: 32×12×8)
4_1	NetGraph (9 nodes)	array (size: 32×12×8)
4_2	NetGraph (9 nodes)	array (size: 32×12×8)
4_3	NetChain (8 nodes)	array (size: 64×6×4)
4_4	NetGraph (9 nodes)	array (size: 64×6×4)
4_5	NetGraph (9 nodes)	array (size: 64×6×4)
4_6	NetGraph (9 nodes)	array (size: 64×6×4)
4_7	NetChain (8 nodes)	array (size: 96×6×4)
5_1	NetGraph (9 nodes)	array (size: 96×6×4)
5_2	NetGraph (9 nodes)	array (size: 96×6×4)
5_3	NetChain (8 nodes)	array (size: 160×3×2)
6_1	NetGraph (9 nodes)	array (size: 160×3×2)
6_2	NetGraph (9 nodes)	array (size: 160×3×2)
6_3	NetChain (8 nodes)	array (size: 320×3×2)
6_4	NetChain (3 nodes)	array (size: 1280×3×2)
pool6	PoolingLayer	array (size: 1280×1×1)
fc7	ConvolutionLayer	array (size: 527×1×1)
reshape2	ReshapeLayer	vector (size: 527)
logistic	LogisticSigmoid	vector (size: 527)
	Output	vector (size: 527)



Final Net

In[]:=

(* audioIdentifyNetLarge: full network from audio to class.
   `encoder` turns the audio into n windows of size 96 x 64, NetMapOperator
   applies coreNetLarge to each window, AggregationLayer[Max, 1] takes the
   maximum over the window dimension, and `decoder` maps the resulting
   527-vector to a class. *)
audioIdentifyNetLarge = NetChain[
  {
    NetMapOperator[coreNetLarge],
    AggregationLayer[Max, 1]
  },
  "Input" -> encoder,
  "Output" -> decoder
]

Construction Notebook for: Wolfram AudioIdentify V1 Trained on AudioSet Data

NetModel Access

This Notebook

Untrained Net

Trained Net

Net Construction

Encoder

Decoder

Internal Functions

Final Net

Construction Notebook for:
Wolfram AudioIdentify V1 Trained on AudioSet Data