Construction Notebook for:
Wolfram AudioIdentify V1 Trained on AudioSet Data
NetModel Access
NetModel Access
This Notebook
This Notebook
NetModel["Wolfram AudioIdentify V1 Trained on AudioSet Data","ConstructionNotebook"]
Untrained Net
Untrained Net
NetModel["Wolfram AudioIdentify V1 Trained on AudioSet Data","UninitializedEvaluationNet"]
Trained Net
Trained Net
NetModel["Wolfram AudioIdentify V1 Trained on AudioSet Data"]
Net Construction
Net Construction
Encoder
Encoder
In[]:=
(* Input encoder for the net.
   Step 1: a mel-spectrogram encoder (16 kHz sample rate, 400-sample Hann
           window, 160-sample offset, 64 filters between 20 Hz and 8 kHz,
           no normalization, no augmentation, full target length).
   Step 2: a batched "Function" encoder that, for every item of the batch,
           pads the spectrogram to at least 96 rows and then partitions it
           into blocks of 96 rows taken every 64 rows.
   The declared output shape is {"Varying", 96, 64}.
   Fix: the Rule arrows (->) of every option were missing in this cell. *)
encoder = With[
  {
    enc = NetEncoder[{
      "AudioMelSpectrogram",
      "SampleRate" -> 16000,
      "Normalization" -> None,
      "WindowSize" -> 400,
      "Offset" -> 160,
      "WindowFunction" -> HannWindow,
      "MinimumFrequency" -> 20,
      "MaximumFrequency" -> 8000,
      "NumberOfFilters" -> 64,
      "Augmentation" -> None,
      "TargetLength" -> All
    }]
  },
  NetEncoder[{
    "Function",
    Function[feat,
      Partition[
        (* Inputs shorter than one 96-row block are padded at the end so
           that Partition always yields at least one block. *)
        (* NOTE(review): the padding is passed as Padding -> "Fixed", as in
           the original cell; confirm ArrayPad accepts this option form in
           the targeted Wolfram Language version. *)
        If[Length[feat] < 96,
          ArrayPad[feat, {{0, 96 - Length[feat]}}, Padding -> "Fixed"],
          feat
        ],
        96, 64
      ]
    ] /@ Normal[enc[#1]] &,
    {"Varying", 96, 64},
    "Batched" -> True
  }]
]
Out[]=
NetEncoder
Decoder
Decoder
In[]:=
(* NOTE(review): this cell is incomplete as exported: the brackets around the
   NetDecoder specification and the list of class labels after "Class" are
   missing, and the expression ends with a dangling comma. It cannot be
   evaluated as written. The label list presumably needs 527 entries, to match
   the 527-channel "fc7" layer and ReshapeLayer[{527}] of the core net; restore
   it from the original notebook before evaluating. *)
decoder=NetDecoder"Class",
Out[]=
NetDecoder
Internal Functions
Internal Functions
In[]:=
(* mobileunit: convolution + batch normalization + clipped ReLU.
   Returns an association of three named layers:
     "conv"<>prefix        ConvolutionLayer with nchannels output channels,
                           the given kernel, stride, padding and channel groups
     "conv"<>prefix<>"_bn" BatchNormalizationLayer
     "relu"<>prefix        elementwise activation clipping values to [0, 6]
   Fix: the Rule arrows (->) between keys/options and their values were
   missing in this cell. *)
mobileunit[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  "conv" <> prefix -> ConvolutionLayer[
    nchannels, kernel,
    "Stride" -> stride,
    "PaddingSize" -> pad,
    "ChannelGroups" -> ngroup
  ],
  "conv" <> prefix <> "_bn" -> BatchNormalizationLayer[],
  "relu" <> prefix -> ElementwiseLayer[(Min[Max[0, #], 6] &)]
|>
In[]:=
(* mobileunit2: convolution + batch normalization, with no activation.
   Returns an association of two named layers:
     "conv"<>prefix        ConvolutionLayer with nchannels output channels,
                           the given kernel, stride, padding and channel groups
     "conv"<>prefix<>"_bn" BatchNormalizationLayer
   Fix: the Rule arrows (->) between keys/options and their values were
   missing in this cell. *)
mobileunit2[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  "conv" <> prefix -> ConvolutionLayer[
    nchannels, kernel,
    "Stride" -> stride,
    "PaddingSize" -> pad,
    "ChannelGroups" -> ngroup
  ],
  "conv" <> prefix <> "_bn" -> BatchNormalizationLayer[]
|>
In[]:=
(* mobileunit3: same three layers as mobileunit (convolution, batch
   normalization, activation clipped to [0, 6]) but with asymmetric padding
   computed from the kernel size instead of taken from the pad argument.
   For each of the two dimensions the padding is
     {Ceiling[k/2] - 2, Ceiling[k/2]}   (before, after)
   where k is the kernel size in that dimension, so kernel must be a
   two-element list. The pad argument is accepted to keep the same signature
   as the other unit builders but is not used.
   Fix: the Rule arrows (->) between keys/options and their values were
   missing in this cell. *)
mobileunit3[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  "conv" <> prefix -> ConvolutionLayer[
    nchannels, kernel,
    "Stride" -> stride,
    "PaddingSize" -> {
      {Ceiling[kernel[[1]]/2] - 2, Ceiling[kernel[[2]]/2]},
      {Ceiling[kernel[[1]]/2] - 2, Ceiling[kernel[[2]]/2]}
    },
    "ChannelGroups" -> ngroup
  ],
  "conv" <> prefix <> "_bn" -> BatchNormalizationLayer[],
  "relu" <> prefix -> ElementwiseLayer[(Min[Max[0, #], 6] &)]
|>
In[]:=
(* invresunit: three-stage unit without a skip connection.
   Returns <|prefix -> NetChain[...]|> whose chain is, in order:
     "_expand"  mobileunit:  1x1 convolution to ngroup channels
     "_dwise"   mobileunit:  convolution with the given kernel/stride/pad,
                             ngroup channels split into ngroup channel groups
     "_linear"  mobileunit2: 1x1 convolution to nchannels, no activation
   Fix: the Rule arrow (->) between prefix and the NetChain was missing. *)
invresunit[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  prefix -> NetChain[
    Join[
      mobileunit[prefix <> "_expand", ngroup, 1, 1, 0, 1],
      mobileunit[prefix <> "_dwise", ngroup, kernel, stride, pad, ngroup],
      mobileunit2[prefix <> "_linear", nchannels, 1, 1, 0, 1]
    ]
  ]
|>
In[]:=
(* invresunit2: like invresunit but without the "_expand" stage.
   Returns <|prefix -> NetChain[...]|> whose chain is, in order:
     "_dwise"   mobileunit:  convolution with the given kernel/stride/pad,
                             ngroup channels split into ngroup channel groups
     "_linear"  mobileunit2: 1x1 convolution to nchannels, no activation
   Fix: the Rule arrow (->) between prefix and the NetChain was missing. *)
invresunit2[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  prefix -> NetChain @ Join[
    mobileunit[prefix <> "_dwise", ngroup, kernel, stride, pad, ngroup],
    mobileunit2[prefix <> "_linear", nchannels, 1, 1, 0, 1]
  ]
|>
In[]:=
(* invresunit3: like invresunit, but the "_dwise" stage is built with
   mobileunit3, i.e. with padding derived from the kernel size rather than
   from the pad argument.
   Returns <|prefix -> NetChain[...]|> whose chain is, in order:
     "_expand"  mobileunit:  1x1 convolution to ngroup channels
     "_dwise"   mobileunit3: convolution with the given kernel/stride,
                             ngroup channels split into ngroup channel groups
     "_linear"  mobileunit2: 1x1 convolution to nchannels, no activation
   Fix: the Rule arrow (->) between prefix and the NetChain was missing. *)
invresunit3[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := <|
  prefix -> NetChain @ Join[
    mobileunit[prefix <> "_expand", ngroup, 1, 1, 0, 1],
    mobileunit3[prefix <> "_dwise", ngroup, kernel, stride, pad, ngroup],
    mobileunit2[prefix <> "_linear", nchannels, 1, 1, 0, 1]
  ]
|>
In[]:=
(* mobilenetblock: the same expand / dwise / linear layers as invresunit,
   wired as a NetGraph with a skip connection: the block input is added
   (ThreadingLayer[Plus]) to the output of the last layer.
   Returns <|prefix -> NetGraph[...]|>.
   Fix: the Rule arrow (->) in the NetPort["Input"] -> "sum_"<>prefix edge
   was missing, so the skip connection was not a valid edge. *)
mobilenetblock[prefix_, nchannels_, kernel_, stride_, pad_, ngroup_] := Module[{layers},
  layers = Join[
    mobileunit[prefix <> "_expand", ngroup, 1, 1, 0, 1],
    mobileunit[prefix <> "_dwise", ngroup, kernel, stride, pad, ngroup],
    mobileunit2[prefix <> "_linear", nchannels, 1, 1, 0, 1]
  ];
  <|
    prefix -> NetGraph[
      Append[layers, "sum_" <> prefix -> ThreadingLayer[Plus]],
      Join[
        (* Fold over the reversed keys builds the nested rule
           k1 -> k2 -> ... -> kn that chains the layers in order. *)
        {Fold[Rule[#2, #1] &, Reverse @ Keys[layers]]},
        {
          (* Both the last layer and the block input feed the sum. *)
          Last[Keys[layers]] -> "sum_" <> prefix,
          NetPort["Input"] -> "sum_" <> prefix
        }
      ]
    ]
  |>
]
In[]:=
(* genAudioIdentifyNet: builds the core NetChain that maps one {dim, 64}
   input array to a vector of 527 values in (0, 1).
   Parameters:
     c1 .. c9  output channel counts of the successive stages
     g1 .. g7  channel counts / channel groups passed as ngroup to the units
     p         first dimension of the "pool6" mean-pooling kernel ({p, 2})
     dim       first dimension of the input; the input is reshaped to
               {1, 96, 64}, so dim*64 must equal 96*64
   Layer order: reshape -> log(x + .001) -> stem ("1") -> inverted-residual
   units and mobilenet blocks "2_1" .. "6_3" -> "6_4" 1x1 unit -> mean
   pooling -> 1x1 convolution to 527 channels -> reshape to {527} -> logistic
   sigmoid.
   Fix: the Rule arrows (->) were missing on the "1", "6_4", "pool6",
   "fc7", "reshape2", "logistic" entries and on the "Function" and "Input"
   options. *)
genAudioIdentifyNet[c1_, c2_, c3_, c4_, c5_, c6_, c7_, c8_, c9_, g1_, g2_, g3_, g4_, g5_, g6_, g7_, p_, dim_] := NetChain[
  Join[
    <|
      "reshape1" -> ReshapeLayer[{1, 96, 64}],
      "log" -> ElementwiseLayer[Log[# + .001] &],
      "1" -> NetChain @ mobileunit3["1", c1, {3, 3}, {2, 2}, {0, 0}, 1]
    |>,
    invresunit2["2_1", c2, {3, 3}, {1, 1}, {1, 1}, g1],
    invresunit3["2_2", c3, {3, 3}, {2, 2}, {0, 0}, g2],
    mobilenetblock["3_1", c3, {3, 3}, {1, 1}, {1, 1}, g3],
    invresunit3["3_2", c4, {3, 3}, {2, 2}, {0, 0}, g3],
    mobilenetblock["4_1", c4, {3, 3}, {1, 1}, {1, 1}, g4],
    mobilenetblock["4_2", c4, {3, 3}, {1, 1}, {1, 1}, g4],
    invresunit3["4_3", c5, {3, 3}, {2, 2}, {0, 0}, g4],
    mobilenetblock["4_4", c5, {3, 3}, {1, 1}, {1, 1}, g5],
    mobilenetblock["4_5", c5, {3, 3}, {1, 1}, {1, 1}, g5],
    mobilenetblock["4_6", c5, {3, 3}, {1, 1}, {1, 1}, g5],
    invresunit["4_7", c6, {3, 3}, {1, 1}, {1, 1}, g5],
    mobilenetblock["5_1", c6, {3, 3}, {1, 1}, {1, 1}, g6],
    mobilenetblock["5_2", c6, {3, 3}, {1, 1}, {1, 1}, g6],
    invresunit3["5_3", c7, {3, 3}, {2, 2}, {0, 0}, g6],
    mobilenetblock["6_1", c7, {3, 3}, {1, 1}, {1, 1}, g7],
    mobilenetblock["6_2", c7, {3, 3}, {1, 1}, {1, 1}, g7],
    invresunit["6_3", c8, {3, 3}, {1, 1}, {1, 1}, g7],
    <|
      "6_4" -> NetChain @ mobileunit["6_4", c9, {1, 1}, {1, 1}, {0, 0}, 1],
      "pool6" -> PoolingLayer[{p, 2}, {1, 1}, "Function" -> Mean],
      (* 1x1 convolution used as the classification layer: 527 outputs. *)
      "fc7" -> ConvolutionLayer[527, {1, 1}],
      "reshape2" -> ReshapeLayer[{527}],
      "logistic" -> LogisticSigmoid
    |>
  ],
  "Input" -> {dim, 64}
]
In[]:=
(* Core net, "Large" configuration. The arguments are listed in the order of
   genAudioIdentifyNet's parameters and applied to it as a single list. *)
coreNetLarge = genAudioIdentifyNet @@ {
  40, 24, 32, 40, 80, 128, 208, 416, 1664,  (* c1 .. c9 *)
  40, 144, 192, 240, 480, 768, 1248,        (* g1 .. g7 *)
  3,                                        (* p *)
  96                                        (* dim *)
}
Out[]=
NetChain
In[]:=
(* Core net, "Small" configuration. The arguments are listed in the order of
   genAudioIdentifyNet's parameters and applied to it as a single list. *)
coreNetSmall = genAudioIdentifyNet @@ {
  32, 16, 24, 32, 64, 96, 160, 320, 1280,  (* c1 .. c9 *)
  32, 96, 144, 192, 384, 576, 960,         (* g1 .. g7 *)
  3,                                       (* p *)
  96                                       (* dim *)
}
Out[]=
NetChain
Final Net
Final Net
In[]:=
(* Final net: applies coreNetLarge to every element of the variable-length
   sequence produced by the encoder (NetMapOperator), then takes the maximum
   over the sequence dimension (AggregationLayer[Max, 1]). The encoder and
   decoder defined earlier in this notebook are attached to the input and
   output ports.
   Fix: the Rule arrows (->) after "Input" and "Output" were missing. *)
audioIdentifyNetLarge = NetChain[
  {
    NetMapOperator[coreNetLarge],
    AggregationLayer[Max, 1]
  },
  "Input" -> encoder,
  "Output" -> decoder
]