summaryrefslogtreecommitdiff
blob: 6e99ff6849d280c436ef9b938bc2b16982ae5614 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# Piper audio feature extraction: schema for low-level operation
#
# This file is formatted to 130 characters width, in order to fit the
# comments next to the schema definitions.
#
# Copyright (c) 2015-2017 Queen Mary, University of London, provided
# under a BSD-style licence. See the file COPYING for details.

@0xc4b1c6c44c999206;
# Unique 64-bit ID that Cap'n Proto requires every schema file to declare.

# Import the C++ code-generator annotations and use them to place the generated C++ code in namespace "piper".
using Cxx = import "/capnp/c++.capnp";
$Cxx.namespace("piper");

struct Basic {
    # Basic metadata common to many Piper structures. In this schema it is
    # embedded in ParameterDescriptor, OutputDescriptor and ExtractorStaticData
    # to identify and label parameters, outputs and extractors respectively.

    identifier         @0  :Text;                 # A computer-readable string. Must match the regex /^[a-zA-Z0-9_-]+$/.
    name               @1  :Text;                 # A short human-readable name or label. Must be present.
    description        @2  :Text;                 # An optional human-readable descriptive text that may accompany the name.
}

struct ParameterDescriptor {
    # Properties of an adjustable parameter. A parameter's value is just a single
    # float, but the descriptor explains how to interpret and present that value.
    # A Piper feature extractor has a static list of parameters. The properties of
    # a given parameter never change, in contrast to output descriptors, which
    # may have different properties depending on the configuration of the extractor.
    #
    # The descriptors are published in ExtractorStaticData.parameters. Values are
    # supplied through Configuration.parameterValues, where each value is keyed by
    # the parameter's basic identifier.

    basic              @0  :Basic;                # Basic metadata about the parameter.
    unit               @1  :Text;                 # Human-recognisable unit of the parameter (e.g. Hz). May be left empty.
    minValue           @2  :Float32     = 0.0;    # Minimum value. Must be provided.
    maxValue           @3  :Float32     = 0.0;    # Maximum value. Must be provided.
    defaultValue       @4  :Float32     = 0.0;    # Default if the parameter is not set to anything else. Must be provided.
    isQuantized        @5  :Bool        = false;  # True if parameter values are quantized to a particular resolution.
    quantizeStep       @6  :Float32     = 0.0;    # Quantization resolution, if isQuantized.
    valueNames         @7  :List(Text)  = [];     # Optional human-readable labels for the values, if isQuantized.
}

enum SampleType {
    # How returned features are spaced on the input timeline. This is the type
    # of ConfiguredOutputDescriptor.sampleType. For fixedSampleRate the rate is
    # given by ConfiguredOutputDescriptor.sampleRate; for variableSampleRate
    # every Feature must set hasTimestamp and carry its own timestamp.

    oneSamplePerStep   @0;                        # Each process input returns a feature aligned with that input's timestamp.
    fixedSampleRate    @1;                        # Features are equally spaced at a given sample rate.
    variableSampleRate @2;                        # Features have their own individual timestamps.
}

struct StaticOutputDescriptor {
    # The configuration-independent properties of an output. An output is a single
    # stream of features produced in response to process and finish requests. A
    # feature extractor may have any number of outputs, and it always calculates
    # and returns features from all of them when processing; this is useful in
    # cases where more than one feature can be easily calculated using a single method.
    #
    # Only properties that are static (i.e. that do not depend on the parameter
    # values provided at configuration) appear here, and the Basic struct
    # properties such as id and description are excluded. That exclusion is for
    # historical reasons: they were already referenced separately in the
    # OutputDescriptor and ExtractorStaticData before this struct was introduced.

    typeURI            @0  :Text;                 # URI identifying the sort of feature this output returns (see docs).
}

struct ConfiguredOutputDescriptor {
    # Properties of an output, that is, a single stream of features produced
    # in response to process and finish requests. A feature extractor may
    # have any number of outputs, and it always calculates and returns features
    # from all of them when processing; this is useful in cases where more
    # than one feature can be easily calculated using a single method.
    # This structure contains the properties of an output that are not static,
    # i.e. that may depend on the parameter values provided at configuration.
    #
    # It appears as the "configured" member of OutputDescriptor. Several fields
    # are only meaningful when a companion flag or enum value is set, as noted
    # against each field below.

    unit               @0  :Text;                 # Human-recognisable unit of the bin values in output features. May be empty.
    hasFixedBinCount   @1  :Bool        = false;  # True if this output has an equal number of values in each returned feature.
    binCount           @2  :Int32       = 0;      # Number of values per feature for this output, if hasFixedBinCount.
    binNames           @3  :List(Text)  = [];     # Optional human-readable labels for the value bins, if hasFixedBinCount.
    hasKnownExtents    @4  :Bool        = false;  # True if all feature values fall within the same fixed min/max range.
    minValue           @5  :Float32     = 0.0;    # Minimum value in range for any value from this output, if hasKnownExtents.
    maxValue           @6  :Float32     = 0.0;    # Maximum value in range for any value from this output, if hasKnownExtents.
    isQuantized        @7  :Bool        = false;  # True if feature values are quantized to a particular resolution.
    quantizeStep       @8  :Float32     = 0.0;    # Quantization resolution, if isQuantized.
    sampleType         @9  :SampleType;           # How returned features from this output are spaced on the input timeline.
    sampleRate         @10 :Float32     = 0.0;    # Sample rate (features per second) if sampleType == fixedSampleRate.
    hasDuration        @11 :Bool        = false;  # True if features returned from this output will have a duration.
}

struct OutputDescriptor {
    # All the properties of an output, both static (the basic metadata and static
    # descriptor) and potentially dependent on configuration parameters (the
    # configured descriptor). A list of these, one per output, is returned in
    # ConfigurationResponse.outputs.

    basic              @0  :Basic;                # Basic metadata about the output.
    configured         @1  :ConfiguredOutputDescriptor;    # Properties of the output that may depend on configuration parameters.
    static             @2  :StaticOutputDescriptor;        # Properties (other than Basic) that do not depend on parameters.
}

enum InputDomain {
    # Whether a feature extractor requires time-domain audio input (i.e.
    # "normal" or "unprocessed" audio samples) or frequency-domain input
    # (i.e. resulting from windowed, usually overlapping, short-time
    # Fourier transforms). Reported in ExtractorStaticData.inputDomain.

    timeDomain         @0;                        # The extractor requires time-domain audio samples as input.
    frequencyDomain    @1;                        # The extractor requires input to have been pre-processed using windowed STFTs.
}

struct ExtractorStaticData {
    # Static properties of a feature extractor. That is, metadata about the
    # extractor that are the same regardless of how you configure or run it.
    # This structure is returned in both list responses and load responses.

    key                @0  :Text;                 # String that "globally" identifies the extractor, used to load it (see docs).
    basic              @1  :Basic;                # Basic metadata about the extractor.
    maker              @2  :Text;                 # Human-readable text naming the author or vendor of the extractor.
    rights             @3  :Text;                 # Human-readable summary of copyright and/or licensing terms for the extractor.
    version            @4  :Int32;                # Version number of extractor; must increase if new algorithm changes results.
    category           @5  :List(Text);           # List of general->specific category labels for this extractor (see docs).
    minChannelCount    @6  :Int32;                # Minimum number of input channels of audio this extractor can accept.
    maxChannelCount    @7  :Int32;                # Maximum number of input channels of audio this extractor can accept.
    parameters         @8  :List(ParameterDescriptor);    # List of configurable parameter properties for the feature extractor.
    programs           @9  :List(Text);           # List of predefined programs. For backward-compatibility, not recommended.
    inputDomain        @10 :InputDomain;          # Whether the extractor requires time-domain or frequency-domain input audio.
    basicOutputInfo    @11 :List(Basic);          # Basic metadata about all of the outputs of the extractor.

    struct SOPair {
        # A mapping between output identifier and static descriptor for
        # that output.

        output         @0  :Text;                 # Output id, matching the output's descriptor's basic identifier.
        static         @1  :StaticOutputDescriptor;    # Static (configuration-independent) properties of that output.
    }

    staticOutputInfo   @12 :List(SOPair);         # Static descriptors for all outputs that have any static metadata.
}

struct RealTime {
    # A time value split into seconds and nanoseconds. A timestamp expressed
    # with this structure is measured relative to "start of audio".

    sec                @0  :Int32       = 0;      # Seconds part of the time.
    nsec               @1  :Int32       = 0;      # Nanoseconds part. Its sign must match that of sec, unless sec == 0.
}

struct ProcessInput {
    # Audio and timing input data provided to a process request. It is carried
    # in ProcessRequest.processInput, and its framing must follow that given in
    # the configuration response.

    inputBuffers       @0  :List(List(Float32));  # A single block of audio data (time or frequency domain) for each channel.
    timestamp          @1  :RealTime;             # Time of start of block (time-domain) or "centre" of it (frequency-domain).
}

struct Feature {
    # A single feature calculated and returned from a process or finish request.
    # Features are delivered grouped by output inside a FeatureSet. The timestamp
    # and duration fields are only meaningful when the corresponding has* flag
    # is true.

    hasTimestamp       @0  :Bool        = false;  # True if feature has a timestamp. Must be true for a variableSampleRate output.
    timestamp          @1  :RealTime;             # Timestamp of feature, if hasTimestamp.
    hasDuration        @2  :Bool        = false;  # True if feature has a duration. Must be true if output's hasDuration is true.
    duration           @3  :RealTime;             # Duration of feature, if hasDuration.
    label              @4  :Text;                 # Optional human-readable text attached to feature.
    featureValues      @5  :List(Float32) = [];   # The feature values themselves (of size binCount, if output hasFixedBinCount).
}

struct FeatureSet {
    # The set of all features, across all outputs, calculated and returned from
    # a single process or finish request. It is carried in the features field
    # of ProcessResponse and FinishResponse.

    struct FSPair {
        # A mapping between output identifier and ordered list of features for
        # that output.

        output         @0  :Text;                 # Output id, matching the output's descriptor's basic identifier.
        features       @1  :List(Feature) = [];   # Features calculated for that output during the current request, in time order.
    }

    featurePairs       @0  :List(FSPair);         # The feature lists for all outputs for which any features have been calculated.
}

struct Framing {
    # Describes how audio is to be cut into individual buffers for input. For a
    # feature extractor that accepts frequency-domain input, this framing
    # applies prior to the STFT transform.
    #
    # In some contexts these values are mandatory; in others one or both may be
    # zero, meaning "don't care". The documentation of each structure that
    # includes a framing field gives the details.

    blockSize          @0  :Int32;                # Time-domain audio samples in each buffer (on each channel).
    stepSize           @1  :Int32;                # Samples to advance from one buffer to the next: blockSize means no overlap.
}

struct Configuration {
    # The parameter values and other configuration data that together set up a
    # feature-extraction procedure.

    struct PVPair {
        # Associates a parameter identifier with a value for that parameter.

        parameter      @0  :Text;                 # Parameter id; equals the basic identifier in that parameter's descriptor.
        value          @1  :Float32;              # Value to assign (within the constraints given in the parameter descriptor).
    }

    parameterValues    @0  :List(PVPair);         # Values for every parameter, or at least those being changed from defaults.
    currentProgram     @1  :Text;                 # Chosen predefined program. For backward-compatibility, not recommended.
    channelCount       @2  :Int32;                # How many channels of audio the input has.
    framing            @3  :Framing;              # Block and step size with which the input is framed.
}

enum AdapterFlag {
    # Flags that may be used when requesting a server to load a feature
    # extractor, to ask the server to do some of the work of framing and input
    # conversion instead of leaving it to the client side. These affect the
    # apparent behaviour of the loaded extractor. The flags are passed in
    # LoadRequest.adapterFlags; the constants adaptAllSafe and adaptAll provide
    # ready-made sets of them.

    adaptInputDomain   @0;                        # Input-domain conversion, so the extractor always expects time-domain input.
    adaptChannelCount  @1;                        # Channel mixing or duplication, so any number of input channels is acceptable.
    adaptBufferSize    @2;                        # Framing, so the extractor accepts any blockSize of non-overlapping buffers.
}

const adaptAllSafe :List(AdapterFlag) =
    [ adaptInputDomain, adaptChannelCount ];
    # The set of adapter flags that can always be applied, leaving results unchanged.
    # It holds adaptInputDomain and adaptChannelCount, and leaves out adaptBufferSize.

const adaptAll :List(AdapterFlag) =
    [ adaptInputDomain, adaptChannelCount, adaptBufferSize ];
    # The set of adapter flags that may cause "equivalent" results to be returned (see documentation).
    # It holds all three AdapterFlag values: adaptInputDomain, adaptChannelCount and adaptBufferSize.

struct ListRequest {
    # Asks a server for a list of the feature extractors it has available.

    from               @0  :List(Text);           # When non-empty, limits the list to extractors found in the given "libraries".
}

struct ListResponse {
    # Sent in reply to a list request that succeeded.

    available          @0  :List(ExtractorStaticData);    # Static data for each of the available feature extractors.
}

struct LoadRequest {
    # Asks a server to load a feature extractor and hand back a handle to it.

    key                @0  :Text;                 # Extractor key, as found in the extractor's static data structure.
    inputSampleRate    @1  :Float32;              # Sample rate of the input audio. Extractor properties may depend on this.
    adapterFlags       @2  :List(AdapterFlag);    # Optional set of flags requesting any framing and input conversion.
}

struct LoadResponse {
    # Sent in reply to a load request that succeeded.

    handle             @0  :Int32;                # Handle by which future requests refer to the loaded feature extractor.
    staticData         @1  :ExtractorStaticData;  # Static data for this feature extractor, identical to that in list response.
    defaultConfiguration @2  :Configuration;      # The extractor's default parameter values and preferred input framing.
}

struct ConfigurationRequest {
    # Asks a server to configure a loaded feature extractor and make it ready
    # for use. No process request can be made on a feature extractor until
    # this request has been carried out on it.

    handle             @0  :Int32;                # Handle returned in the load response when this extractor was loaded.
    configuration      @1  :Configuration;        # Parameter values to set, together with the client's preferred input framing.
}

struct ConfigurationResponse {
    # Response to a successful configuration request. It reports the output
    # properties as they stand after configuration, and the framing that the
    # client must use for the process requests that follow.

    handle             @0  :Int32;                # Handle of extractor, as passed in the configuration request.
    outputs            @1  :List(OutputDescriptor);       # Full set of properties of all outputs following configuration.
    framing            @2  :Framing;              # Input framing that must be used for subsequent process requests.
}

struct ProcessRequest {
    # Request a server to process a buffer of audio using a loaded and
    # configured feature extractor. A configuration request must have been
    # carried out on the extractor before any process request is made.

    handle             @0  :Int32;                # Handle as returned in the load response from the loading of this extractor.
    processInput       @1  :ProcessInput;         # Audio in the input domain, with framing as in the configuration response.
}

struct ProcessResponse {
    # Response to a successful process request. The feature set holds entries
    # only for those outputs for which any features have been calculated.

    handle             @0  :Int32;                # Handle of extractor, as passed in the process request.
    features           @1  :FeatureSet;           # All features across all outputs calculated during this process request.
}

struct FinishRequest {
    # Request a server to finish processing and unload a loaded feature
    # extractor. This request may be made at any time -- the extractor does
    # not have to have been configured or used. The extractor handle cannot
    # be used again with this server afterwards. A successful request is
    # answered with a FinishResponse.

    handle             @0  :Int32;                # Handle as returned in the load response from the loading of this extractor.
}

struct FinishResponse {
    # Response to a successful finish request. The handle is echoed back for
    # reference only: the extractor has been unloaded, so the handle is no
    # longer usable.

    handle             @0  :Int32;                # Handle of extractor, as passed in the finish request. May not be used again.
    features           @1  :FeatureSet;           # Features the extractor has calculated now that it knows all input has ended.
}

struct Error {
    # Response to any request that fails. It is carried as the "error" member
    # of the RpcResponse response union, in place of a success response.
    # The meaning of individual code values is not defined in this file.

    code               @0  :Int32;                # Error code.
    message            @1  :Text;                 # Error message.
}

struct RpcRequest {
    # Request bundle for use when using Cap'n Proto serialisation without
    # Cap'n Proto RPC layer. For Cap'n Proto RPC, see piper.rpc.capnp.
    # A bundle holds an id and exactly one of the request structures.

    id :union {
        # Identifier used solely to associate a response packet with its
        # originating request. Server does not examine the contents of this,
        # it just copies the request id structure into the response.

        number         @0  :Int32;                # Request id given as a number.
        tag            @1  :Text;                 # Request id given as a text tag.
        none           @2  :Void;                 # No request id given.
    }

    request :union {
        # For more details, see the documentation for the individual
        # request structures.

        list           @3  :ListRequest;          # Provide a list of available feature extractors.
        load           @4  :LoadRequest;          # Load a feature extractor and return a handle to it.
        configure      @5  :ConfigurationRequest; # Configure a loaded feature extractor, set parameters, and prepare it for use.
        process        @6  :ProcessRequest;       # Process a single fixed-size buffer of audio and return calculated features.
        finish         @7  :FinishRequest;        # Get any remaining features and unload the extractor.
    }
}

struct RpcResponse {
    # Response bundle for use when using Cap'n Proto serialisation without
    # Cap'n Proto RPC layer. For Cap'n Proto RPC, see piper.rpc.capnp.
    # A bundle holds the id copied from the request and either an error or
    # exactly one of the success response structures.

    id :union {
        # Identifier used solely to associate a response packet with its
        # originating request. Server does not examine the contents of this,
        # it just copies the request id structure into the response.

        number         @0  :Int32;                # Request id given as a number.
        tag            @1  :Text;                 # Request id given as a text tag.
        none           @2  :Void;                 # No request id given.
    }

    response :union {
        # For more details, see the documentation for the individual
        # response structures.

        error          @3  :Error;                # The request (of whatever type) failed.
        list           @4  :ListResponse;         # List succeeded: here is static data about the requested extractors.
        load           @5  :LoadResponse;         # Load succeeded: here is a handle for the loaded extractor.
        configure      @6  :ConfigurationResponse; # Configure succeeded: ready to process, here are values such as block size.
        process        @7  :ProcessResponse;      # Process succeeded: here are all features calculated from this input block.
        finish         @8  :FinishResponse;       # Finish succeeded: extractor unloaded, here are all remaining features.
    }
}