Browse Source

Merge pull request #2675 from xiang90/v3rfc

doc: v3api rfc
Xiang Li 10 years ago
parent
commit
6ee5cd9105
2 changed files with 463 additions and 0 deletions
  1. 191 0
      Documentation/rfc/v3api.md
  2. 272 0
      Documentation/rfc/v3api.proto

+ 191 - 0
Documentation/rfc/v3api.md

@@ -0,0 +1,191 @@
+## Design
+
+1. Flat binary key-value space
+    
+2. Keep the event history until compaction
+    - access to old version of keys
+    - user controlled history compaction
+    
+3. Support range query
+    - Pagination support with limit argument
+    - Support consistency guarantee across multiple range queries
+    
+4. Replace TTL key with Lease
+    - more efficient/ low cost keep alive
+    - a logical group of TTL keys
+    
+5. Replace CAS/CAD with multi-object Tnx
+    - MUCH MORE powerful and flexible
+    
+6. Support efficient watching with multiple ranges
+
+7. RPC API supports the complete set of APIs.
+    - more efficient than JSON/HTTP
+    - additional tnx/lease support
+
+8. HTTP API supports a subset of APIs.
+    - easy for people to try out etcd
+    - easy for people to write simple etcd applications
+
+
+## Protobuf Defined API
+
+[protobuf](./v3api.proto)
+
+### Examples
+
+#### Put a key (foo=bar)
+```
+// A put is always successful
+Put( PutRequest { key = foo, value = bar } )
+
+PutResponse { 
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 1,
+    raft_term = 0x1,
+}
+```
+
+#### Get a key (assume we have foo=bar)
+```
+Range ( RangeRequest { key = foo } )
+
+RangeResponse {
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 1,
+    raft_term = 0x1,
+    kvs = {
+      {
+          key = foo,
+          value = bar,
+          create_index = 1,
+          mod_index = 1,
+          version = 1;
+      },
+    },
+}
+```
+
+#### Range over a key space (assume we have foo0=bar0… foo100=bar100)
+```
+Range ( RangeRequest { key = foo, range_end = foo80, limit = 30 } )
+
+RangeResponse {
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 100,
+    raft_term = 0x1,
+    kvs = {
+      {
+          key = foo0,
+          value = bar0,
+          create_index = 1,
+          mod_index = 1,
+          version = 1;
+      },
+         ...,
+      {
+          key = foo30,
+          value = bar30,
+          create_index = 30,
+          mod_index = 30,
+          version = 1;
+      },
+    },
+}
+```
+
+#### Finish a tnx (assume we have foo0=bar0, foo1=bar1)
+```
+Tnx(TnxRequest {
+    // mod_index of foo0 is equal to 1, mod_index of foo1 is greater than 1
+    compare = {
+        {compareType = equal, key = foo0, mod_index = 1},
+        {compareType = greater, key = foo1, mod_index = 1},
+    },
+    // if the comparison succeeds, put foo2=success
+    success = {PutRequest { key = foo2, value = success }},
+    // if the comparison fails, put foo2=failure
+    failure = {PutRequest { key = foo2, value = failure }},
+})
+
+TnxResponse {
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 3,
+    raft_term = 0x1,
+    succeeded = true,
+    responses = {
+      // response of PUT foo2=success
+      {
+            cluster_id = 0x1000,
+            member_id = 0x1,
+            index = 3,
+            raft_term = 0x1,
+        }
+    }
+}
+```
+
+#### Watch on a key/range
+
+```
+WatchRange( WatchRangeRequest{
+           key = foo,
+           range_end = fop, // prefix foo
+           start_index = 20,
+           end_index = 10000,
+           // server decided notification frequency
+           progress_notification = true,
+       } 
+       … // this can be a watch request stream
+      )
+
+// put (foo0=bar0) event at 3
+WatchResponse {
+    cluster_id = 0x1000,
+    member_id = 0x1,
+    index = 3,
+    raft_term = 0x1,
+    event_type = put,
+    kv = {
+              key = foo0,
+              value = bar0,
+              create_index = 1,
+              mod_index = 1,
+              version = 1;
+          },
+    }
+    …
+    
+    // a notification at 2000
+    WatchResponse {
+        cluster_id = 0x1000,
+        member_id = 0x1,
+        index = 2000,
+        raft_term = 0x1,
+        // nil event as notification
+    }
+    
+    … 
+    
+    // put (foo0=bar3000) event at 3000
+    WatchResponse {
+        cluster_id = 0x1000,
+        member_id = 0x1,
+        index = 3000,
+        raft_term = 0x1,
+        event_type = put,
+        kv = {
+                key = foo0,
+                value = bar3000,
+                create_index = 1,
+                mod_index = 3000,
+                version = 2;
+          },
+    }
+    …
+    
+```

+ 272 - 0
Documentation/rfc/v3api.proto

@@ -0,0 +1,272 @@
+syntax = "proto3";
+
+// Interface exported by the server.
+service etcd {
+  // Range gets the keys in the range from the store.
+  rpc Range(RangeRequest) returns (RangeResponse) {}
+
+  // Put puts the given key into the store.
+  // A put request increases the index of the store,
+  // and generates one event in the event history.
+  rpc Put(PutRequest) returns (PutResponse) {}
+
+  // DeleteRange deletes the given range from the store.
+  // A delete request increases the index of the store,
+  // and generates one event in the event history.
+  rpc DeleteRange(DeleteRangeRequest) returns (DeleteRangeResponse) {}
+
+  // Tnx processes all the requests in one transaction.
+  // A tnx request increases the index of the store,
+  // and generates events with the same index in the event history.
+  rpc Tnx(TnxRequest) returns (TnxResponse) {}
+
+  // WatchRange watches the events happening or happened in etcd. Both input and
+  // output are streams. One watch rpc can watch multiple ranges and get a stream
+  // of events. The whole event history can be watched unless compacted.
+  rpc WatchRange(stream WatchRangeRequest) returns (stream WatchRangeResponse) {}
+
+  // Compact compacts the event history in etcd. Users should compact the
+  // event history periodically, or it will grow infinitely.
+  rpc Compact(CompactionRequest) returns (CompactionResponse) {}
+
+  // LeaseCreate creates a lease. A lease has a TTL. The lease will expire if the
+  // server does not receive a keepAlive within TTL from the lease holder.
+  // All keys attached to the lease will be expired and deleted if the lease expires.
+  // The key expiration generates an event in the event history.
+  rpc LeaseCreate(LeaseCreateRequest) returns (LeaseCreateResponse) {}
+
+  // LeaseRevoke revokes a lease. All the keys attached to the lease will be expired and deleted.
+  rpc LeaseRevoke(LeaseRevokeRequest) returns (LeaseRevokeResponse) {}
+
+  // LeaseAttach attaches keys to a lease.
+  rpc LeaseAttach(LeaseAttachRequest) returns (LeaseAttachResponse) {}
+
+  // LeaseTnx is like Tnx. It has two additional LeaseAttachRequest lists: success
+  // and failure. If the Tnx is successful, the success list will be executed.
+  // Otherwise the failure list will be executed.
+  rpc LeaseTnx(LeaseTnxRequest) returns (LeaseTnxResponse) {}
+
+  // LeaseKeepAlive keeps the lease alive. Both input and output are streams.
+  rpc LeaseKeepAlive(stream LeaseKeepAliveRequest) returns (stream LeaseKeepAliveResponse) {}
+}
+
+message ResponseHeader { // metadata carried in the header field of every response message
+  // error message. NOTE(review): the RFC leaves open whether this should be a typed error message.
+  optional string error = 1;
+  optional uint64 cluster_id = 2;
+  optional uint64 member_id = 3;
+  // index of the store when the request was applied.
+  optional int64 index = 4;
+  // term of raft when the request was applied.
+  optional uint64 raft_term = 5;
+}
+
+message RangeRequest { // request of the Range rpc: reads a single key or a key range
+  // if the range_end is not given, the request returns the key.
+  optional bytes key = 1;
+  // if the range_end is given, it gets the keys in range [key, range_end).
+  optional bytes range_end = 2;
+  // limit the number of keys returned.
+  optional int64 limit = 3;
+  // the response will be consistent with the previous request with the same token
+  // if the token is given and is valid.
+  optional bytes consistent_token = 4;
+}
+
+message RangeResponse { // response of the Range rpc
+  optional ResponseHeader header = 1;
+  repeated KeyValue kvs = 2; // key-value pairs returned for the requested key or range
+  optional bytes consistent_token = 3; // NOTE(review): presumably reused as RangeRequest.consistent_token - confirm
+}
+
+message PutRequest { // request of the Put rpc; also usable inside a Tnx via RequestUnion
+  optional bytes key = 1; // key to put into the store
+  optional bytes value = 2; // value to store for the key
+}
+
+message PutResponse { // response of the Put rpc; carries only the common header
+  optional ResponseHeader header = 1;
+}
+
+message DeleteRangeRequest { // request of the DeleteRange rpc: deletes a key or a key range
+  // if the range_end is not given, the request deletes the key.
+  optional bytes key = 1;
+  // if the range_end is given, it deletes the keys in range [key, range_end).
+  optional bytes range_end = 2;
+}
+
+message DeleteRangeResponse { // response of the DeleteRange rpc; carries only the common header
+  optional ResponseHeader header = 1;
+}
+
+message RequestUnion { // one entry of a TnxRequest success or failure list
+  oneof request { // at most one of the request kinds below is set
+    RangeRequest request_range = 1;
+    PutRequest request_put = 2;
+    DeleteRangeRequest request_delete_range = 3;
+  }
+}
+
+message ResponseUnion { // one entry of TnxResponse.responses
+  oneof response { // at most one of the response kinds below is set
+    RangeResponse response_range = 1; // spelling fixed (was reponse_range); field number unchanged
+    PutResponse response_put = 2;
+    DeleteRangeResponse response_delete_range = 3;
+  }
+}
+
+message Compare { // a single comparison evaluated by a Tnx before its requests are processed
+  enum CompareType { // relation to test between the key's target and the value given in target
+    EQUAL = 0;
+    GREATER = 1;
+    LESS = 2;
+  }
+  optional CompareType type = 1;
+  // key path
+  optional bytes key = 2;
+  oneof target { // the property of the key to compare; at most one is set
+    // version of the given key
+    int64 version = 3;
+    // create index of the given key
+    int64 create_index = 4;
+    // last modified index of the given key
+    int64 mod_index = 5;
+    // value of the given key
+    bytes value = 6;
+  }
+}
+
+// First all the compare requests are processed.
+// If all the comparisons succeed, all the success
+// requests will be processed.
+// Otherwise all the failure requests will be processed and
+// all the errors in the comparison will be returned.
+//
+// From the Google PaxosDB paper:
+// Our implementation hinges around a powerful primitive which we call MultiOp. All other database
+// operations except for iteration are implemented as a single call to MultiOp. A MultiOp is applied atomically
+// and consists of three components:
+// 1. A list of tests called guard. Each test in guard checks a single entry in the database. It may check
+// for the absence or presence of a value, or compare with a given value. Two different tests in the guard
+// may apply to the same or different entries in the database. All tests in the guard are applied and
+// MultiOp returns the results. If all tests are true, MultiOp executes t op (see item 2 below), otherwise
+// it executes f op (see item 3 below).
+// 2. A list of database operations called t op. Each operation in the list is either an insert, delete, or
+// lookup operation, and applies to a single database entry. Two different operations in the list
+// may apply to the same or different entries in the database. These operations are executed
+// if guard evaluates to true.
+// 3. A list of database operations called f op. Like t op, but executed if guard evaluates to false.
+// In this message, guard corresponds to compare, t op to success and f op to failure.
+message TnxRequest {
+  repeated Compare compare = 1; // comparisons that must all succeed for the success list to run
+  repeated RequestUnion success = 2; // requests processed when all comparisons succeed
+  repeated RequestUnion failure = 3; // requests processed otherwise
+}
+
+message TnxResponse { // response of the Tnx rpc
+  optional ResponseHeader header = 1;
+  optional bool succeeded = 2; // NOTE(review): presumably true when all comparisons succeeded - confirm
+  repeated ResponseUnion responses = 3; // NOTE(review): presumably one response per processed request - confirm
+}
+
+message KeyValue { // a key, its value and its index/version metadata
+  optional bytes key = 1;
+  // create_index is the create index of the key.
+  optional int64 create_index = 2;
+  optional int64 mod_index = 3; // mod_index is the last modified index of the key.
+  // version is the version of the key. A deletion resets
+  // the version to zero and any modification of the key
+  // increases its version.
+  optional int64 version = 4;
+  optional bytes value = 5;
+}
+
+message WatchRangeRequest { // one message of the WatchRange request stream: a key or range to watch
+  // if the range_end is not given, the request watches the key.
+  optional bytes key = 1;
+  // if the range_end is given, it watches the keys in range [key, range_end).
+  optional bytes range_end = 2;
+  // start_index is an optional index (inclusive) to watch from. No start_index is "now".
+  optional int64 start_index = 3;
+  // end_index is an optional index (exclusive) to end watch. No end_index is "forever".
+  optional int64 end_index = 4;
+  optional bool progress_notification = 5; // NOTE(review): presumably asks for responses without events as progress markers - confirm
+}
+
+message WatchRangeResponse { // one message of the WatchRange response stream
+  optional ResponseHeader header = 1;
+  repeated Event events = 2; // events on the watched key or range
+}
+
+message Event { // a single change recorded in the event history
+  enum EventType { // kind of change the event describes
+    PUT = 0;
+    DELETE = 1;
+    EXPIRE = 2; // NOTE(review): presumably emitted when a key's lease expires - confirm
+  }
+  optional EventType event_type = 1;
+  // a put event contains the current key-value;
+  // a delete/expire event contains the previous
+  // key-value.
+  optional KeyValue kv = 2;
+}
+
+message CompactionRequest { // request of the Compact rpc
+  optional int64 index = 1; // NOTE(review): presumably the index up to which history is compacted; inclusivity is not stated - confirm
+}
+
+message CompactionResponse { // response of the Compact rpc; carries only the common header
+  optional ResponseHeader header = 1;
+}
+
+message LeaseCreateRequest { // request of the LeaseCreate rpc
+  // advisory ttl in seconds; the ttl actually granted is reported in LeaseCreateResponse.ttl
+  optional int64 ttl = 1;
+}
+
+message LeaseCreateResponse { // response of the LeaseCreate rpc
+  optional ResponseHeader header = 1;
+  optional int64 lease_id = 2; // id of the created lease, used by the other Lease* requests
+  // server decided ttl in seconds
+  optional int64 ttl = 3;
+  optional string error = 4; // NOTE(review): overlaps with ResponseHeader.error - confirm which one callers should read
+}
+
+message LeaseRevokeRequest { // request of the LeaseRevoke rpc
+  optional int64 lease_id = 1; // id of the lease to revoke
+}
+
+message LeaseRevokeResponse { // response of the LeaseRevoke rpc; carries only the common header
+  optional ResponseHeader header = 1;
+}
+
+message LeaseTnxRequest { // request of the LeaseTnx rpc: a Tnx plus lease attachments
+  optional TnxRequest request = 1; // the transaction to process
+  repeated LeaseAttachRequest success = 2; // attachments executed if the Tnx is successful
+  repeated LeaseAttachRequest failure = 3; // attachments executed otherwise
+}
+
+message LeaseTnxResponse { // response of the LeaseTnx rpc
+  optional ResponseHeader header = 1;
+  optional TnxResponse response = 2; // response of the embedded Tnx
+  repeated LeaseAttachResponse attach_responses = 3; // NOTE(review): presumably one per executed LeaseAttachRequest - confirm
+}
+
+message LeaseAttachRequest { // request of the LeaseAttach rpc; also used in LeaseTnxRequest lists
+  optional int64 lease_id = 1; // id of the lease to attach to
+  optional bytes key = 2; // key to attach to the lease
+}
+
+message LeaseAttachResponse { // response of the LeaseAttach rpc; carries only the common header
+  optional ResponseHeader header = 1;
+}
+
+message LeaseKeepAliveRequest { // one message of the LeaseKeepAlive request stream
+  optional int64 lease_id = 1; // id of the lease to keep alive
+}
+
+message LeaseKeepAliveResponse { // one message of the LeaseKeepAlive response stream
+  optional ResponseHeader header = 1;
+  optional int64 lease_id = 2; // NOTE(review): presumably the lease that was kept alive - confirm
+  optional int64 ttl = 3; // NOTE(review): presumably the ttl in seconds after the keep-alive - confirm
+}