aboutsummaryrefslogtreecommitdiff
path: root/include/linux/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/ceph')
-rw-r--r--include/linux/ceph/auth.h10
-rw-r--r--include/linux/ceph/ceph_frag.h4
-rw-r--r--include/linux/ceph/ceph_fs.h75
-rw-r--r--include/linux/ceph/decode.h57
-rw-r--r--include/linux/ceph/libceph.h63
-rw-r--r--include/linux/ceph/mon_client.h30
-rw-r--r--include/linux/ceph/msgpool.h1
-rw-r--r--include/linux/ceph/osd_client.h238
-rw-r--r--include/linux/ceph/osdmap.h168
-rw-r--r--include/linux/ceph/rados.h34
-rw-r--r--include/linux/ceph/string_table.h62
11 files changed, 545 insertions, 197 deletions
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 260d78b587c4..1563265d2097 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -12,9 +12,12 @@
*/
struct ceph_auth_client;
-struct ceph_authorizer;
struct ceph_msg;
+struct ceph_authorizer {
+ void (*destroy)(struct ceph_authorizer *);
+};
+
struct ceph_auth_handshake {
struct ceph_authorizer *authorizer;
void *authorizer_buf;
@@ -62,8 +65,6 @@ struct ceph_auth_client_ops {
struct ceph_auth_handshake *auth);
int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
struct ceph_authorizer *a, size_t len);
- void (*destroy_authorizer)(struct ceph_auth_client *ac,
- struct ceph_authorizer *a);
void (*invalidate_authorizer)(struct ceph_auth_client *ac,
int peer_type);
@@ -112,8 +113,7 @@ extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
int peer_type,
struct ceph_auth_handshake *auth);
-extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
- struct ceph_authorizer *a);
+void ceph_auth_destroy_authorizer(struct ceph_authorizer *a);
extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
int peer_type,
struct ceph_auth_handshake *a);
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index b827e066e55a..146507df8650 100644
--- a/include/linux/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
return ceph_frag_make(newbits,
ceph_frag_value(f) | (i << (24 - newbits)));
}
-static inline int ceph_frag_is_leftmost(__u32 f)
+static inline bool ceph_frag_is_leftmost(__u32 f)
{
return ceph_frag_value(f) == 0;
}
-static inline int ceph_frag_is_rightmost(__u32 f)
+static inline bool ceph_frag_is_rightmost(__u32 f)
{
return ceph_frag_value(f) == ceph_frag_mask(f);
}
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 37f28bf55ce4..7868d602c0a0 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -34,9 +34,9 @@
#define CEPH_MAX_MON 31
/*
- * ceph_file_layout - describe data layout for a file/inode
+ * legacy ceph_file_layout
*/
-struct ceph_file_layout {
+struct ceph_file_layout_legacy {
/* file -> object mapping */
__le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
of page size. */
@@ -53,32 +53,26 @@ struct ceph_file_layout {
__le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
} __attribute__ ((packed));
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
- ((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
- ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_pool(l) \
- ((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
- return le32_to_cpu(l->fl_stripe_unit) *
- le32_to_cpu(l->fl_stripe_count);
-}
-
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
- return le32_to_cpu(l->fl_object_size) *
- le32_to_cpu(l->fl_stripe_count);
-}
+struct ceph_string;
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ u32 stripe_unit; /* stripe unit, in bytes */
+ u32 stripe_count; /* over this many objects */
+ u32 object_size; /* until objects are this big */
+ s64 pool_id; /* rados pool id */
+ struct ceph_string __rcu *pool_ns; /* rados pool namespace */
+};
-#define CEPH_MIN_STRIPE_UNIT 65536
+extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+extern void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy);
+extern void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy);
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+#define CEPH_MIN_STRIPE_UNIT 65536
struct ceph_dir_layout {
__u8 dl_dir_hash; /* see ceph_hash.h for ids */
@@ -127,6 +121,7 @@ struct ceph_dir_layout {
/* client <-> mds */
#define CEPH_MSG_MDS_MAP 21
+#define CEPH_MSG_FS_MAP_USER 103
#define CEPH_MSG_CLIENT_SESSION 22
#define CEPH_MSG_CLIENT_RECONNECT 23
@@ -153,8 +148,9 @@ struct ceph_dir_layout {
/* watch-notify operations */
enum {
- WATCH_NOTIFY = 1, /* notifying watcher */
- WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */
+ CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
+ CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
};
@@ -207,6 +203,8 @@ struct ceph_mon_subscribe_ack {
struct ceph_fsid fsid;
} __attribute__ ((packed));
+#define CEPH_FS_CLUSTER_ID_NONE -1
+
/*
* mdsmap flags
*/
@@ -344,6 +342,18 @@ extern const char *ceph_mds_op_name(int op);
#define CEPH_XATTR_REPLACE (1 << 1)
#define CEPH_XATTR_REMOVE (1 << 31)
+/*
+ * readdir request flags.
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
+
+/*
+ * readdir reply flags.
+ */
+#define CEPH_READDIR_FRAG_END (1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
+#define CEPH_READDIR_HASH_ORDER (1<<9)
+
union ceph_mds_request_args {
struct {
__le32 mask; /* CEPH_CAP_* */
@@ -361,6 +371,7 @@ union ceph_mds_request_args {
__le32 frag; /* which dir fragment */
__le32 max_entries; /* how many dentries to grab */
__le32 max_bytes;
+ __le16 flags;
} __attribute__ ((packed)) readdir;
struct {
__le32 mode;
@@ -383,7 +394,7 @@ union ceph_mds_request_args {
__le32 flags;
} __attribute__ ((packed)) setxattr;
struct {
- struct ceph_file_layout layout;
+ struct ceph_file_layout_legacy layout;
} __attribute__ ((packed)) setlayout;
struct {
__u8 rule; /* currently fcntl or flock */
@@ -462,7 +473,7 @@ struct ceph_mds_reply_inode {
__le64 version; /* inode version */
__le64 xattr_version; /* version for xattr blob */
struct ceph_mds_reply_cap cap; /* caps issued for this inode */
- struct ceph_file_layout layout;
+ struct ceph_file_layout_legacy layout;
struct ceph_timespec ctime, mtime, atime;
__le32 time_warp_seq;
__le64 size, max_size, truncate_size;
@@ -515,7 +526,7 @@ struct ceph_filelock {
#define CEPH_FILE_MODE_WR 2
#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
-#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
+#define CEPH_FILE_MODE_BITS 4
int ceph_flags_to_mode(int flags);
@@ -657,7 +668,7 @@ struct ceph_mds_caps {
__le64 size, max_size, truncate_size;
__le32 truncate_seq;
struct ceph_timespec mtime, atime, ctime;
- struct ceph_file_layout layout;
+ struct ceph_file_layout_legacy layout;
__le32 time_warp_seq;
} __attribute__ ((packed));
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index a6ef9cc267ec..f990f2cc907a 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -3,6 +3,7 @@
#include <linux/err.h>
#include <linux/bug.h>
+#include <linux/slab.h>
#include <linux/time.h>
#include <asm/unaligned.h>
@@ -47,7 +48,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
/*
* bounds check input.
*/
-static inline int ceph_has_room(void **p, void *end, size_t n)
+static inline bool ceph_has_room(void **p, void *end, size_t n)
{
return end >= *p && n <= end - *p;
}
@@ -217,6 +218,60 @@ static inline void ceph_encode_string(void **p, void *end,
*p += len;
}
+/*
+ * version and length starting block encoders/decoders
+ */
+
+/* current code version (u8) + compat code version (u8) + len of struct (u32) */
+#define CEPH_ENCODING_START_BLK_LEN 6
+
+/**
+ * ceph_start_encoding - start encoding block
+ * @struct_v: current (code) version of the encoding
+ * @struct_compat: oldest code version that can decode it
+ * @struct_len: length of struct encoding
+ */
+static inline void ceph_start_encoding(void **p, u8 struct_v, u8 struct_compat,
+ u32 struct_len)
+{
+ ceph_encode_8(p, struct_v);
+ ceph_encode_8(p, struct_compat);
+ ceph_encode_32(p, struct_len);
+}
+
+/**
+ * ceph_start_decoding - start decoding block
+ * @v: current version of the encoding that the code supports
+ * @name: name of the struct (free-form)
+ * @struct_v: out param for the encoding version
+ * @struct_len: out param for the length of struct encoding
+ *
+ * Validates the length of struct encoding, so unsafe ceph_decode_*
+ * variants can be used for decoding.
+ */
+static inline int ceph_start_decoding(void **p, void *end, u8 v,
+ const char *name, u8 *struct_v,
+ u32 *struct_len)
+{
+ u8 struct_compat;
+
+ ceph_decode_need(p, end, CEPH_ENCODING_START_BLK_LEN, bad);
+ *struct_v = ceph_decode_8(p);
+ struct_compat = ceph_decode_8(p);
+ if (v < struct_compat) {
+ pr_warn("got struct_v %d struct_compat %d > %d of %s\n",
+ *struct_v, struct_compat, v, name);
+ return -EINVAL;
+ }
+
+ *struct_len = ceph_decode_32(p);
+ ceph_decode_need(p, end, *struct_len, bad);
+ return 0;
+
+bad:
+ return -ERANGE;
+}
+
#define ceph_encode_need(p, end, n, bad) \
do { \
if (!likely(ceph_has_room(p, end, n))) \
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index e7975e4681e1..83fc1fff7061 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -21,6 +21,7 @@
#include <linux/ceph/mon_client.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/ceph_fs.h>
+#include <linux/ceph/string_table.h>
/*
* mount options
@@ -176,10 +177,68 @@ extern void ceph_put_snap_context(struct ceph_snap_context *sc);
*/
static inline int calc_pages_for(u64 off, u64 len)
{
- return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
- (off >> PAGE_CACHE_SHIFT);
+ return ((off+len+PAGE_SIZE-1) >> PAGE_SHIFT) -
+ (off >> PAGE_SHIFT);
}
+/*
+ * These are not meant to be generic - an integer key is assumed.
+ */
+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
+static void insert_##name(struct rb_root *root, type *t) \
+{ \
+ struct rb_node **n = &root->rb_node; \
+ struct rb_node *parent = NULL; \
+ \
+ BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \
+ \
+ while (*n) { \
+ type *cur = rb_entry(*n, type, nodefld); \
+ \
+ parent = *n; \
+ if (t->keyfld < cur->keyfld) \
+ n = &(*n)->rb_left; \
+ else if (t->keyfld > cur->keyfld) \
+ n = &(*n)->rb_right; \
+ else \
+ BUG(); \
+ } \
+ \
+ rb_link_node(&t->nodefld, parent, n); \
+ rb_insert_color(&t->nodefld, root); \
+} \
+static void erase_##name(struct rb_root *root, type *t) \
+{ \
+ BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \
+ rb_erase(&t->nodefld, root); \
+ RB_CLEAR_NODE(&t->nodefld); \
+}
+
+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
+extern type __lookup_##name##_key; \
+static type *lookup_##name(struct rb_root *root, \
+ typeof(__lookup_##name##_key.keyfld) key) \
+{ \
+ struct rb_node *n = root->rb_node; \
+ \
+ while (n) { \
+ type *cur = rb_entry(n, type, nodefld); \
+ \
+ if (key < cur->keyfld) \
+ n = n->rb_left; \
+ else if (key > cur->keyfld) \
+ n = n->rb_right; \
+ else \
+ return cur; \
+ } \
+ \
+ return NULL; \
+}
+
+#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
+DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
+DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
+
extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep;
extern struct kmem_cache *ceph_cap_flush_cachep;
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index e230e7ed60d3..24d704d1ea5c 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -39,20 +39,31 @@ struct ceph_mon_request {
ceph_monc_request_func_t do_request;
};
+typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
+
/*
* ceph_mon_generic_request is being used for the statfs and
* mon_get_version requests which are being done a bit differently
* because we need to get data back to the caller
*/
struct ceph_mon_generic_request {
+ struct ceph_mon_client *monc;
struct kref kref;
u64 tid;
struct rb_node node;
int result;
- void *buf;
+
struct completion completion;
+ ceph_monc_callback_t complete_cb;
+ u64 private_data; /* r_tid/linger_id */
+
struct ceph_msg *request; /* original request */
struct ceph_msg *reply; /* and reply */
+
+ union {
+ struct ceph_statfs *st;
+ u64 newest;
+ } u;
};
struct ceph_mon_client {
@@ -77,7 +88,6 @@ struct ceph_mon_client {
/* pending generic requests */
struct rb_root generic_request_tree;
- int num_generic_requests;
u64 last_tid;
/* subs, indexed with CEPH_SUB_* */
@@ -85,7 +95,8 @@ struct ceph_mon_client {
struct ceph_mon_subscribe_item item;
bool want;
u32 have; /* epoch */
- } subs[3];
+ } subs[4];
+ int fs_cluster_id; /* "mdsmap.<id>" sub */
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_file;
@@ -100,9 +111,10 @@ extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
extern void ceph_monc_stop(struct ceph_mon_client *monc);
enum {
- CEPH_SUB_MDSMAP = 0,
- CEPH_SUB_MONMAP,
+ CEPH_SUB_MONMAP = 0,
CEPH_SUB_OSDMAP,
+ CEPH_SUB_FSMAP,
+ CEPH_SUB_MDSMAP,
};
extern const char *ceph_sub_str[];
@@ -116,16 +128,18 @@ extern const char *ceph_sub_str[];
bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
bool continuous);
void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
+void ceph_monc_renew_subs(struct ceph_mon_client *monc);
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
unsigned long timeout);
extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
struct ceph_statfs *buf);
-extern int ceph_monc_do_get_version(struct ceph_mon_client *monc,
- const char *what, u64 *newest);
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ u64 *newest);
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data);
extern int ceph_monc_open_session(struct ceph_mon_client *monc);
diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h
index 4b0d38960726..ddd0d48d0384 100644
--- a/include/linux/ceph/msgpool.h
+++ b/include/linux/ceph/msgpool.h
@@ -2,7 +2,6 @@
#define _FS_CEPH_MSGPOOL
#include <linux/mempool.h>
-#include <linux/ceph/messenger.h>
/*
* we use memory pools for preallocating messages we may receive, to
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 4343df806710..858932304260 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -9,6 +9,7 @@
#include <linux/ceph/types.h>
#include <linux/ceph/osdmap.h>
#include <linux/ceph/messenger.h>
+#include <linux/ceph/msgpool.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>
@@ -16,15 +17,15 @@ struct ceph_msg;
struct ceph_snap_context;
struct ceph_osd_request;
struct ceph_osd_client;
-struct ceph_authorizer;
/*
* completion callback for async writepages
*/
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
- struct ceph_msg *);
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
+#define CEPH_HOMELESS_OSD -1
+
/* a given osd we're communicating with */
struct ceph_osd {
atomic_t o_ref;
@@ -33,16 +34,15 @@ struct ceph_osd {
int o_incarnation;
struct rb_node o_node;
struct ceph_connection o_con;
- struct list_head o_requests;
- struct list_head o_linger_requests;
+ struct rb_root o_requests;
+ struct rb_root o_linger_requests;
struct list_head o_osd_lru;
struct ceph_auth_handshake o_auth;
unsigned long lru_ttl;
- int o_marked_for_keepalive;
struct list_head o_keepalive_item;
+ struct mutex lock;
};
-
#define CEPH_OSD_SLAB_OPS 2
#define CEPH_OSD_MAX_OPS 16
@@ -105,76 +105,95 @@ struct ceph_osd_req_op {
struct ceph_osd_data response_data;
__u8 class_len;
__u8 method_len;
- __u8 argc;
+ u32 indata_len;
} cls;
struct {
u64 cookie;
- u64 ver;
- u32 prot_ver;
- u32 timeout;
- __u8 flag;
+ __u8 op; /* CEPH_OSD_WATCH_OP_ */
+ u32 gen;
} watch;
struct {
+ struct ceph_osd_data request_data;
+ } notify_ack;
+ struct {
+ u64 cookie;
+ struct ceph_osd_data request_data;
+ struct ceph_osd_data response_data;
+ } notify;
+ struct {
u64 expected_object_size;
u64 expected_write_size;
} alloc_hint;
};
};
+struct ceph_osd_request_target {
+ struct ceph_object_id base_oid;
+ struct ceph_object_locator base_oloc;
+ struct ceph_object_id target_oid;
+ struct ceph_object_locator target_oloc;
+
+ struct ceph_pg pgid;
+ u32 pg_num;
+ u32 pg_num_mask;
+ struct ceph_osds acting;
+ struct ceph_osds up;
+ int size;
+ int min_size;
+ bool sort_bitwise;
+
+ unsigned int flags; /* CEPH_OSD_FLAG_* */
+ bool paused;
+
+ int osd;
+};
+
/* an in-flight request */
struct ceph_osd_request {
u64 r_tid; /* unique for this client */
struct rb_node r_node;
- struct list_head r_req_lru_item;
- struct list_head r_osd_item;
- struct list_head r_linger_item;
- struct list_head r_linger_osd_item;
+ struct rb_node r_mc_node; /* map check */
struct ceph_osd *r_osd;
- struct ceph_pg r_pgid;
- int r_pg_osds[CEPH_PG_MAX_SIZE];
- int r_num_pg_osds;
+
+ struct ceph_osd_request_target r_t;
+#define r_base_oid r_t.base_oid
+#define r_base_oloc r_t.base_oloc
+#define r_flags r_t.flags
struct ceph_msg *r_request, *r_reply;
- int r_flags; /* any additional flags for the osd */
u32 r_sent; /* >0 if r_request is sending/sent */
/* request osd ops array */
unsigned int r_num_ops;
- /* these are updated on each send */
- __le32 *r_request_osdmap_epoch;
- __le32 *r_request_flags;
- __le64 *r_request_pool;
- void *r_request_pgid;
- __le32 *r_request_attempts;
- bool r_paused;
- struct ceph_eversion *r_request_reassert_version;
-
int r_result;
- int r_got_reply;
- int r_linger;
+ bool r_got_reply;
struct ceph_osd_client *r_osdc;
struct kref r_kref;
bool r_mempool;
- struct completion r_completion, r_safe_completion;
+ struct completion r_completion;
+ struct completion r_safe_completion; /* fsync waiter */
ceph_osdc_callback_t r_callback;
ceph_osdc_unsafe_callback_t r_unsafe_callback;
- struct ceph_eversion r_reassert_version;
struct list_head r_unsafe_item;
struct inode *r_inode; /* for use by callbacks */
void *r_priv; /* ditto */
- struct ceph_object_locator r_base_oloc;
- struct ceph_object_id r_base_oid;
- struct ceph_object_locator r_target_oloc;
- struct ceph_object_id r_target_oid;
-
- u64 r_snapid;
- unsigned long r_stamp; /* send OR check time */
+ /* set by submitter */
+ u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */
+ struct ceph_snap_context *r_snapc; /* for writes */
+ struct timespec r_mtime; /* ditto */
+ u64 r_data_offset; /* ditto */
+ bool r_linger; /* don't resend on failure */
- struct ceph_snap_context *r_snapc; /* snap context for writes */
+ /* internal */
+ unsigned long r_stamp; /* jiffies, send or check time */
+ int r_attempts;
+ struct ceph_eversion r_replay_version; /* aka reassert_version */
+ u32 r_last_force_resend;
+ u32 r_map_dne_bound;
struct ceph_osd_req_op r_ops[];
};
@@ -183,44 +202,70 @@ struct ceph_request_redirect {
struct ceph_object_locator oloc;
};
-struct ceph_osd_event {
- u64 cookie;
- int one_shot;
+typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
+ u64 notifier_id, void *data, size_t data_len);
+typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
+
+struct ceph_osd_linger_request {
struct ceph_osd_client *osdc;
- void (*cb)(u64, u64, u8, void *);
- void *data;
- struct rb_node node;
- struct list_head osd_node;
+ u64 linger_id;
+ bool committed;
+ bool is_watch; /* watch or notify */
+
+ struct ceph_osd *osd;
+ struct ceph_osd_request *reg_req;
+ struct ceph_osd_request *ping_req;
+ unsigned long ping_sent;
+ unsigned long watch_valid_thru;
+ struct list_head pending_lworks;
+
+ struct ceph_osd_request_target t;
+ u32 last_force_resend;
+ u32 map_dne_bound;
+
+ struct timespec mtime;
+
struct kref kref;
-};
+ struct mutex lock;
+ struct rb_node node; /* osd */
+ struct rb_node osdc_node; /* osdc */
+ struct rb_node mc_node; /* map check */
+ struct list_head scan_item;
+
+ struct completion reg_commit_wait;
+ struct completion notify_finish_wait;
+ int reg_commit_error;
+ int notify_finish_error;
+ int last_error;
+
+ u32 register_gen;
+ u64 notify_id;
+
+ rados_watchcb2_t wcb;
+ rados_watcherrcb_t errcb;
+ void *data;
-struct ceph_osd_event_work {
- struct work_struct work;
- struct ceph_osd_event *event;
- u64 ver;
- u64 notify_id;
- u8 opcode;
+ struct page ***preply_pages;
+ size_t *preply_len;
};
struct ceph_osd_client {
struct ceph_client *client;
struct ceph_osdmap *osdmap; /* current map */
- struct rw_semaphore map_sem;
- struct completion map_waiters;
- u64 last_requested_map;
+ struct rw_semaphore lock;
- struct mutex request_mutex;
struct rb_root osds; /* osds */
struct list_head osd_lru; /* idle osds */
- u64 timeout_tid; /* tid of timeout triggering rq */
- u64 last_tid; /* tid of last request */
- struct rb_root requests; /* pending requests */
- struct list_head req_lru; /* in-flight lru */
- struct list_head req_unsent; /* unsent/need-resend queue */
- struct list_head req_notarget; /* map to no osd */
- struct list_head req_linger; /* lingering requests */
- int num_requests;
+ spinlock_t osd_lru_lock;
+ struct ceph_osd homeless_osd;
+ atomic64_t last_tid; /* tid of last request */
+ u64 last_linger_id;
+ struct rb_root linger_requests; /* lingering requests */
+ struct rb_root map_checks;
+ struct rb_root linger_map_checks;
+ atomic_t num_requests;
+ atomic_t num_homeless;
struct delayed_work timeout_work;
struct delayed_work osds_timeout_work;
#ifdef CONFIG_DEBUG_FS
@@ -232,13 +277,14 @@ struct ceph_osd_client {
struct ceph_msgpool msgpool_op;
struct ceph_msgpool msgpool_op_reply;
- spinlock_t event_lock;
- struct rb_root event_tree;
- u64 event_count;
-
struct workqueue_struct *notify_wq;
};
+static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag)
+{
+ return osdc->osdmap->flags & flag;
+}
+
extern int ceph_osdc_setup(void);
extern void ceph_osdc_cleanup(void);
@@ -272,9 +318,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
extern struct ceph_osd_data *osd_req_op_extent_osd_data(
struct ceph_osd_request *osd_req,
unsigned int which);
-extern struct ceph_osd_data *osd_req_op_cls_response_data(
- struct ceph_osd_request *osd_req,
- unsigned int which);
extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
unsigned int which,
@@ -310,9 +353,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *name, const void *value,
size_t size, u8 cmp_op, u8 cmp_mode);
-extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
- unsigned int which, u16 opcode,
- u64 cookie, u64 version, int flag);
extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which,
u64 expected_object_size,
@@ -323,11 +363,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
unsigned int num_ops,
bool use_mempool,
gfp_t gfp_flags);
-
-extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
- struct ceph_snap_context *snapc,
- u64 snap_id,
- struct timespec *mtime);
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
struct ceph_file_layout *layout,
@@ -339,9 +375,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
u32 truncate_seq, u64 truncate_size,
bool use_mempool);
-extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-
extern void ceph_osdc_get_request(struct ceph_osd_request *req);
extern void ceph_osdc_put_request(struct ceph_osd_request *req);
@@ -354,6 +387,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
struct ceph_vino vino,
@@ -372,11 +406,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct timespec *mtime,
struct page **pages, int nr_pages);
-/* watch/notify events */
-extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
- void (*event_cb)(u64, u64, u8, void *),
- void *data, struct ceph_osd_event **pevent);
-extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
-extern void ceph_osdc_put_event(struct ceph_osd_event *event);
+/* watch/notify */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ rados_watchcb2_t wcb,
+ rados_watcherrcb_t errcb,
+ void *data);
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq);
+
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ u64 notify_id,
+ u64 cookie,
+ void *payload,
+ size_t payload_len);
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ void *payload,
+ size_t payload_len,
+ u32 timeout,
+ struct page ***preply_pages,
+ size_t *preply_len);
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq);
#endif
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e55c08bc3a96..9a9041784dcf 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,21 +24,29 @@ struct ceph_pg {
uint32_t seed;
};
-#define CEPH_POOL_FLAG_HASHPSPOOL 1
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
+
+#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
+ together */
+#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */
struct ceph_pg_pool_info {
struct rb_node node;
s64 id;
- u8 type;
+ u8 type; /* CEPH_POOL_TYPE_* */
u8 size;
+ u8 min_size;
u8 crush_ruleset;
u8 object_hash;
+ u32 last_force_request_resend;
u32 pg_num, pgp_num;
int pg_num_mask, pgp_num_mask;
s64 read_tier;
s64 write_tier; /* wins for read+write ops */
- u64 flags;
+ u64 flags; /* CEPH_POOL_FLAG_* */
char *name;
+
+ bool was_full; /* for handle_one_map() */
};
static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
@@ -55,8 +63,24 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
struct ceph_object_locator {
s64 pool;
+ struct ceph_string *pool_ns;
};
+static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
+{
+ oloc->pool = -1;
+ oloc->pool_ns = NULL;
+}
+
+static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
+{
+ return oloc->pool == -1;
+}
+
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+ const struct ceph_object_locator *src);
+void ceph_oloc_destroy(struct ceph_object_locator *oloc);
+
/*
* Maximum supported by kernel client object name length
*
@@ -64,11 +88,52 @@ struct ceph_object_locator {
*/
#define CEPH_MAX_OID_NAME_LEN 100
+/*
+ * 51-char inline_name is long enough for all cephfs and all but one
+ * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
+ * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all
+ * other rbd requests fit into inline_name.
+ *
+ * Makes ceph_object_id 64 bytes on 64-bit.
+ */
+#define CEPH_OID_INLINE_LEN 52
+
+/*
+ * Both inline and external buffers have space for a NUL-terminator,
+ * which is carried around. It's not required though - RADOS object
+ * names don't have to be NUL-terminated and may contain NULs.
+ */
struct ceph_object_id {
- char name[CEPH_MAX_OID_NAME_LEN];
+ char *name;
+ char inline_name[CEPH_OID_INLINE_LEN];
int name_len;
};
+static inline void ceph_oid_init(struct ceph_object_id *oid)
+{
+ oid->name = oid->inline_name;
+ oid->name_len = 0;
+}
+
+#define CEPH_OID_INIT_ONSTACK(oid) \
+ ({ ceph_oid_init(&oid); oid; })
+#define CEPH_DEFINE_OID_ONSTACK(oid) \
+ struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid)
+
+static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
+{
+ return oid->name == oid->inline_name && !oid->name_len;
+}
+
+void ceph_oid_copy(struct ceph_object_id *dest,
+ const struct ceph_object_id *src);
+__printf(2, 3)
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
+__printf(3, 4)
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, ...);
+void ceph_oid_destroy(struct ceph_object_id *oid);
+
struct ceph_pg_mapping {
struct rb_node node;
struct ceph_pg pgid;
@@ -87,7 +152,6 @@ struct ceph_pg_mapping {
struct ceph_osdmap {
struct ceph_fsid fsid;
u32 epoch;
- u32 mkfs_epoch;
struct ceph_timespec created, modified;
u32 flags; /* CEPH_OSDMAP_* */
@@ -113,52 +177,23 @@ struct ceph_osdmap {
int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
};
-static inline void ceph_oid_set_name(struct ceph_object_id *oid,
- const char *name)
-{
- int len;
-
- len = strlen(name);
- if (len > sizeof(oid->name)) {
- WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
- name, len, sizeof(oid->name));
- len = sizeof(oid->name);
- }
-
- memcpy(oid->name, name, len);
- oid->name_len = len;
-}
-
-static inline void ceph_oid_copy(struct ceph_object_id *dest,
- struct ceph_object_id *src)
-{
- BUG_ON(src->name_len > sizeof(dest->name));
- memcpy(dest->name, src->name, src->name_len);
- dest->name_len = src->name_len;
-}
-
-static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
{
return osd >= 0 && osd < map->max_osd &&
(map->osd_state[osd] & CEPH_OSD_EXISTS);
}
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
{
return ceph_osd_exists(map, osd) &&
(map->osd_state[osd] & CEPH_OSD_UP);
}
-static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd)
+static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
{
return !ceph_osd_is_up(map, osd);
}
-static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
-{
- return map && (map->flags & flag);
-}
-
extern char *ceph_osdmap_state_str(char *str, int len, int state);
extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
return 0;
}
+struct ceph_osdmap *ceph_osdmap_alloc(void);
extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
- struct ceph_osdmap *map,
- struct ceph_messenger *msgr);
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+struct ceph_osds {
+ int osds[CEPH_PG_MAX_SIZE];
+ int size;
+ int primary; /* id, NOT index */
+};
+
+static inline void ceph_osds_init(struct ceph_osds *set)
+{
+	set->primary = -1;	/* primary holds an osd id, not an index into osds[] */
+	set->size = 0;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ const struct ceph_osds *old_up,
+ const struct ceph_osds *new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ u32 old_pg_num,
+ u32 new_pg_num,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ const struct ceph_pg *pgid);
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ bool any_change);
+
/* calculate mapping of a file extent to an object */
extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 len,
u64 *bno, u64 *oxoff, u64 *oxlen);
-/* calculate mapping of object to a placement group */
-extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
- struct ceph_object_locator *oloc,
- struct ceph_object_id *oid,
- struct ceph_pg *pg_out);
-
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
- struct ceph_pg pgid,
- int *osds, int *primary);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
- struct ceph_pg pgid);
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid);
+
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting);
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid);
extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
u64 id);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2f822dca1046..5c0da61cb763 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -114,8 +114,8 @@ struct ceph_object_layout {
* compound epoch+version, used by storage layer to serialize mutations
*/
struct ceph_eversion {
- __le32 epoch;
__le64 version;
+ __le32 epoch;
} __attribute__ ((packed));
/*
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
/*
* The error code to return when an OSD can't handle a write
@@ -389,6 +394,13 @@ enum {
CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */
+ CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if
+ pool uses pool snaps */
+ CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
+ CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
+ CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
+ CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
};
enum {
@@ -415,7 +427,17 @@ enum {
CEPH_OSD_CMPXATTR_MODE_U64 = 2
};
-#define RADOS_NOTIFY_VER 1
+enum {
+ CEPH_OSD_WATCH_OP_UNWATCH = 0,
+ CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+ /* note: use only ODD ids to prevent pre-giant code from
+ interpreting the op as UNWATCH */
+ CEPH_OSD_WATCH_OP_WATCH = 3,
+ CEPH_OSD_WATCH_OP_RECONNECT = 5,
+ CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+const char *ceph_osd_watch_op_name(int o);
/*
* an individual object operation. each may be accompanied by some data
@@ -450,10 +472,14 @@ struct ceph_osd_op {
} __attribute__ ((packed)) snap;
struct {
__le64 cookie;
- __le64 ver;
- __u8 flag; /* 0 = unwatch, 1 = watch */
+ __le64 ver; /* no longer used */
+ __u8 op; /* CEPH_OSD_WATCH_OP_* */
+ __le32 gen; /* registration generation */
} __attribute__ ((packed)) watch;
struct {
+ __le64 cookie;
+ } __attribute__ ((packed)) notify;
+ struct {
__le64 offset, length;
__le64 src_offset;
} __attribute__ ((packed)) clonerange;
diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
new file mode 100644
index 000000000000..1b02c96daf75
--- /dev/null
+++ b/include/linux/ceph/string_table.h
@@ -0,0 +1,62 @@
+#ifndef _FS_CEPH_STRING_TABLE_H
+#define _FS_CEPH_STRING_TABLE_H
+
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+
+struct ceph_string {
+ struct kref kref;
+ union {
+ struct rb_node node;
+ struct rcu_head rcu;
+ };
+ size_t len;
+ char str[];
+};
+
+extern void ceph_release_string(struct kref *ref);
+extern struct ceph_string *ceph_find_or_create_string(const char *str,
+ size_t len);
+extern bool ceph_strings_empty(void);
+
+static inline struct ceph_string *ceph_get_string(struct ceph_string *str)
+{
+ kref_get(&str->kref); /* take one reference; no NULL check here, unlike ceph_put_string() */
+ return str; /* returns the same pointer it was given */
+}
+
+static inline void ceph_put_string(struct ceph_string *str)
+{
+	/* Drop one reference; a NULL @str is accepted and ignored. */
+	if (str)
+		kref_put(&str->kref, ceph_release_string);
+}
+
+/* Order @cs against (@str, @len): by length first, then by bytes; a NULL @cs counts as "". */
+static inline int ceph_compare_string(struct ceph_string *cs,
+				      const char *str, size_t len)
+{
+	size_t cs_len = cs ? cs->len : 0;
+	/* Return only the sign: a size_t difference narrowed to int can flip sign or become 0. */
+	if (cs_len != len)
+		return cs_len < len ? -1 : 1;
+	return len ? strncmp(cs->str, str, len) : 0;	/* len == 0: @cs may be NULL, never dereferenced */
+}
+
+#define ceph_try_get_string(x) \
+({ /* yields the string with a reference taken, or NULL if x reads as NULL */ \
+ struct ceph_string *___str; \
+ rcu_read_lock(); \
+ for (;;) { /* x is re-evaluated on every pass, so it must be side-effect free */ \
+ ___str = rcu_dereference(x); \
+ if (!___str || /* nothing to reference */ \
+ kref_get_unless_zero(&___str->kref)) /* got a reference */ \
+ break; \
+ } /* otherwise the refcount was already zero: re-read x and retry */ \
+ rcu_read_unlock(); \
+ (___str); \
+})
+
+#endif