4738 lines
127 KiB
C
4738 lines
127 KiB
C
|
// SPDX-License-Identifier: GPL-2.0
|
||
|
#include <linux/ceph/ceph_debug.h>
|
||
|
|
||
|
#include <linux/fs.h>
|
||
|
#include <linux/kernel.h>
|
||
|
#include <linux/sched/signal.h>
|
||
|
#include <linux/slab.h>
|
||
|
#include <linux/vmalloc.h>
|
||
|
#include <linux/wait.h>
|
||
|
#include <linux/writeback.h>
|
||
|
#include <linux/iversion.h>
|
||
|
#include <linux/filelock.h>
|
||
|
|
||
|
#include "super.h"
|
||
|
#include "mds_client.h"
|
||
|
#include "cache.h"
|
||
|
#include <linux/ceph/decode.h>
|
||
|
#include <linux/ceph/messenger.h>
|
||
|
|
||
|
/*
|
||
|
* Capability management
|
||
|
*
|
||
|
* The Ceph metadata servers control client access to inode metadata
|
||
|
* and file data by issuing capabilities, granting clients permission
|
||
|
* to read and/or write both inode field and file data to OSDs
|
||
|
* (storage nodes). Each capability consists of a set of bits
|
||
|
* indicating which operations are allowed.
|
||
|
*
|
||
|
* If the client holds a *_SHARED cap, the client has a coherent value
|
||
|
* that can be safely read from the cached inode.
|
||
|
*
|
||
|
* In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
|
||
|
* client is allowed to change inode attributes (e.g., file size,
|
||
|
* mtime), note its dirty state in the ceph_cap, and asynchronously
|
||
|
* flush that metadata change to the MDS.
|
||
|
*
|
||
|
* In the event of a conflicting operation (perhaps by another
|
||
|
* client), the MDS will revoke the conflicting client capabilities.
|
||
|
*
|
||
|
* In order for a client to cache an inode, it must hold a capability
|
||
|
* with at least one MDS server. When inodes are released, release
|
||
|
* notifications are batched and periodically sent en masse to the MDS
|
||
|
* cluster to release server state.
|
||
|
*/
|
||
|
|
||
|
static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
|
||
|
static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
|
||
|
struct ceph_mds_session *session,
|
||
|
struct ceph_inode_info *ci,
|
||
|
u64 oldest_flush_tid);
|
||
|
|
||
|
/*
|
||
|
* Generate readable cap strings for debugging output.
|
||
|
*/
|
||
|
#define MAX_CAP_STR 20
|
||
|
static char cap_str[MAX_CAP_STR][40];
|
||
|
static DEFINE_SPINLOCK(cap_str_lock);
|
||
|
static int last_cap_str;
|
||
|
|
||
|
static char *gcap_string(char *s, int c)
|
||
|
{
|
||
|
if (c & CEPH_CAP_GSHARED)
|
||
|
*s++ = 's';
|
||
|
if (c & CEPH_CAP_GEXCL)
|
||
|
*s++ = 'x';
|
||
|
if (c & CEPH_CAP_GCACHE)
|
||
|
*s++ = 'c';
|
||
|
if (c & CEPH_CAP_GRD)
|
||
|
*s++ = 'r';
|
||
|
if (c & CEPH_CAP_GWR)
|
||
|
*s++ = 'w';
|
||
|
if (c & CEPH_CAP_GBUFFER)
|
||
|
*s++ = 'b';
|
||
|
if (c & CEPH_CAP_GWREXTEND)
|
||
|
*s++ = 'a';
|
||
|
if (c & CEPH_CAP_GLAZYIO)
|
||
|
*s++ = 'l';
|
||
|
return s;
|
||
|
}
|
||
|
|
||
|
/*
 * Render a cap bitmask as a short human-readable string, e.g. "pAsFrw".
 *
 * Returns a pointer into a small static ring of buffers; a slot is
 * recycled after MAX_CAP_STR further calls, so the result is only valid
 * for transient use (debug/log output).
 */
const char *ceph_cap_string(int caps)
{
	int i;
	char *s;
	int c;

	/* claim the next slot in the static ring under the lock */
	spin_lock(&cap_str_lock);
	i = last_cap_str++;
	if (last_cap_str == MAX_CAP_STR)
		last_cap_str = 0;
	spin_unlock(&cap_str_lock);

	s = cap_str[i];

	if (caps & CEPH_CAP_PIN)
		*s++ = 'p';

	/* each cap class: uppercase tag followed by its generic bits */
	c = (caps >> CEPH_CAP_SAUTH) & 3;
	if (c) {
		*s++ = 'A';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SLINK) & 3;
	if (c) {
		*s++ = 'L';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SXATTR) & 3;
	if (c) {
		*s++ = 'X';
		s = gcap_string(s, c);
	}

	/* FILE caps use the full remaining bit range, not just 2 bits */
	c = caps >> CEPH_CAP_SFILE;
	if (c) {
		*s++ = 'F';
		s = gcap_string(s, c);
	}

	if (s == cap_str[i])
		*s++ = '-';	/* no caps at all */
	*s = 0;
	return cap_str[i];
}
|
||
|
|
||
|
void ceph_caps_init(struct ceph_mds_client *mdsc)
|
||
|
{
|
||
|
INIT_LIST_HEAD(&mdsc->caps_list);
|
||
|
spin_lock_init(&mdsc->caps_list_lock);
|
||
|
}
|
||
|
|
||
|
/*
 * Free every cap struct remaining in the preallocated pool and reset
 * all pool counters.  Called when tearing down the mds client.
 */
void ceph_caps_finalize(struct ceph_mds_client *mdsc)
{
	struct ceph_cap *cap;

	spin_lock(&mdsc->caps_list_lock);
	while (!list_empty(&mdsc->caps_list)) {
		cap = list_first_entry(&mdsc->caps_list,
				       struct ceph_cap, caps_item);
		list_del(&cap->caps_item);
		kmem_cache_free(ceph_cap_cachep, cap);
	}
	mdsc->caps_total_count = 0;
	mdsc->caps_avail_count = 0;
	mdsc->caps_use_count = 0;
	mdsc->caps_reserve_count = 0;
	mdsc->caps_min_count = 0;
	spin_unlock(&mdsc->caps_list_lock);
}
|
||
|
|
||
|
void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
|
||
|
struct ceph_mount_options *fsopt)
|
||
|
{
|
||
|
spin_lock(&mdsc->caps_list_lock);
|
||
|
mdsc->caps_min_count = fsopt->max_readdir;
|
||
|
if (mdsc->caps_min_count < 1024)
|
||
|
mdsc->caps_min_count = 1024;
|
||
|
mdsc->caps_use_max = fsopt->caps_max;
|
||
|
if (mdsc->caps_use_max > 0 &&
|
||
|
mdsc->caps_use_max < mdsc->caps_min_count)
|
||
|
mdsc->caps_use_max = mdsc->caps_min_count;
|
||
|
spin_unlock(&mdsc->caps_list_lock);
|
||
|
}
|
||
|
|
||
|
/*
 * Release @nr_caps previously reserved caps back to the pool.  If the
 * pool already holds enough spares to cover the remaining reservations
 * plus the minimum watermark, the surplus structs are freed outright;
 * otherwise they are kept cached as available caps.
 *
 * Caller must hold mdsc->caps_list_lock.
 */
static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
{
	struct ceph_cap *cap;
	int i;

	if (nr_caps) {
		BUG_ON(mdsc->caps_reserve_count < nr_caps);
		mdsc->caps_reserve_count -= nr_caps;
		if (mdsc->caps_avail_count >=
		    mdsc->caps_reserve_count + mdsc->caps_min_count) {
			/* plenty of spares already: free these outright */
			mdsc->caps_total_count -= nr_caps;
			for (i = 0; i < nr_caps; i++) {
				cap = list_first_entry(&mdsc->caps_list,
					struct ceph_cap, caps_item);
				list_del(&cap->caps_item);
				kmem_cache_free(ceph_cap_cachep, cap);
			}
		} else {
			/* keep them cached to avoid alloc churn */
			mdsc->caps_avail_count += nr_caps;
		}

		dout("%s: caps %d = %d used + %d resv + %d avail\n",
		     __func__,
		     mdsc->caps_total_count, mdsc->caps_use_count,
		     mdsc->caps_reserve_count, mdsc->caps_avail_count);
		/* pool accounting invariant: total == used + resv + avail */
		BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
						 mdsc->caps_reserve_count +
						 mdsc->caps_avail_count);
	}
}
|
||
|
|
||
|
/*
|
||
|
* Called under mdsc->mutex.
|
||
|
*/
|
||
|
/*
 * Reserve @need cap structs in @ctx so that later ceph_get_cap() calls
 * against this reservation cannot fail.  Takes whatever the pool
 * already has available, kmem-allocates the rest, and on allocation
 * failure makes one pass trimming caps from every session to replenish
 * the pool before giving up with -ENOMEM.
 *
 * Returns 0 on success or -ENOMEM; on failure any partially reserved
 * caps are returned to the pool.
 *
 * Called under mdsc->mutex (temporarily dropped while trimming).
 */
int ceph_reserve_caps(struct ceph_mds_client *mdsc,
		      struct ceph_cap_reservation *ctx, int need)
{
	int i, j;
	struct ceph_cap *cap;
	int have;		/* caps taken from the existing pool */
	int alloc = 0;		/* caps freshly kmem-allocated */
	int max_caps;
	int err = 0;
	bool trimmed = false;	/* only attempt session trimming once */
	struct ceph_mds_session *s;
	LIST_HEAD(newcaps);

	dout("reserve caps ctx=%p need=%d\n", ctx, need);

	/* first reserve any caps that are already allocated */
	spin_lock(&mdsc->caps_list_lock);
	if (mdsc->caps_avail_count >= need)
		have = need;
	else
		have = mdsc->caps_avail_count;
	mdsc->caps_avail_count -= have;
	mdsc->caps_reserve_count += have;
	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
					 mdsc->caps_reserve_count +
					 mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);

	for (i = have; i < need; ) {
		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
		if (cap) {
			list_add(&cap->caps_item, &newcaps);
			alloc++;
			i++;
			continue;
		}

		if (!trimmed) {
			/*
			 * Allocation failed: ask each session to trim
			 * down to cover the shortfall, then re-check
			 * the pool.  mdsc->mutex must be dropped while
			 * taking a session mutex (lock ordering).
			 */
			for (j = 0; j < mdsc->max_sessions; j++) {
				s = __ceph_lookup_mds_session(mdsc, j);
				if (!s)
					continue;
				mutex_unlock(&mdsc->mutex);

				mutex_lock(&s->s_mutex);
				max_caps = s->s_nr_caps - (need - i);
				ceph_trim_caps(mdsc, s, max_caps);
				mutex_unlock(&s->s_mutex);

				ceph_put_mds_session(s);
				mutex_lock(&mdsc->mutex);
			}
			trimmed = true;

			/* pick up anything trimming released to the pool */
			spin_lock(&mdsc->caps_list_lock);
			if (mdsc->caps_avail_count) {
				int more_have;
				if (mdsc->caps_avail_count >= need - i)
					more_have = need - i;
				else
					more_have = mdsc->caps_avail_count;

				i += more_have;
				have += more_have;
				mdsc->caps_avail_count -= more_have;
				mdsc->caps_reserve_count += more_have;

			}
			spin_unlock(&mdsc->caps_list_lock);

			continue;
		}

		pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
			ctx, need, have + alloc);
		err = -ENOMEM;
		break;
	}

	if (!err) {
		BUG_ON(have + alloc != need);
		ctx->count = need;
		ctx->used = 0;
	}

	spin_lock(&mdsc->caps_list_lock);
	mdsc->caps_total_count += alloc;
	mdsc->caps_reserve_count += alloc;
	list_splice(&newcaps, &mdsc->caps_list);

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
					 mdsc->caps_reserve_count +
					 mdsc->caps_avail_count);

	/* on failure, hand the partial reservation straight back */
	if (err)
		__ceph_unreserve_caps(mdsc, have + alloc);

	spin_unlock(&mdsc->caps_list_lock);

	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
	     ctx, mdsc->caps_total_count, mdsc->caps_use_count,
	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
	return err;
}
|
||
|
|
||
|
/*
 * Drop whatever is left of a reservation, returning the unused caps to
 * the pool.  If the pool is now over its configured maximum, kick off
 * cap reclaim sized by how many caps this reservation actually used.
 */
void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
			 struct ceph_cap_reservation *ctx)
{
	bool reclaim = false;
	if (!ctx->count)
		return;

	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
	spin_lock(&mdsc->caps_list_lock);
	__ceph_unreserve_caps(mdsc, ctx->count);
	ctx->count = 0;

	/* over the cap-use ceiling? reclaim outside the spinlock */
	if (mdsc->caps_use_max > 0 &&
	    mdsc->caps_use_count > mdsc->caps_use_max)
		reclaim = true;
	spin_unlock(&mdsc->caps_list_lock);

	if (reclaim)
		ceph_reclaim_caps_nr(mdsc, ctx->used);
}
|
||
|
|
||
|
/*
 * Take one cap struct for use.  With a reservation context (@ctx) this
 * cannot fail: the cap comes out of the reserved portion of the pool.
 * Without a context (cap import/export path) it falls back to a fresh
 * allocation, then to the pool's available caps, and may return NULL.
 */
struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
			      struct ceph_cap_reservation *ctx)
{
	struct ceph_cap *cap = NULL;

	/* temporary, until we do something about cap import/export */
	if (!ctx) {
		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
		if (cap) {
			spin_lock(&mdsc->caps_list_lock);
			mdsc->caps_use_count++;
			mdsc->caps_total_count++;
			spin_unlock(&mdsc->caps_list_lock);
		} else {
			/* allocation failed: try the available pool */
			spin_lock(&mdsc->caps_list_lock);
			if (mdsc->caps_avail_count) {
				BUG_ON(list_empty(&mdsc->caps_list));

				mdsc->caps_avail_count--;
				mdsc->caps_use_count++;
				cap = list_first_entry(&mdsc->caps_list,
						struct ceph_cap, caps_item);
				list_del(&cap->caps_item);

				BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
				       mdsc->caps_reserve_count + mdsc->caps_avail_count);
			}
			spin_unlock(&mdsc->caps_list_lock);
		}

		return cap;
	}

	spin_lock(&mdsc->caps_list_lock);
	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
	     ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
	BUG_ON(!ctx->count);
	BUG_ON(ctx->count > mdsc->caps_reserve_count);
	BUG_ON(list_empty(&mdsc->caps_list));

	/* move one cap from reserved to in-use accounting */
	ctx->count--;
	ctx->used++;
	mdsc->caps_reserve_count--;
	mdsc->caps_use_count++;

	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
	list_del(&cap->caps_item);

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);
	return cap;
}
|
||
|
|
||
|
/*
 * Return an in-use cap struct.  It is either freed or parked back on
 * the available list, depending on whether the pool already holds
 * enough spares for the outstanding reservations plus the minimum.
 */
void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
{
	spin_lock(&mdsc->caps_list_lock);
	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
	     cap, mdsc->caps_total_count, mdsc->caps_use_count,
	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
	mdsc->caps_use_count--;
	/*
	 * Keep some preallocated caps around (ceph_min_count), to
	 * avoid lots of free/alloc churn.
	 */
	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
				      mdsc->caps_min_count) {
		mdsc->caps_total_count--;
		kmem_cache_free(ceph_cap_cachep, cap);
	} else {
		mdsc->caps_avail_count++;
		list_add(&cap->caps_item, &mdsc->caps_list);
	}

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);
}
|
||
|
|
||
|
void ceph_reservation_status(struct ceph_fs_client *fsc,
|
||
|
int *total, int *avail, int *used, int *reserved,
|
||
|
int *min)
|
||
|
{
|
||
|
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||
|
|
||
|
spin_lock(&mdsc->caps_list_lock);
|
||
|
|
||
|
if (total)
|
||
|
*total = mdsc->caps_total_count;
|
||
|
if (avail)
|
||
|
*avail = mdsc->caps_avail_count;
|
||
|
if (used)
|
||
|
*used = mdsc->caps_use_count;
|
||
|
if (reserved)
|
||
|
*reserved = mdsc->caps_reserve_count;
|
||
|
if (min)
|
||
|
*min = mdsc->caps_min_count;
|
||
|
|
||
|
spin_unlock(&mdsc->caps_list_lock);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Find ceph_cap for given mds, if any.
|
||
|
*
|
||
|
* Called with i_ceph_lock held.
|
||
|
*/
|
||
|
struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
|
||
|
{
|
||
|
struct ceph_cap *cap;
|
||
|
struct rb_node *n = ci->i_caps.rb_node;
|
||
|
|
||
|
while (n) {
|
||
|
cap = rb_entry(n, struct ceph_cap, ci_node);
|
||
|
if (mds < cap->mds)
|
||
|
n = n->rb_left;
|
||
|
else if (mds > cap->mds)
|
||
|
n = n->rb_right;
|
||
|
else
|
||
|
return cap;
|
||
|
}
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
|
||
|
{
|
||
|
struct ceph_cap *cap;
|
||
|
|
||
|
spin_lock(&ci->i_ceph_lock);
|
||
|
cap = __get_cap_for_mds(ci, mds);
|
||
|
spin_unlock(&ci->i_ceph_lock);
|
||
|
return cap;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Called under i_ceph_lock.
|
||
|
*/
|
||
|
static void __insert_cap_node(struct ceph_inode_info *ci,
|
||
|
struct ceph_cap *new)
|
||
|
{
|
||
|
struct rb_node **p = &ci->i_caps.rb_node;
|
||
|
struct rb_node *parent = NULL;
|
||
|
struct ceph_cap *cap = NULL;
|
||
|
|
||
|
while (*p) {
|
||
|
parent = *p;
|
||
|
cap = rb_entry(parent, struct ceph_cap, ci_node);
|
||
|
if (new->mds < cap->mds)
|
||
|
p = &(*p)->rb_left;
|
||
|
else if (new->mds > cap->mds)
|
||
|
p = &(*p)->rb_right;
|
||
|
else
|
||
|
BUG();
|
||
|
}
|
||
|
|
||
|
rb_link_node(&new->ci_node, parent, p);
|
||
|
rb_insert_color(&new->ci_node, &ci->i_caps);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* (re)set cap hold timeouts, which control the delayed release
|
||
|
* of unused caps back to the MDS. Should be called on cap use.
|
||
|
*/
|
||
|
/*
 * Refresh the inode's delayed-release deadline: the cap is held for up
 * to caps_wanted_delay_max seconds from now (rounded to a jiffy
 * boundary) before being considered for release back to the MDS.
 */
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
			       struct ceph_inode_info *ci)
{
	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
	ci->i_hold_caps_max = round_jiffies(jiffies +
					    opt->caps_wanted_delay_max * HZ);
	dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
	     ci->i_hold_caps_max - jiffies);
}
|
||
|
|
||
|
/*
|
||
|
* (Re)queue cap at the end of the delayed cap release list.
|
||
|
*
|
||
|
* If I_FLUSH is set, leave the inode at the front of the list.
|
||
|
*
|
||
|
* Caller holds i_ceph_lock
|
||
|
* -> we take mdsc->cap_delay_lock
|
||
|
*/
|
||
|
/*
 * (Re)queue the inode at the tail of the delayed cap release list with
 * a fresh timeout.  If the inode is already queued with CEPH_I_FLUSH
 * set it stays where it is (at/near the front), so the pending flush
 * is not pushed back.  No-op while the mds client is stopping.
 */
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
				struct ceph_inode_info *ci)
{
	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
	     ci->i_ceph_flags, ci->i_hold_caps_max);
	if (!mdsc->stopping) {
		spin_lock(&mdsc->cap_delay_lock);
		if (!list_empty(&ci->i_cap_delay_list)) {
			/* flush pending: keep current (front) position */
			if (ci->i_ceph_flags & CEPH_I_FLUSH)
				goto no_change;
			list_del_init(&ci->i_cap_delay_list);
		}
		__cap_set_timeouts(mdsc, ci);
		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
		spin_unlock(&mdsc->cap_delay_lock);
	}
}
|
||
|
|
||
|
/*
|
||
|
* Queue an inode for immediate writeback. Mark inode with I_FLUSH,
|
||
|
* indicating we should send a cap message to flush dirty metadata
|
||
|
* asap, and move to the front of the delayed cap list.
|
||
|
*/
|
||
|
/*
 * Mark the inode CEPH_I_FLUSH and move it to the front of the delayed
 * cap list so its dirty metadata is flushed as soon as possible.
 */
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				      struct ceph_inode_info *ci)
{
	dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
	spin_lock(&mdsc->cap_delay_lock);
	ci->i_ceph_flags |= CEPH_I_FLUSH;
	if (!list_empty(&ci->i_cap_delay_list))
		list_del_init(&ci->i_cap_delay_list);
	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
	spin_unlock(&mdsc->cap_delay_lock);
}
|
||
|
|
||
|
/*
|
||
|
* Cancel delayed work on cap.
|
||
|
*
|
||
|
* Caller must hold i_ceph_lock.
|
||
|
*/
|
||
|
/*
 * Remove the inode from the delayed cap release list, if queued.
 *
 * Caller must hold i_ceph_lock.  The initial list_empty() check is
 * done before taking cap_delay_lock as a cheap fast path.
 */
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
			       struct ceph_inode_info *ci)
{
	dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
	if (list_empty(&ci->i_cap_delay_list))
		return;
	spin_lock(&mdsc->cap_delay_lock);
	list_del_init(&ci->i_cap_delay_list);
	spin_unlock(&mdsc->cap_delay_lock);
}
|
||
|
|
||
|
/* Common issue checks for add_cap, handle_cap_grant. */
|
||
|
/*
 * React to a change in the set of issued caps: bump cache/readdir
 * generation counters and drop state that the new grant invalidates.
 * Shared by ceph_add_cap() and handle_cap_grant().
 */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
			      unsigned issued)
{
	unsigned had = __ceph_caps_issued(ci, NULL);

	lockdep_assert_held(&ci->i_ceph_lock);

	/*
	 * Each time we receive FILE_CACHE anew, we increment
	 * i_rdcache_gen.
	 */
	if (S_ISREG(ci->netfs.inode.i_mode) &&
	    (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
		ci->i_rdcache_gen++;
	}

	/*
	 * If FILE_SHARED is newly issued, mark dir not complete. We don't
	 * know what happened to this directory while we didn't have the cap.
	 * If FILE_SHARED is being revoked, also mark dir not complete. It
	 * stops on-going cached readdir.
	 */
	if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
		if (issued & CEPH_CAP_FILE_SHARED)
			atomic_inc(&ci->i_shared_gen);
		if (S_ISDIR(ci->netfs.inode.i_mode)) {
			dout(" marking %p NOT complete\n", &ci->netfs.inode);
			__ceph_dir_clear_complete(ci);
		}
	}

	/* Wipe saved layout if we're losing DIR_CREATE caps */
	if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
		!(issued & CEPH_CAP_DIR_CREATE)) {
		ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
		memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
	}
}
|
||
|
|
||
|
/**
|
||
|
* change_auth_cap_ses - move inode to appropriate lists when auth caps change
|
||
|
* @ci: inode to be moved
|
||
|
* @session: new auth caps session
|
||
|
*/
|
||
|
void change_auth_cap_ses(struct ceph_inode_info *ci,
			 struct ceph_mds_session *session)
{
	lockdep_assert_held(&ci->i_ceph_lock);

	/* nothing to migrate if the inode is on neither per-session list */
	if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item))
		return;

	/* move dirty/flushing membership to the new auth session */
	spin_lock(&session->s_mdsc->cap_dirty_lock);
	if (!list_empty(&ci->i_dirty_item))
		list_move(&ci->i_dirty_item, &session->s_cap_dirty);
	if (!list_empty(&ci->i_flushing_item))
		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
	spin_unlock(&session->s_mdsc->cap_dirty_lock);
}
|
||
|
|
||
|
/*
|
||
|
* Add a capability under the given MDS session.
|
||
|
*
|
||
|
* Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
|
||
|
*
|
||
|
* @fmode is the open file mode, if we are opening a file, otherwise
|
||
|
* it is < 0. (This is so we can atomically add the cap and add an
|
||
|
* open file reference to it.)
|
||
|
*/
|
||
|
void ceph_add_cap(struct inode *inode,
		  struct ceph_mds_session *session, u64 cap_id,
		  unsigned issued, unsigned wanted,
		  unsigned seq, unsigned mseq, u64 realmino, int flags,
		  struct ceph_cap **new_cap)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap;
	int mds = session->s_mds;
	int actual_wanted;
	u32 gen;

	lockdep_assert_held(&ci->i_ceph_lock);

	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
	     session->s_mds, cap_id, ceph_cap_string(issued), seq);

	gen = atomic_read(&session->s_cap_gen);

	cap = __get_cap_for_mds(ci, mds);
	if (!cap) {
		/* no cap from this mds yet: consume the caller-provided one */
		cap = *new_cap;
		*new_cap = NULL;

		cap->issued = 0;
		cap->implemented = 0;
		cap->mds = mds;
		cap->mds_wanted = 0;
		cap->mseq = 0;

		cap->ci = ci;
		__insert_cap_node(ci, cap);

		/* add to session cap list */
		cap->session = session;
		spin_lock(&session->s_cap_lock);
		list_add_tail(&cap->session_caps, &session->s_caps);
		session->s_nr_caps++;
		atomic64_inc(&mdsc->metric.total_caps);
		spin_unlock(&session->s_cap_lock);
	} else {
		spin_lock(&session->s_cap_lock);
		list_move_tail(&cap->session_caps, &session->s_caps);
		spin_unlock(&session->s_cap_lock);

		/* a stale cap counts only as a pin until re-granted */
		if (cap->cap_gen < gen)
			cap->issued = cap->implemented = CEPH_CAP_PIN;

		/*
		 * auth mds of the inode changed. we received the cap export
		 * message, but still haven't received the cap import message.
		 * handle_cap_export() updated the new auth MDS' cap.
		 *
		 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
		 * a message that was send before the cap import message. So
		 * don't remove caps.
		 */
		if (ceph_seq_cmp(seq, cap->seq) <= 0) {
			WARN_ON(cap != ci->i_auth_cap);
			WARN_ON(cap->cap_id != cap_id);
			seq = cap->seq;
			mseq = cap->mseq;
			issued |= cap->issued;
			flags |= CEPH_CAP_FLAG_AUTH;
		}
	}

	if (!ci->i_snap_realm ||
	    ((flags & CEPH_CAP_FLAG_AUTH) &&
	     realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
		/*
		 * add this inode to the appropriate snap realm
		 */
		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
							       realmino);
		if (realm)
			ceph_change_snap_realm(inode, realm);
		else
			WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
			     __func__, realmino, ci->i_vino.ino,
			     ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
	}

	__check_cap_issue(ci, cap, issued);

	/*
	 * If we are issued caps we don't want, or the mds' wanted
	 * value appears to be off, queue a check so we'll release
	 * later and/or update the mds wanted value.
	 */
	actual_wanted = __ceph_caps_wanted(ci);
	if ((wanted & ~actual_wanted) ||
	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
		     ceph_cap_string(issued), ceph_cap_string(wanted),
		     ceph_cap_string(actual_wanted));
		__cap_delay_requeue(mdsc, ci);
	}

	if (flags & CEPH_CAP_FLAG_AUTH) {
		/* only switch auth cap forward in mseq order */
		if (!ci->i_auth_cap ||
		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
			if (ci->i_auth_cap &&
			    ci->i_auth_cap->session != cap->session)
				change_auth_cap_ses(ci, cap->session);
			ci->i_auth_cap = cap;
			cap->mds_wanted = wanted;
		}
	} else {
		WARN_ON(ci->i_auth_cap == cap);
	}

	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
	     ceph_cap_string(issued|cap->issued), seq, mds);
	cap->cap_id = cap_id;
	cap->issued = issued;
	cap->implemented |= issued;
	/* newer mseq replaces mds_wanted; otherwise accumulate */
	if (ceph_seq_cmp(mseq, cap->mseq) > 0)
		cap->mds_wanted = wanted;
	else
		cap->mds_wanted |= wanted;
	cap->seq = seq;
	cap->issue_seq = seq;
	cap->mseq = mseq;
	cap->cap_gen = gen;
	wake_up_all(&ci->i_cap_wq);
}
|
||
|
|
||
|
/*
|
||
|
* Return true if cap has not timed out and belongs to the current
|
||
|
* generation of the MDS session (i.e. has not gone 'stale' due to
|
||
|
* us losing touch with the mds).
|
||
|
*/
|
||
|
static int __cap_is_valid(struct ceph_cap *cap)
|
||
|
{
|
||
|
unsigned long ttl;
|
||
|
u32 gen;
|
||
|
|
||
|
gen = atomic_read(&cap->session->s_cap_gen);
|
||
|
ttl = cap->session->s_cap_ttl;
|
||
|
|
||
|
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
|
||
|
dout("__cap_is_valid %p cap %p issued %s "
|
||
|
"but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
|
||
|
cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Return set of valid cap bits issued to us. Note that caps time
|
||
|
* out, and may be invalidated in bulk if the client session times out
|
||
|
* and session->s_cap_gen is bumped.
|
||
|
*/
|
||
|
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
|
||
|
{
|
||
|
int have = ci->i_snap_caps;
|
||
|
struct ceph_cap *cap;
|
||
|
struct rb_node *p;
|
||
|
|
||
|
if (implemented)
|
||
|
*implemented = 0;
|
||
|
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
|
||
|
cap = rb_entry(p, struct ceph_cap, ci_node);
|
||
|
if (!__cap_is_valid(cap))
|
||
|
continue;
|
||
|
dout("__ceph_caps_issued %p cap %p issued %s\n",
|
||
|
&ci->netfs.inode, cap, ceph_cap_string(cap->issued));
|
||
|
have |= cap->issued;
|
||
|
if (implemented)
|
||
|
*implemented |= cap->implemented;
|
||
|
}
|
||
|
/*
|
||
|
* exclude caps issued by non-auth MDS, but are been revoking
|
||
|
* by the auth MDS. The non-auth MDS should be revoking/exporting
|
||
|
* these caps, but the message is delayed.
|
||
|
*/
|
||
|
if (ci->i_auth_cap) {
|
||
|
cap = ci->i_auth_cap;
|
||
|
have &= ~cap->implemented | cap->issued;
|
||
|
}
|
||
|
return have;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Get cap bits issued by caps other than @ocap
|
||
|
*/
|
||
|
int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
|
||
|
{
|
||
|
int have = ci->i_snap_caps;
|
||
|
struct ceph_cap *cap;
|
||
|
struct rb_node *p;
|
||
|
|
||
|
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
|
||
|
cap = rb_entry(p, struct ceph_cap, ci_node);
|
||
|
if (cap == ocap)
|
||
|
continue;
|
||
|
if (!__cap_is_valid(cap))
|
||
|
continue;
|
||
|
have |= cap->issued;
|
||
|
}
|
||
|
return have;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Move a cap to the end of the LRU (oldest caps at list head, newest
|
||
|
* at list tail).
|
||
|
*/
|
||
|
/*
 * Mark a cap recently used: move it to the tail of its session's cap
 * LRU (oldest caps live at the head).  Skipped while someone is
 * iterating the session cap list (s_cap_iterator set), since moving
 * entries would disturb the iteration.
 */
static void __touch_cap(struct ceph_cap *cap)
{
	struct ceph_mds_session *s = cap->session;

	spin_lock(&s->s_cap_lock);
	if (!s->s_cap_iterator) {
		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
		     s->s_mds);
		list_move_tail(&cap->session_caps, &s->s_caps);
	} else {
		dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
		     &cap->ci->netfs.inode, cap, s->s_mds);
	}
	spin_unlock(&s->s_cap_lock);
}
|
||
|
|
||
|
/*
|
||
|
* Check if we hold the given mask. If so, move the cap(s) to the
|
||
|
* front of their respective LRUs. (This is the preferred way for
|
||
|
* callers to check for caps they want.)
|
||
|
*/
|
||
|
/*
 * Return 1 if the full @mask is covered by our issued caps (snap caps,
 * a single cap, or a combination of caps), else 0.  When @touch is set
 * and the mask is satisfied, the contributing cap(s) are moved to the
 * tail of their session LRUs.
 */
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
	struct ceph_cap *cap;
	struct rb_node *p;
	int have = ci->i_snap_caps;

	/* snap caps alone may already satisfy the mask */
	if ((have & mask) == mask) {
		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
		     " (mask %s)\n", ceph_ino(&ci->netfs.inode),
		     ceph_cap_string(have),
		     ceph_cap_string(mask));
		return 1;
	}

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		/* a single cap covering the whole mask? */
		if ((cap->issued & mask) == mask) {
			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
			     " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch)
				__touch_cap(cap);
			return 1;
		}

		/* does a combination of caps satisfy mask? */
		have |= cap->issued;
		if ((have & mask) == mask) {
			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
			     " (mask %s)\n", ceph_ino(&ci->netfs.inode),
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch) {
				struct rb_node *q;

				/* touch this + preceding caps */
				__touch_cap(cap);
				for (q = rb_first(&ci->i_caps); q != p;
				     q = rb_next(q)) {
					cap = rb_entry(q, struct ceph_cap,
						       ci_node);
					if (!__cap_is_valid(cap))
						continue;
					if (cap->issued & mask)
						__touch_cap(cap);
				}
			}
			return 1;
		}
	}

	return 0;
}
|
||
|
|
||
|
int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
|
||
|
int touch)
|
||
|
{
|
||
|
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
|
||
|
int r;
|
||
|
|
||
|
r = __ceph_caps_issued_mask(ci, mask, touch);
|
||
|
if (r)
|
||
|
ceph_update_cap_hit(&fsc->mdsc->metric);
|
||
|
else
|
||
|
ceph_update_cap_mis(&fsc->mdsc->metric);
|
||
|
return r;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Return true if mask caps are currently being revoked by an MDS.
|
||
|
*/
|
||
|
int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
|
||
|
struct ceph_cap *ocap, int mask)
|
||
|
{
|
||
|
struct ceph_cap *cap;
|
||
|
struct rb_node *p;
|
||
|
|
||
|
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
|
||
|
cap = rb_entry(p, struct ceph_cap, ci_node);
|
||
|
if (cap != ocap &&
|
||
|
(cap->implemented & ~cap->issued & mask))
|
||
|
return 1;
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
|
||
|
{
|
||
|
struct inode *inode = &ci->netfs.inode;
|
||
|
int ret;
|
||
|
|
||
|
spin_lock(&ci->i_ceph_lock);
|
||
|
ret = __ceph_caps_revoking_other(ci, NULL, mask);
|
||
|
spin_unlock(&ci->i_ceph_lock);
|
||
|
dout("ceph_caps_revoking %p %s = %d\n", inode,
|
||
|
ceph_cap_string(mask), ret);
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
int __ceph_caps_used(struct ceph_inode_info *ci)
|
||
|
{
|
||
|
int used = 0;
|
||
|
if (ci->i_pin_ref)
|
||
|
used |= CEPH_CAP_PIN;
|
||
|
if (ci->i_rd_ref)
|
||
|
used |= CEPH_CAP_FILE_RD;
|
||
|
if (ci->i_rdcache_ref ||
|
||
|
(S_ISREG(ci->netfs.inode.i_mode) &&
|
||
|
ci->netfs.inode.i_data.nrpages))
|
||
|
used |= CEPH_CAP_FILE_CACHE;
|
||
|
if (ci->i_wr_ref)
|
||
|
used |= CEPH_CAP_FILE_WR;
|
||
|
if (ci->i_wb_ref || ci->i_wrbuffer_ref)
|
||
|
used |= CEPH_CAP_FILE_BUFFER;
|
||
|
if (ci->i_fx_ref)
|
||
|
used |= CEPH_CAP_FILE_EXCL;
|
||
|
return used;
|
||
|
}
|
||
|
|
||
|
#define FMODE_WAIT_BIAS 1000
|
||
|
|
||
|
/*
|
||
|
* wanted, by virtue of open file modes
|
||
|
*/
|
||
|
/*
 * Compute which caps we want purely from open file modes and recent
 * use timestamps.  Directories get a coarse SHARED/EXCL answer;
 * regular files map recently/heavily used RD/WR/LAZY modes to cap
 * bits via ceph_caps_for_mode().
 */
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
	const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
	const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
	const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
	const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
	struct ceph_mount_options *opt =
		ceph_inode_to_client(&ci->netfs.inode)->mount_options;
	/* cutoffs: last-use times newer than these keep caps wanted */
	unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
	unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;

	if (S_ISDIR(ci->netfs.inode.i_mode)) {
		int want = 0;

		/* use used_cutoff here, to keep dir's wanted caps longer */
		if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
		    time_after(ci->i_last_rd, used_cutoff))
			want |= CEPH_CAP_ANY_SHARED;

		if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
		    time_after(ci->i_last_wr, used_cutoff)) {
			want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
			if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
				want |= CEPH_CAP_ANY_DIR_OPS;
		}

		if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
			want |= CEPH_CAP_PIN;

		return want;
	} else {
		int bits = 0;

		/*
		 * open-for-read: want RD caps while opens remain
		 * (FMODE_WAIT_BIAS-weighted opens always count) or while
		 * recently used; otherwise only until idle_cutoff passes.
		 */
		if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
			if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
			    time_after(ci->i_last_rd, used_cutoff))
				bits |= 1 << RD_SHIFT;
		} else if (time_after(ci->i_last_rd, idle_cutoff)) {
			bits |= 1 << RD_SHIFT;
		}

		if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
			if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
			    time_after(ci->i_last_wr, used_cutoff))
				bits |= 1 << WR_SHIFT;
		} else if (time_after(ci->i_last_wr, idle_cutoff)) {
			bits |= 1 << WR_SHIFT;
		}

		/* check lazyio only when read/write is wanted */
		if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
		    ci->i_nr_by_mode[LAZY_SHIFT] > 0)
			bits |= 1 << LAZY_SHIFT;

		return bits ? ceph_caps_for_mode(bits >> 1) : 0;
	}
}
|
||
|
|
||
|
/*
|
||
|
* wanted, by virtue of open file modes AND cap refs (buffered/cached data)
|
||
|
*/
|
||
|
int __ceph_caps_wanted(struct ceph_inode_info *ci)
|
||
|
{
|
||
|
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
|
||
|
if (S_ISDIR(ci->netfs.inode.i_mode)) {
|
||
|
/* we want EXCL if holding caps of dir ops */
|
||
|
if (w & CEPH_CAP_ANY_DIR_OPS)
|
||
|
w |= CEPH_CAP_FILE_EXCL;
|
||
|
} else {
|
||
|
/* we want EXCL if dirty data */
|
||
|
if (w & CEPH_CAP_FILE_BUFFER)
|
||
|
w |= CEPH_CAP_FILE_EXCL;
|
||
|
}
|
||
|
return w;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Return caps we have registered with the MDS(s) as 'wanted'.
|
||
|
*/
|
||
|
int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
|
||
|
{
|
||
|
struct ceph_cap *cap;
|
||
|
struct rb_node *p;
|
||
|
int mds_wanted = 0;
|
||
|
|
||
|
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
|
||
|
cap = rb_entry(p, struct ceph_cap, ci_node);
|
||
|
if (check && !__cap_is_valid(cap))
|
||
|
continue;
|
||
|
if (cap == ci->i_auth_cap)
|
||
|
mds_wanted |= cap->mds_wanted;
|
||
|
else
|
||
|
mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
|
||
|
}
|
||
|
return mds_wanted;
|
||
|
}
|
||
|
|
||
|
int ceph_is_any_caps(struct inode *inode)
|
||
|
{
|
||
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||
|
int ret;
|
||
|
|
||
|
spin_lock(&ci->i_ceph_lock);
|
||
|
ret = __ceph_is_any_real_caps(ci);
|
||
|
spin_unlock(&ci->i_ceph_lock);
|
||
|
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/*
 * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
 *
 * caller should hold i_ceph_lock.
 * caller will not hold session s_mutex if called from destroy_inode.
 */
void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
	struct ceph_mds_session *session = cap->session;
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_mds_client *mdsc;
	int removed = 0;	/* set once the cap has left the session list */

	/* 'ci' being NULL means the removal has already occurred */
	if (!ci) {
		dout("%s: cap inode is NULL\n", __func__);
		return;
	}

	lockdep_assert_held(&ci->i_ceph_lock);

	dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);

	mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;

	/* remove from inode's cap rbtree, and clear auth cap */
	rb_erase(&cap->ci_node, &ci->i_caps);
	if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	/* remove from session list */
	spin_lock(&session->s_cap_lock);
	if (session->s_cap_iterator == cap) {
		/* not yet, we are iterating over this very cap; removal is
		 * deferred (presumably completed by the iterator — confirm
		 * in iterate_session_caps) */
		dout("__ceph_remove_cap delaying %p removal from session %p\n",
		     cap, cap->session);
	} else {
		list_del_init(&cap->session_caps);
		session->s_nr_caps--;
		atomic64_dec(&mdsc->metric.total_caps);
		cap->session = NULL;
		removed = 1;
	}
	/* protect backpointer with s_cap_lock: see iterate_session_caps */
	cap->ci = NULL;

	/*
	 * s_cap_reconnect is protected by s_cap_lock. no one changes
	 * s_cap_gen while session is in the reconnect state.
	 */
	if (queue_release &&
	    (!session->s_cap_reconnect ||
	     cap->cap_gen == atomic_read(&session->s_cap_gen))) {
		/* hand the cap to the release queue instead of freeing it
		 * below: clearing 'removed' skips the ceph_put_cap() */
		cap->queue_release = 1;
		if (removed) {
			__ceph_queue_cap_release(session, cap);
			removed = 0;
		}
	} else {
		cap->queue_release = 0;
	}
	/* record the ino for the (possibly deferred) release message,
	 * since cap->ci was just cleared */
	cap->cap_ino = ci->i_vino.ino;

	spin_unlock(&session->s_cap_lock);

	if (removed)
		ceph_put_cap(mdsc, cap);

	if (!__ceph_is_any_real_caps(ci)) {
		/* when reconnect denied, we remove session caps forcibly,
		 * i_wr_ref can be non-zero. If there are ongoing write,
		 * keep i_snap_realm.
		 */
		if (ci->i_wr_ref == 0 && ci->i_snap_realm)
			ceph_change_snap_realm(&ci->netfs.inode, NULL);

		__cap_delay_cancel(mdsc, ci);
	}
}
|
||
|
|
||
|
/*
 * Sanity-checking wrapper around __ceph_remove_cap(): warn if the auth
 * cap is being dropped while the inode still has dirty state, unless we
 * are blocklisted or the inode has been shut down.
 */
void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_fs_client *fsc;

	/* a NULL 'ci' means the removal has already occurred */
	if (!ci) {
		dout("%s: cap inode is NULL\n", __func__);
		return;
	}

	lockdep_assert_held(&ci->i_ceph_lock);

	fsc = ceph_inode_to_client(&ci->netfs.inode);
	WARN_ON_ONCE(ci->i_auth_cap == cap &&
		     !list_empty(&ci->i_dirty_item) &&
		     !fsc->blocklisted &&
		     !ceph_inode_is_shutdown(&ci->netfs.inode));

	__ceph_remove_cap(cap, queue_release);
}
|
||
|
|
||
|
struct cap_msg_args {
|
||
|
struct ceph_mds_session *session;
|
||
|
u64 ino, cid, follows;
|
||
|
u64 flush_tid, oldest_flush_tid, size, max_size;
|
||
|