Browse Source

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (205 commits)
  ceph: update for write_inode API change
  ceph: reset osd after relevant messages timed out
  ceph: fix flush_dirty_caps race with caps migration
  ceph: include migrating caps in issued set
  ceph: fix osdmap decoding when pools include (removed) snaps
  ceph: return EBADF if waiting for caps on closed file
  ceph: set osd request message front length correctly
  ceph: reset front len on return to msgpool; BUG on mismatched front iov
  ceph: fix snaptrace decoding on cap migration between mds
  ceph: use single osd op reply msg
  ceph: reset bits on connection close
  ceph: remove bogus mds forward warning
  ceph: remove fragile __map_osds optimization
  ceph: fix connection fault STANDBY check
  ceph: invalidate_authorizer without con->mutex held
  ceph: don't clobber write return value when using O_SYNC
  ceph: fix client_request_forward decoding
  ceph: drop messages on unregistered mds sessions; cleanup
  ceph: fix comments, locking in destroy_inode
  ceph: move dereference after NULL test
  ...

Fix trivial conflicts in Documentation/ioctl/ioctl-number.txt
master
Linus Torvalds 12 years ago
parent
commit
fc7f99cf36
  1. 139
      Documentation/filesystems/ceph.txt
  2. 1
      Documentation/ioctl/ioctl-number.txt
  3. 9
      MAINTAINERS
  4. 1
      fs/Kconfig
  5. 1
      fs/Makefile
  6. 27
      fs/ceph/Kconfig
  7. 39
      fs/ceph/Makefile
  8. 20
      fs/ceph/README
  9. 1188
      fs/ceph/addr.c
  10. 99
      fs/ceph/armor.c
  11. 257
      fs/ceph/auth.c
  12. 84
      fs/ceph/auth.h
  13. 121
      fs/ceph/auth_none.c
  14. 28
      fs/ceph/auth_none.h
  15. 656
      fs/ceph/auth_x.c
  16. 49
      fs/ceph/auth_x.h
  17. 90
      fs/ceph/auth_x_protocol.h
  18. 78
      fs/ceph/buffer.c
  19. 39
      fs/ceph/buffer.h
  20. 2927
      fs/ceph/caps.c
  21. 37
      fs/ceph/ceph_debug.h
  22. 21
      fs/ceph/ceph_frag.c
  23. 109
      fs/ceph/ceph_frag.h
  24. 74
      fs/ceph/ceph_fs.c
  25. 650
      fs/ceph/ceph_fs.h
  26. 118
      fs/ceph/ceph_hash.c
  27. 13
      fs/ceph/ceph_hash.h
  28. 176
      fs/ceph/ceph_strings.c
  29. 151
      fs/ceph/crush/crush.c
  30. 180
      fs/ceph/crush/crush.h
  31. 149
      fs/ceph/crush/hash.c
  32. 17
      fs/ceph/crush/hash.h
  33. 596
      fs/ceph/crush/mapper.c
  34. 20
      fs/ceph/crush/mapper.h
  35. 408
      fs/ceph/crypto.c
  36. 48
      fs/ceph/crypto.h
  37. 483
      fs/ceph/debugfs.c
  38. 194
      fs/ceph/decode.h
  39. 1220
      fs/ceph/dir.c
  40. 223
      fs/ceph/export.c
  41. 937
      fs/ceph/file.c
  42. 1750
      fs/ceph/inode.c
  43. 160
      fs/ceph/ioctl.c
  44. 40
      fs/ceph/ioctl.h
  45. 3021
      fs/ceph/mds_client.c
  46. 335
      fs/ceph/mds_client.h
  47. 174
      fs/ceph/mdsmap.c
  48. 54
      fs/ceph/mdsmap.h
  49. 2240
      fs/ceph/messenger.c
  50. 254
      fs/ceph/messenger.h
  51. 834
      fs/ceph/mon_client.c
  52. 119
      fs/ceph/mon_client.h
  53. 186
      fs/ceph/msgpool.c
  54. 27
      fs/ceph/msgpool.h
  55. 158
      fs/ceph/msgr.h
  56. 1537
      fs/ceph/osd_client.c
  57. 166
      fs/ceph/osd_client.h
  58. 1019
      fs/ceph/osdmap.c
  59. 125
      fs/ceph/osdmap.h
  60. 54
      fs/ceph/pagelist.c
  61. 54
      fs/ceph/pagelist.h
  62. 374
      fs/ceph/rados.h
  63. 904
      fs/ceph/snap.c
  64. 1030
      fs/ceph/super.c
  65. 901
      fs/ceph/super.h
  66. 29
      fs/ceph/types.h
  67. 844
      fs/ceph/xattr.c

139
Documentation/filesystems/ceph.txt

@ -0,0 +1,139 @@ @@ -0,0 +1,139 @@
Ceph Distributed File System
============================
Ceph is a distributed network file system designed to provide good
performance, reliability, and scalability.
Basic features include:
* POSIX semantics
* Seamless scaling from 1 to many thousands of nodes
* High availability and reliability. No single points of failure.
* N-way replication of data across storage nodes
* Fast recovery from node failures
* Automatic rebalancing of data on node addition/removal
* Easy deployment: most FS components are userspace daemons
Also,
* Flexible snapshots (on any directory)
* Recursive accounting (nested files, directories, bytes)
In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
on symmetric access by all clients to shared block devices, Ceph
separates data and metadata management into independent server
clusters, similar to Lustre. Unlike Lustre, however, metadata and
storage nodes run entirely as user space daemons. Storage nodes
utilize btrfs to store data objects, leveraging its advanced features
(checksumming, metadata replication, etc.). File data is striped
across storage nodes in large chunks to distribute workload and
facilitate high throughputs. When storage nodes fail, data is
re-replicated in a distributed fashion by the storage nodes themselves
(with some minimal coordination from a cluster monitor), making the
system extremely efficient and scalable.
Metadata servers effectively form a large, consistent, distributed
in-memory cache above the file namespace that is extremely scalable,
dynamically redistributes metadata in response to workload changes,
and can tolerate arbitrary (well, non-Byzantine) node failures. The
metadata server takes a somewhat unconventional approach to metadata
storage to significantly improve performance for common workloads. In
particular, inodes with only a single link are embedded in
directories, allowing entire directories of dentries and inodes to be
loaded into its cache with a single I/O operation. The contents of
extremely large directories can be fragmented and managed by
independent metadata servers, allowing scalable concurrent access.
The system offers automatic data rebalancing/migration when scaling
from a small cluster of just a few nodes to many hundreds, without
requiring an administrator to carve the data set into static volumes or
go through the tedious process of migrating data between servers.
When the file system approaches full, new nodes can be easily added
and things will "just work."
Ceph includes a flexible snapshot mechanism that allows a user to create
a snapshot on any subdirectory (and its nested contents) in the
system. Snapshot creation and deletion are as simple as 'mkdir
.snap/foo' and 'rmdir .snap/foo'.
Ceph also provides some recursive accounting on directories for nested
files and bytes. That is, a 'getfattr -d foo' on any directory in the
system will reveal the total number of nested regular files and
subdirectories, and a summation of all nested file sizes. This makes
the identification of large disk space consumers relatively quick, as
no 'du' or similar recursive scan of the file system is required.
Mount Syntax
============
The basic mount syntax is:
# mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
You only need to specify a single monitor, as the client will get the
full list when it connects. (However, if the monitor you specify
happens to be down, the mount won't succeed.) The port can be left
off if the monitor is using the default. So if the monitor is at
1.2.3.4,
# mount -t ceph 1.2.3.4:/ /mnt/ceph
is sufficient. If /sbin/mount.ceph is installed, a hostname can be
used instead of an IP address.
Mount Options
=============
ip=A.B.C.D[:N]
Specify the IP and/or port the client should bind to locally.
There is normally not much reason to do this. If the IP is not
specified, the client's IP address is determined by looking at the
address its connection to the monitor originates from.
wsize=X
Specify the maximum write size in bytes. By default there is no
maximum. Ceph will normally size writes based on the file stripe
size.
rsize=X
Specify the maximum readahead.
mount_timeout=X
Specify the timeout value for mount (in seconds), in the case
of a non-responsive Ceph file system. The default is 30
seconds.
rbytes
When stat() is called on a directory, set st_size to 'rbytes',
the summation of file sizes over all files nested beneath that
directory. This is the default.
norbytes
When stat() is called on a directory, set st_size to the
number of entries in that directory.
nocrc
Disable CRC32C calculation for data writes. If set, the OSD
must rely on TCP's error correction to detect data corruption
in the data payload.
noasyncreaddir
Disable the client's use of its local cache to satisfy readdir
requests. (This does not change correctness; the client uses
cached metadata only when a lease or capability ensures it is
valid.)
More Information
================
For more information on Ceph, see the home page at
http://ceph.newdream.net/
The Linux kernel client source tree is available at
git://ceph.newdream.net/linux-ceph-client.git
and the source for the full system is at
git://ceph.newdream.net/ceph.git

1
Documentation/ioctl/ioctl-number.txt

@ -291,6 +291,7 @@ Code Seq#(hex) Include File Comments @@ -291,6 +291,7 @@ Code Seq#(hex) Include File Comments
0x92 00-0F drivers/usb/mon/mon_bin.c
0x93 60-7F linux/auto_fs.h
0x94 all fs/btrfs/ioctl.h
0x97 00-7F fs/ceph/ioctl.h Ceph file system
0x99 00-0F 537-Addinboard driver
<mailto:buk@buks.ipn.de>
0xA0 all linux/sdp/sdp.h Industrial Device Project

9
MAINTAINERS

@ -1441,6 +1441,15 @@ F: arch/powerpc/include/asm/spu*.h @@ -1441,6 +1441,15 @@ F: arch/powerpc/include/asm/spu*.h
F: arch/powerpc/oprofile/*cell*
F: arch/powerpc/platforms/cell/
CEPH DISTRIBUTED FILE SYSTEM CLIENT
M: Sage Weil <sage@newdream.net>
L: ceph-devel@lists.sourceforge.net
W: http://ceph.newdream.net/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
S: Supported
F: Documentation/filesystems/ceph.txt
F: fs/ceph
CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
M: David Vrabel <david.vrabel@csr.com>
L: linux-usb@vger.kernel.org

1
fs/Kconfig

@ -235,6 +235,7 @@ config NFS_COMMON @@ -235,6 +235,7 @@ config NFS_COMMON
source "net/sunrpc/Kconfig"
source "fs/smbfs/Kconfig"
source "fs/ceph/Kconfig"
source "fs/cifs/Kconfig"
source "fs/ncpfs/Kconfig"
source "fs/coda/Kconfig"

1
fs/Makefile

@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ @@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_EXOFS_FS) += exofs/
obj-$(CONFIG_CEPH_FS) += ceph/

27
fs/ceph/Kconfig

@ -0,0 +1,27 @@ @@ -0,0 +1,27 @@
config CEPH_FS
	tristate "Ceph distributed file system (EXPERIMENTAL)"
	depends on INET && EXPERIMENTAL
	select LIBCRC32C
	# "select" takes the bare symbol name; with a CONFIG_ prefix the
	# symbol does not exist and AES would silently not be enabled.
	# CRYPTO is selected as well because CRYPTO_AES depends on it.
	select CRYPTO
	select CRYPTO_AES
	help
	  Choose Y or M here to include support for mounting the
	  experimental Ceph distributed file system.  Ceph is an extremely
	  scalable file system designed to provide high performance,
	  reliable access to petabytes of storage.

	  More information at http://ceph.newdream.net/.

	  If unsure, say N.
config CEPH_FS_PRETTYDEBUG
bool "Include file:line in ceph debug output"
depends on CEPH_FS
default n
help
If you say Y here, debug output will include a filename and
line to aid debugging. This increases kernel size and slows
execution slightly when debug call sites are enabled (e.g.,
via CONFIG_DYNAMIC_DEBUG).
If unsure, say N.

39
fs/ceph/Makefile

@ -0,0 +1,39 @@ @@ -0,0 +1,39 @@
#
# Makefile for CEPH filesystem.
#
# Two modes, selected by whether KERNELRELEASE is set:
#  - set:   list the objects that make up ceph.o
#  - unset: act as a thin wrapper that re-invokes make in $(KERNELDIR)
#           with M=$(PWD) and CONFIG_CEPH_FS=m
#
ifneq ($(KERNELRELEASE),)
# objects linked into ceph.o (built when CONFIG_CEPH_FS is y or m)
obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o \
messenger.o msgpool.o buffer.o pagelist.o \
mds_client.o mdsmap.o \
mon_client.o \
osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
debugfs.o \
auth.o auth_none.o \
crypto.o armor.o \
auth_x.o \
ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
else
# Otherwise we were called directly from the command
# line; invoke the kernel build system.
# KERNELDIR may be overridden on the command line or in the environment;
# it defaults to the build tree of the running kernel.
KERNELDIR ?= /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)
default: all
all:
$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
modules_install:
$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
clean:
$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
endif

20
fs/ceph/README

@ -0,0 +1,20 @@ @@ -0,0 +1,20 @@
#
# The following files are shared by (and manually synchronized
# between) the Ceph userland and kernel client.
#
# userland kernel
src/include/ceph_fs.h fs/ceph/ceph_fs.h
src/include/ceph_fs.cc fs/ceph/ceph_fs.c
src/include/msgr.h fs/ceph/msgr.h
src/include/rados.h fs/ceph/rados.h
src/include/ceph_strings.cc fs/ceph/ceph_strings.c
src/include/ceph_frag.h fs/ceph/ceph_frag.h
src/include/ceph_frag.cc fs/ceph/ceph_frag.c
src/include/ceph_hash.h fs/ceph/ceph_hash.h
src/include/ceph_hash.cc fs/ceph/ceph_hash.c
src/crush/crush.c fs/ceph/crush/crush.c
src/crush/crush.h fs/ceph/crush/crush.h
src/crush/mapper.c fs/ceph/crush/mapper.c
src/crush/mapper.h fs/ceph/crush/mapper.h
src/crush/hash.h fs/ceph/crush/hash.h
src/crush/hash.c fs/ceph/crush/hash.c

1188
fs/ceph/addr.c

File diff suppressed because it is too large Load Diff

99
fs/ceph/armor.c

@ -0,0 +1,99 @@ @@ -0,0 +1,99 @@
#include <linux/errno.h>
/*
 * base64 encode/decode.
 *
 * ceph_armor() inserts a '\n' after every 64 output characters;
 * ceph_unarmor() skips newlines wherever a 4-character group would
 * start, so anything ceph_armor() produces can be decoded again.
 */

const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Map a 6-bit value (0..63) to its base64 character. */
static int encode_bits(int c)
{
	return pem_key[c];
}

/*
 * Map a base64 character to its 6-bit value.
 *
 * The padding character '=' decodes to 0 so that it is not mistaken
 * for an invalid character; anything else outside the alphabet yields
 * -EINVAL.
 */
static int decode_bits(char c)
{
	if (c >= 'A' && c <= 'Z')
		return c - 'A';
	if (c >= 'a' && c <= 'z')
		return c - 'a' + 26;
	if (c >= '0' && c <= '9')
		return c - '0' + 52;
	if (c == '+')
		return 62;
	if (c == '/')
		return 63;
	if (c == '=')
		return 0; /* just non-negative, please */
	return -EINVAL;
}

/*
 * Encode the bytes in [src, end) as base64 into dst.
 *
 * Every 1-3 input bytes become 4 output characters ('=' padded), and a
 * '\n' is emitted after each 64 output characters.  dst is NOT
 * NUL-terminated and must be large enough for the whole result.
 *
 * Returns the number of characters written to dst.
 */
int ceph_armor(char *dst, const char *src, const char *end)
{
	int olen = 0;
	int line = 0;

	while (src < end) {
		unsigned char a, b, c;

		a = *src++;
		*dst++ = encode_bits(a >> 2);
		if (src < end) {
			b = *src++;
			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
			if (src < end) {
				c = *src++;
				*dst++ = encode_bits(((b & 15) << 2) |
						     (c >> 6));
				*dst++ = encode_bits(c & 63);
			} else {
				*dst++ = encode_bits((b & 15) << 2);
				*dst++ = '=';
			}
		} else {
			*dst++ = encode_bits(((a & 3) << 4));
			*dst++ = '=';
			*dst++ = '=';
		}
		olen += 4;
		line += 4;
		if (line == 64) {
			line = 0;
			*(dst++) = '\n';
			olen++;
		}
	}
	return olen;
}

/*
 * Decode the base64 text in [src, end) into dst.
 *
 * Newlines between 4-character groups are skipped, including a
 * trailing one (ceph_armor() emits one after a full 64-character
 * line).  Decoding stops at the first group that contains '='
 * padding.
 *
 * Returns the number of bytes written to dst, or -EINVAL if the input
 * contains an invalid character or ends in the middle of a group.
 */
int ceph_unarmor(char *dst, const char *src, const char *end)
{
	int olen = 0;

	while (src < end) {
		int a, b, c, d;

		/*
		 * Re-test the loop condition after skipping a newline so
		 * that a trailing '\n' ends the input cleanly instead of
		 * being reported as a truncated group.
		 */
		if (src[0] == '\n') {
			src++;
			continue;
		}
		if (end - src < 4)
			return -EINVAL;
		a = decode_bits(src[0]);
		b = decode_bits(src[1]);
		c = decode_bits(src[2]);
		d = decode_bits(src[3]);
		if (a < 0 || b < 0 || c < 0 || d < 0)
			return -EINVAL;

		*dst++ = (a << 2) | (b >> 4);
		if (src[2] == '=')
			return olen + 1;
		*dst++ = ((b & 15) << 4) | (c >> 2);
		if (src[3] == '=')
			return olen + 2;
		*dst++ = ((c & 3) << 6) | d;
		olen += 3;
		src += 4;
	}
	return olen;
}

257
fs/ceph/auth.c

@ -0,0 +1,257 @@ @@ -0,0 +1,257 @@
#include "ceph_debug.h"
#include <linux/module.h>
#include <linux/err.h>
#include "types.h"
#include "auth_none.h"
#include "auth_x.h"
#include "decode.h"
#include "super.h"
#include "messenger.h"
/*
 * Auth protocols listed in the hello message built by
 * ceph_auth_build_hello().  The table is only ever read, so keep it
 * const.
 */
static const u32 supported_protocols[] = {
	CEPH_AUTH_NONE,
	CEPH_AUTH_CEPHX
};
/*
 * Set up the handler for the given CEPH_AUTH_* protocol on @ac.
 *
 * Returns whatever the protocol's init function returns, or -ENOENT
 * when the protocol is not one we implement.
 */
int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
{
	if (protocol == CEPH_AUTH_NONE)
		return ceph_auth_none_init(ac);

	if (protocol == CEPH_AUTH_CEPHX)
		return ceph_x_init(ac);

	return -ENOENT;
}
/*
* setup, teardown.
*/
/*
 * Allocate a new auth client in the "negotiating" state.
 *
 * @name:   entity name, or NULL to use CEPH_AUTH_NAME_DEFAULT
 * @secret: secret key string
 *
 * Both pointers are stored as-is (not copied), so they must outlive the
 * returned client.
 *
 * Returns the new client, or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
{
	struct ceph_auth_client *ac;

	ac = kzalloc(sizeof(*ac), GFP_NOFS);
	if (!ac)
		return ERR_PTR(-ENOMEM);

	ac->negotiating = true;
	ac->name = name ? name : CEPH_AUTH_NAME_DEFAULT;
	ac->secret = secret;

	/*
	 * Log only after the default has been applied so that "%s" never
	 * sees a NULL name, and never log the secret key itself.
	 */
	dout("auth_init %p name '%s'\n", ac, ac->name);
	return ac;
}
void ceph_auth_destroy(struct ceph_auth_client *ac)
{
dout("auth_destroy %p\n", ac);
if (ac->ops)
ac->ops->destroy(ac);
kfree(ac);
}
/*
* Reset occurs when reconnecting to the monitor.
*/
void ceph_auth_reset(struct ceph_auth_client *ac)
{
dout("auth_reset %p\n", ac);
if (ac->ops && !ac->negotiating)
ac->ops->reset(ac);
ac->negotiating = true;
}
/*
 * Encode an entity name at *p: the entity type (always
 * CEPH_ENTITY_TYPE_CLIENT), the name length, then the name bytes
 * without a terminating NUL.  *p is advanced past the encoded data.
 *
 * Returns 0 on success, or -ERANGE if the encoding would run past @end.
 */
int ceph_entity_name_encode(const char *name, void **p, void *end)
{
	int name_len = strlen(name);
	void *encoded_end = *p + 2*sizeof(u32) + name_len;

	if (encoded_end > end)
		return -ERANGE;

	ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
	ceph_encode_32(p, name_len);
	ceph_encode_copy(p, name, name_len);
	return 0;
}
/*
 * Initiate protocol negotiation with monitor.  Include entity name
 * and list supported protocols.
 *
 * Layout written to @buf: monitor request header, protocol (0 = none
 * yet), payload length, then the payload: struct_v (1), protocol
 * count, the protocol ids, the entity name and our global_id.
 *
 * Returns the total number of bytes written, -ERANGE if @len is too
 * small, or the error from ceph_entity_name_encode().
 */
int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
{
	struct ceph_mon_request_header *monhdr = buf;
	void *p = monhdr + 1, *end = buf + len, *lenp;
	int i, num;
	int ret;

	dout("auth_build_hello\n");

	/* the fixed header is written unconditionally below */
	if (len < sizeof(*monhdr))
		goto bad;

	monhdr->have_version = 0;
	monhdr->session_mon = cpu_to_le16(-1);
	monhdr->session_mon_tid = 0;

	/*
	 * Check room for everything up to and including the protocol
	 * count before writing any of it: protocol (u32), payload
	 * length (u32), struct_v (u8), count (u32).
	 */
	ceph_decode_need(&p, end, 3 * sizeof(u32) + 1, bad);

	ceph_encode_32(&p, 0);  /* no protocol, yet */

	/* payload length is filled in once the payload has been built */
	lenp = p;
	p += sizeof(u32);

	ceph_encode_8(&p, 1);
	num = ARRAY_SIZE(supported_protocols);
	ceph_encode_32(&p, num);
	ceph_decode_need(&p, end, num * sizeof(u32), bad);
	for (i = 0; i < num; i++)
		ceph_encode_32(&p, supported_protocols[i]);

	ret = ceph_entity_name_encode(ac->name, &p, end);
	if (ret < 0)
		return ret;
	ceph_decode_need(&p, end, sizeof(u64), bad);
	ceph_encode_64(&p, ac->global_id);

	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
	return p - buf;

bad:
	return -ERANGE;
}
/*
 * Build an auth request for the already-selected protocol.
 *
 * Layout written to @msg_buf: monitor request header, protocol id,
 * payload length, then the payload produced by the protocol handler's
 * build_request hook.
 *
 * Returns the total number of bytes written, -ERANGE if @msg_len
 * cannot hold even the fixed part, or the (negative) error returned
 * by build_request.
 */
int ceph_build_auth_request(struct ceph_auth_client *ac,
			    void *msg_buf, size_t msg_len)
{
	struct ceph_mon_request_header *monhdr = msg_buf;
	void *p = monhdr + 1;
	void *end = msg_buf + msg_len;
	int ret;

	/*
	 * The header, the protocol word and the payload-length word are
	 * written without further checks; refuse buffers too small for
	 * them.
	 */
	if (msg_len < sizeof(*monhdr) + 2 * sizeof(u32))
		return -ERANGE;

	monhdr->have_version = 0;
	monhdr->session_mon = cpu_to_le16(-1);
	monhdr->session_mon_tid = 0;

	ceph_encode_32(&p, ac->protocol);

	/* the handler writes the payload just past the length word */
	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
	if (ret < 0) {
		pr_err("error %d building request\n", ret);
		return ret;
	}
	dout(" built request %d bytes\n", ret);
	ceph_encode_32(&p, ret);
	return p + ret - msg_buf;
}
/*
 * Handle auth message from monitor.
 *
 * Message layout: protocol (u32), result (s32), global_id (u64),
 * payload length (u32), payload, result message length (u32), result
 * message.  The message must end exactly after the result message.
 *
 * While negotiating, this also installs (or replaces) the protocol
 * handler chosen by the monitor.
 *
 * Returns 0 when the handler accepted the reply, the length of a
 * follow-up request built into @reply_buf when the handler returned
 * -EAGAIN, or a negative error.
 */
int ceph_handle_auth_reply(struct ceph_auth_client *ac,
			   void *buf, size_t len,
			   void *reply_buf, size_t reply_len)
{
	void *p = buf;
	void *end = buf + len;
	int protocol;
	s32 result;
	u64 global_id;
	void *payload, *payload_end;
	int payload_len;
	char *result_msg;
	int result_msg_len;
	int ret = -EINVAL;

	dout("handle_auth_reply %p %p\n", p, end);
	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
	protocol = ceph_decode_32(&p);
	result = ceph_decode_32(&p);
	global_id = ceph_decode_64(&p);
	payload_len = ceph_decode_32(&p);
	/*
	 * Lengths come off the wire into signed ints: reject negative or
	 * oversized values before using them to move the cursor.
	 */
	if (payload_len < 0 || payload_len > end - p)
		goto bad;
	payload = p;
	p += payload_len;
	ceph_decode_need(&p, end, sizeof(u32), bad);
	result_msg_len = ceph_decode_32(&p);
	if (result_msg_len < 0 || result_msg_len > end - p)
		goto bad;
	result_msg = p;
	p += result_msg_len;
	if (p != end)
		goto bad;

	dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
	     result_msg, global_id, payload_len);

	payload_end = payload + payload_len;

	if (global_id && ac->global_id != global_id) {
		dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
		ac->global_id = global_id;
	}

	if (ac->negotiating) {
		/* server does not support our protocols? */
		if (!protocol && result < 0) {
			ret = result;
			goto out;
		}
		/* set up (new) protocol handler? */
		if (ac->protocol && ac->protocol != protocol) {
			ac->ops->destroy(ac);
			ac->protocol = 0;
			ac->ops = NULL;
		}
		if (ac->protocol != protocol) {
			ret = ceph_auth_init_protocol(ac, protocol);
			if (ret) {
				pr_err("error %d on auth protocol %d init\n",
				       ret, protocol);
				goto out;
			}
		}

		/*
		 * A reply naming protocol 0 with a non-negative result
		 * installs no handler; bail out rather than call through
		 * a NULL ops table below.
		 */
		if (!ac->ops) {
			pr_err("auth reply selected no protocol\n");
			ret = -EINVAL;
			goto out;
		}

		ac->negotiating = false;
	}

	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
	if (ret == -EAGAIN) {
		return ceph_build_auth_request(ac, reply_buf, reply_len);
	} else if (ret) {
		pr_err("authentication error %d\n", ret);
		return ret;
	}
	return 0;

bad:
	pr_err("failed to decode auth msg\n");
out:
	return ret;
}
/*
 * Build the next auth message into @msg_buf, if one is needed:
 *  - no protocol selected yet: a hello message;
 *  - protocol selected but not authenticated: an auth request;
 *  - already authenticated: nothing, returns 0.
 *
 * Otherwise returns whatever the message builder returned.
 */
int ceph_build_auth(struct ceph_auth_client *ac,
		    void *msg_buf, size_t msg_len)
{
	if (ac->protocol) {
		BUG_ON(!ac->ops);
		if (ac->ops->is_authenticated(ac))
			return 0;
		return ceph_build_auth_request(ac, msg_buf, msg_len);
	}

	return ceph_auth_build_hello(ac, msg_buf, msg_len);
}
/*
 * Ask the protocol handler whether we are authenticated.  Without a
 * handler installed the answer is always 0.
 */
int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
{
	return ac->ops ? ac->ops->is_authenticated(ac) : 0;
}

84
fs/ceph/auth.h

@ -0,0 +1,84 @@ @@ -0,0 +1,84 @@
#ifndef _FS_CEPH_AUTH_H
#define _FS_CEPH_AUTH_H
#include "types.h"
#include "buffer.h"
/*
* Abstract interface for communicating with the authenticate module.
* There is some handshake that takes place between us and the monitor
* to acquire the necessary keys. These are used to generate an
* 'authorizer' that we use when connecting to a service (mds, osd).
*/
struct ceph_auth_client;
struct ceph_authorizer;
/*
 * Per-protocol operations table.  A protocol's init function points
 * ac->ops at its table (auth_none.c does so in ceph_auth_none_init()).
 */
struct ceph_auth_client_ops {
/*
 * true if we are authenticated and can connect to
 * services.
 */
int (*is_authenticated)(struct ceph_auth_client *ac);
/*
 * build requests and process replies during monitor
 * handshake. if handle_reply returns -EAGAIN, we build
 * another request.
 *
 * build_request writes into [buf, end) and returns the number of
 * bytes written or a negative error; the generic code calls it
 * without checking for NULL.
 */
int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
int (*handle_reply)(struct ceph_auth_client *ac, int result,
void *buf, void *end);
/*
 * Create authorizer for connecting to a service, and verify
 * the response to authenticate the service.
 */
int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
struct ceph_authorizer **a,
void **buf, size_t *len,
void **reply_buf, size_t *reply_len);
int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
struct ceph_authorizer *a, size_t len);
void (*destroy_authorizer)(struct ceph_auth_client *ac,
struct ceph_authorizer *a);
/*
 * NOTE(review): presumably drops cached credentials for the given
 * peer type so they are re-fetched; confirm against the protocol
 * implementations.
 */
void (*invalidate_authorizer)(struct ceph_auth_client *ac,
int peer_type);
/* reset when we (re)connect to a monitor */
void (*reset)(struct ceph_auth_client *ac);
/* release protocol-private state; called from ceph_auth_destroy() */
void (*destroy)(struct ceph_auth_client *ac);
};
/*
 * Authentication state for one client.  Allocated by ceph_auth_init()
 * and freed by ceph_auth_destroy(); name and secret are stored as
 * pointers only and are not freed with the client.
 */
struct ceph_auth_client {
u32 protocol; /* CEPH_AUTH_* */
void *private; /* for use by protocol implementation */
const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
bool negotiating; /* true if negotiating protocol */
const char *name; /* entity name */
u64 global_id; /* our unique id in system */
const char *secret; /* our secret key */
unsigned want_keys; /* which services we want */
};
/*
 * Generic auth client interface.
 */

/* lifecycle: allocate, free, and reset on monitor reconnect */
extern struct ceph_auth_client *ceph_auth_init(const char *name,
const char *secret);
extern void ceph_auth_destroy(struct ceph_auth_client *ac);
extern void ceph_auth_reset(struct ceph_auth_client *ac);
/* monitor handshake: build outgoing messages, process replies */
extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
void *buf, size_t len);
extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
void *buf, size_t len,
void *reply_buf, size_t reply_len);
/* encode entity type, name length and name at *p; -ERANGE if too long */
extern int ceph_entity_name_encode(const char *name, void **p, void *end);
extern int ceph_build_auth(struct ceph_auth_client *ac,
void *msg_buf, size_t msg_len);
extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
#endif

121
fs/ceph/auth_none.c

@ -0,0 +1,121 @@ @@ -0,0 +1,121 @@
#include "ceph_debug.h"
#include <linux/err.h>
#include <linux/module.h>
#include <linux/random.h>
#include "auth_none.h"
#include "auth.h"
#include "decode.h"
/*
 * Go back to the initial state: not yet authenticated, and the cached
 * authorizer must be rebuilt on next use.
 */
static void reset(struct ceph_auth_client *ac)
{
	struct ceph_auth_none_info *info = ac->private;

	info->built_authorizer = false;
	info->starting = true;
}
/* Free the protocol-private state and clear the pointer to it. */
static void destroy(struct ceph_auth_client *ac)
{
	void *priv = ac->private;

	ac->private = NULL;
	kfree(priv);
}
/* We count as authenticated once the first reply has been handled. */
static int is_authenticated(struct ceph_auth_client *ac)
{
	struct ceph_auth_none_info *info = ac->private;

	if (info->starting)
		return 0;
	return 1;
}
/*
 * The generic auth code has already decoded the global_id, and this
 * protocol carries no authentication state of its own, so all that is
 * left is to note that the handshake is over and pass the monitor's
 * result through unchanged.
 */
static int handle_reply(struct ceph_auth_client *ac, int result,
			void *buf, void *end)
{
	struct ceph_auth_none_info *info = ac->private;

	info->starting = false;
	return result;
}
/*
 * Hand out the 'authorizer' for a service connection.  It consists of
 * a version byte, our entity name and our global_id; since that is
 * the same for every service, it is encoded once into the info
 * struct and the same copy is returned on every later call.
 *
 * Returns 0 and fills in the out parameters, -ERANGE if the encoding
 * does not fit in the authorizer buffer, or the error from
 * ceph_entity_name_encode().
 */
static int ceph_auth_none_create_authorizer(
	struct ceph_auth_client *ac, int peer_type,
	struct ceph_authorizer **a,
	void **buf, size_t *len,
	void **reply_buf, size_t *reply_len)
{
	struct ceph_auth_none_info *info = ac->private;
	struct ceph_none_authorizer *none_au = &info->au;

	if (!info->built_authorizer) {
		void *pos = none_au->buf;
		void *limit = pos + sizeof(none_au->buf);
		int err;

		ceph_encode_8(&pos, 1);
		/* leave 8 bytes at the end for the global_id */
		err = ceph_entity_name_encode(ac->name, &pos, limit - 8);
		if (err < 0)
			return err;
		ceph_decode_need(&pos, limit, sizeof(u64), e_range);
		ceph_encode_64(&pos, ac->global_id);
		none_au->buf_len = pos - (void *)none_au->buf;
		info->built_authorizer = true;
		dout("built authorizer len %d\n", none_au->buf_len);
	}

	*a = (struct ceph_authorizer *)none_au;
	*buf = none_au->buf;
	*len = none_au->buf_len;
	*reply_buf = none_au->reply_buf;
	*reply_len = sizeof(none_au->reply_buf);
	return 0;

e_range:
	return -ERANGE;
}
/*
 * The authorizer handed out by create_authorizer is embedded in the
 * protocol's private info struct, so there is nothing to release here.
 */
static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
					       struct ceph_authorizer *a)
{
}
static const struct ceph_auth_client_ops ceph_auth_none_ops = {
.reset = reset,
.destroy = destroy,
.is_authenticated = is_authenticated,
.handle_reply = handle_reply,
.create_authorizer = ceph_auth_none_create_authorizer,
.destroy_authorizer = ceph_auth_none_destroy_authorizer,
};
int ceph_auth_none_init(struct ceph_auth_client *ac)
{
struct ceph_auth_none_info *xi;
dout("ceph_auth_none_init %p\n", ac);
xi = kzalloc(sizeof(*xi), GFP_NOFS);
if (!xi)
return -ENOMEM;
xi->starting = true;
xi->built_authorizer = false;
ac->protocol = CEPH_AUTH_NONE;
ac->private = xi;
ac->ops = &ceph_auth_none_ops;
return 0;
}

28
fs/ceph/auth_none.h

@ -0,0 +1,28 @@ @@ -0,0 +1,28 @@
#ifndef _FS_CEPH_AUTH_NONE_H
#define _FS_CEPH_AUTH_NONE_H
#include "auth.h"
/*
* null security mode.
*
* we use a single static authorizer that simply encodes our entity name
* and global id.
*/
/*
 * Encoded authorizer for the null security mode.
 */
struct ceph_none_authorizer {
char buf[128]; /* encoded version byte, entity name and global_id */
int buf_len; /* number of bytes of buf actually used */
/*
 * Zero-length on purpose: auth_none.c reports sizeof(reply_buf),
 * i.e. 0, as the reply buffer length.
 */
char reply_buf[0];
};
/*
 * Private state of the null security mode, stored in ac->private.
 */
struct ceph_auth_none_info {
bool starting; /* true until the first auth reply has been handled */
bool built_authorizer; /* true once au has been encoded */
struct ceph_none_authorizer au; /* we only need one; it's static */
};
extern int ceph_auth_none_init(struct ceph_auth_client *ac);
#endif

656
fs/ceph/auth_x.c

@ -0,0 +1,656 @@ @@ -0,0 +1,656 @@
#include "ceph_debug.h"
#include <linux/err.h>
#include <linux/module.h>
#include <linux/random.h>
#include "auth_x.h"
#include "auth_x_protocol.h"
#include "crypto.h"
#include "auth.h"
#include "decode.h"
/* slab cache for the scratch buffers used while processing ticket replies */
struct kmem_cache *ceph_x_ticketbuf_cachep;
/*
 * Capacity passed to ceph_x_decrypt() for those scratch buffers.
 * NOTE(review): presumably also the object size of the cache above;
 * confirm where the cache is created.
 */
#define TEMP_TICKET_BUF_LEN 256
/* defined further down; needed by ceph_x_is_authenticated() */
static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
/*
 * Re-validate our tickets, then report whether every key we want is
 * among the keys we currently have.
 */
static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
{
	struct ceph_x_info *xi = ac->private;
	int need;
	int have_all;

	ceph_x_validate_tickets(ac, &need);
	dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
	     ac->want_keys, need, xi->have_keys);

	have_all = (ac->want_keys & xi->have_keys) == ac->want_keys;
	return have_all;
}
/*
 * Encrypt @ibuf (prefixed with a ceph_x_encrypt_header) into @obuf.
 *
 * Output layout: u32 ciphertext length, then the ciphertext.
 *
 * Returns the total number of bytes written to @obuf (length word
 * included), -ERANGE if @olen cannot even hold the length word, or the
 * error returned by ceph_encrypt2().
 */
static int ceph_x_encrypt(struct ceph_crypto_key *secret,
			  void *ibuf, int ilen, void *obuf, size_t olen)
{
	struct ceph_x_encrypt_header head = {
		.struct_v = 1,
		.magic = cpu_to_le64(CEPHX_ENC_MAGIC)
	};
	size_t len;
	int ret;

	/* olen is unsigned: subtracting from a too-small value would wrap */
	if (olen < sizeof(u32))
		return -ERANGE;
	len = olen - sizeof(u32);

	ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
			    &head, sizeof(head), ibuf, ilen);
	if (ret)
		return ret;
	ceph_encode_32(&obuf, len);
	return len + sizeof(u32);
}
/*
 * Decrypt one length-prefixed encrypted blob found at *p.
 *
 * Input layout: u32 ciphertext length, then the ciphertext.  The
 * decrypted data starts with a ceph_x_encrypt_header, which is
 * checked and stripped; the remainder is stored in @obuf (capacity
 * @olen).
 *
 * On success *p is advanced past the blob and the number of bytes
 * stored in @obuf is returned.  Returns -EINVAL if the blob does not
 * fit in [*p, end), -EPERM if the header does not match, or the error
 * returned by ceph_decrypt2().
 */
static int ceph_x_decrypt(struct ceph_crypto_key *secret,
			  void **p, void *end, void *obuf, size_t olen)
{
	struct ceph_x_encrypt_header head;
	size_t head_len = sizeof(head);
	int len, ret;

	/* the length word itself must be inside the buffer */
	if (*p + sizeof(u32) > end)
		return -EINVAL;
	len = ceph_decode_32(p);
	/* len is signed: a negative value would slip past the range test */
	if (len < 0 || *p + len > end)
		return -EINVAL;

	dout("ceph_x_decrypt len %d\n", len);
	ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
			    *p, len);
	if (ret)
		return ret;
	if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
		return -EPERM;
	*p += len;
	return olen;
}
/*
 * Look up the ticket handler for @service in the rbtree of handlers,
 * which is keyed by service id.  If there is none yet, allocate a
 * zeroed one and insert it.
 *
 * Returns the handler, or ERR_PTR(-ENOMEM) if allocation fails.
 */
struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
						 int service)
{
	struct ceph_x_info *xi = ac->private;
	struct rb_node **link = &xi->ticket_handlers.rb_node;
	struct rb_node *parent = NULL;
	struct ceph_x_ticket_handler *th;

	while (*link) {
		parent = *link;
		th = rb_entry(parent, struct ceph_x_ticket_handler, node);
		if (service == th->service)
			return th;
		if (service < th->service)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	/* not found: create it and link it in where the search ended */
	th = kzalloc(sizeof(*th), GFP_NOFS);
	if (th == NULL)
		return ERR_PTR(-ENOMEM);
	th->service = service;
	rb_link_node(&th->node, parent, link);
	rb_insert_color(&th->node, &xi->ticket_handlers);
	return th;
}
static void remove_ticket_handler(struct ceph_auth_client *ac,
struct ceph_x_ticket_handler *th)
{
struct ceph_x_info *xi = ac->private;
dout("remove_ticket_handler %p %d\n", th, th->service);
rb_erase(&th->node, &xi->ticket_handlers);
ceph_crypto_key_destroy(&th->session_key);
if (th->ticket_blob)
ceph_buffer_put(th->ticket_blob);
kfree(th);
}
/*
 * Process a ticket reply found in [buf, end).
 *
 * Layout: struct_v (1), ticket count, then per ticket:
 *   service type (u32), struct_v (1),
 *   a blob encrypted with @secret holding struct_v, the new session
 *   key and the ticket validity,
 *   an is_enc flag, and the service ticket, either encrypted with the
 *   handler's previous session key or as a plain length-prefixed
 *   blob.  The service ticket holds struct_v, secret_id and the
 *   opaque ticket blob.
 *
 * Each ticket updates (or creates) the handler for its service and
 * sets the service's bit in xi->have_keys.
 *
 * Returns 0 on success, -ENOMEM, -EINVAL on malformed input, or the
 * error of a failed decrypt/decode step.
 */
static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
				    struct ceph_crypto_key *secret,
				    void *buf, void *end)
{
	struct ceph_x_info *xi = ac->private;
	int num;
	void *p = buf;
	int ret;
	char *dbuf;
	char *ticket_buf;
	u8 struct_v;

	/*
	 * Two scratch buffers; both are treated as TEMP_TICKET_BUF_LEN
	 * bytes long throughout this function.
	 * NOTE(review): GFP_NOFS | GFP_ATOMIC is an unusual combination;
	 * confirm which context this runs in.
	 */
	dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
	if (!dbuf)
		return -ENOMEM;

	ret = -ENOMEM;
	ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
				      GFP_NOFS | GFP_ATOMIC);
	if (!ticket_buf)
		goto out_dbuf;

	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
	struct_v = ceph_decode_8(&p);
	if (struct_v != 1)
		goto bad;
	num = ceph_decode_32(&p);
	dout("%d tickets\n", num);
	while (num--) {
		int type;
		struct ceph_x_ticket_handler *th;
		void *dp, *dend;
		int dlen;
		char is_enc;
		struct timespec validity;
		struct ceph_crypto_key old_key;
		void *tp, *tpend;

		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);

		type = ceph_decode_32(&p);
		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));

		struct_v = ceph_decode_8(&p);
		if (struct_v != 1)
			goto bad;

		th = get_ticket_handler(ac, type);
		if (IS_ERR(th)) {
			ret = PTR_ERR(th);
			goto out;
		}

		/* blob for me */
		dlen = ceph_x_decrypt(secret, &p, end, dbuf,
				      TEMP_TICKET_BUF_LEN);
		if (dlen <= 0) {
			ret = dlen;
			goto out;
		}
		dout(" decrypted %d bytes\n", dlen);
		dend = dbuf + dlen;
		dp = dbuf;

		struct_v = ceph_decode_8(&dp);
		if (struct_v != 1)
			goto bad;

		/* keep the old key: an encrypted service ticket uses it */
		memcpy(&old_key, &th->session_key, sizeof(old_key));
		ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
		if (ret)
			goto out;

		/* the validity must lie within the decrypted data */
		ceph_decode_need(&dp, dend, sizeof(th->validity), bad);
		ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
		ceph_decode_timespec(&validity, &th->validity);
		th->expires = get_seconds() + validity.tv_sec;
		th->renew_after = th->expires - (validity.tv_sec / 4);
		dout(" expires=%lu renew_after=%lu\n", th->expires,
		     th->renew_after);

		/* ticket blob for service */
		ceph_decode_8_safe(&p, end, is_enc, bad);
		tp = ticket_buf;
		if (is_enc) {
			int avail;

			/* encrypted */
			dout(" encrypted ticket\n");
			avail = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
					       TEMP_TICKET_BUF_LEN);
			if (avail < 0) {
				ret = avail;
				goto out;
			}
			/*
			 * The inner length word and the data it describes
			 * must both lie within what was decrypted.
			 */
			if (avail < (int)sizeof(u32))
				goto bad;
			dlen = ceph_decode_32(&tp);
			if (dlen < 0 || dlen > avail - (int)sizeof(u32))
				goto bad;
		} else {
			/* unencrypted */
			ceph_decode_32_safe(&p, end, dlen, bad);
			/*
			 * dlen comes straight off the wire; it must fit
			 * the fixed-size scratch buffer we copy into.
			 */
			if (dlen < 0 || dlen > TEMP_TICKET_BUF_LEN)
				goto bad;
			ceph_decode_need(&p, end, dlen, bad);
			ceph_decode_copy(&p, ticket_buf, dlen);
		}
		tpend = tp + dlen;
		dout(" ticket blob is %d bytes\n", dlen);
		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
		struct_v = ceph_decode_8(&tp);
		th->secret_id = ceph_decode_64(&tp);
		ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
		if (ret)
			goto out;
		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
		     type, ceph_entity_type_name(type), th->secret_id,
		     (int)th->ticket_blob->vec.iov_len);
		xi->have_keys |= th->service;
	}

	ret = 0;
out:
	kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
out_dbuf:
	kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
	return ret;

bad:
	ret = -EINVAL;
	goto out;
}
/*
 * Build (or rebuild) the authorizer blob in @au for the service that
 * ticket handler @th belongs to.
 *
 * The blob is a ceph_x_authorize_a header, followed by the raw ticket
 * blob (if the handler has one), followed by a ceph_x_authorize_b
 * carrying a fresh random nonce, encrypted with the handler's session key.
 *
 * An existing au->buf is kept when its alloc_len is large enough;
 * otherwise it is dropped and a new one is allocated.
 *
 * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
 * negative value returned by ceph_x_encrypt().  On encryption failure the
 * buffer is released and au->buf is reset to NULL.
 */
static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
				   struct ceph_x_ticket_handler *th,
				   struct ceph_x_authorizer *au)
{
	struct ceph_x_authorize_a *msg_a;
	struct ceph_x_authorize_b msg_b;
	void *p, *end;
	int ticket_blob_len = 0;
	int len;
	int ret;

	if (th->ticket_blob)
		ticket_blob_len = th->ticket_blob->vec.iov_len;

	dout("build_authorizer for %s %p\n",
	     ceph_entity_type_name(th->service), au);

	/* both messages, a u32, the ticket blob, plus 16 bytes of slack */
	len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
		ticket_blob_len + 16;
	dout(" need len %d\n", len);

	/* an existing buffer that is too small gets replaced */
	if (au->buf && au->buf->alloc_len < len) {
		ceph_buffer_put(au->buf);
		au->buf = NULL;
	}
	if (!au->buf) {
		au->buf = ceph_buffer_new(len, GFP_NOFS);
		if (!au->buf)
			return -ENOMEM;
	}
	au->service = th->service;

	/* plaintext part: header + ticket blob */
	msg_a = au->buf->vec.iov_base;
	msg_a->struct_v = 1;
	msg_a->global_id = cpu_to_le64(ac->global_id);
	msg_a->service_id = cpu_to_le32(th->service);
	msg_a->ticket_blob.struct_v = 1;
	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
	if (ticket_blob_len)
		memcpy(msg_a->ticket_blob.blob,
		       th->ticket_blob->vec.iov_base,
		       th->ticket_blob->vec.iov_len);
	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
	     le64_to_cpu(msg_a->ticket_blob.secret_id));

	/* the encrypted part starts right after the ticket blob */
	p = (void *)(msg_a + 1) + ticket_blob_len;
	/*
	 * NOTE(review): when au->buf is reused, iov_len is whatever the
	 * previous call trimmed it to at the bottom of this function, not
	 * alloc_len -- confirm that bound is the intended one here.
	 */
	end = au->buf->vec.iov_base + au->buf->vec.iov_len;

	/* encrypted part: nonce */
	get_random_bytes(&au->nonce, sizeof(au->nonce));
	msg_b.struct_v = 1;
	msg_b.nonce = cpu_to_le64(au->nonce);
	ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
			     p, end - p);
	if (ret < 0) {
		ceph_buffer_put(au->buf);
		au->buf = NULL;
		return ret;
	}
	p += ret;

	/* trim the buffer length to what was actually written */
	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
	dout(" built authorizer nonce %llx len %d\n", au->nonce,
	     (int)au->buf->vec.iov_len);
	return 0;
}
/*
 * Encode the ticket held by @th at *@p, advancing *@p.
 *
 * Writes a version byte (1), the 64-bit secret_id, and then the ticket
 * blob as a 32-bit length followed by its bytes.  A handler without a
 * ticket blob is encoded with a zero length.
 *
 * @th is dereferenced unconditionally; callers must pass a valid handler.
 *
 * Returns 0 on success, or -ERANGE if [*p, end) is too small.
 */
static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
				void **p, void *end)
{
	const char *blob;
	u32 blob_len;

	/* room for the version byte and secret_id */
	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
	ceph_encode_8(p, 1);
	ceph_encode_64(p, th->secret_id);

	if (!th->ticket_blob) {
		/* no ticket yet: empty blob */
		ceph_encode_32_safe(p, end, 0, bad);
		return 0;
	}

	blob = th->ticket_blob->vec.iov_base;
	blob_len = th->ticket_blob->vec.iov_len;
	ceph_encode_32_safe(p, end, blob_len, bad);
	ceph_encode_copy_safe(p, end, blob, blob_len, bad);
	return 0;

bad:
	return -ERANGE;
}
/*
 * Work out which service keys have to be (re)requested.
 *
 * *@pneed starts as the wanted keys that are not in xi->have_keys.  For
 * every wanted key that is held, the ticket handler is looked up:
 *  - if the handler cannot be obtained, the key is added to *@pneed;
 *  - if the current time has reached renew_after, the key is added to
 *    *@pneed;
 *  - if the current time has reached expires, the key is also cleared
 *    from xi->have_keys.
 */
static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
{
	int want = ac->want_keys;
	struct ceph_x_info *xi = ac->private;
	int service;

	*pneed = ac->want_keys & ~(xi->have_keys);

	/* walk each service bit up to the highest wanted one */
	for (service = 1; service <= want; service <<= 1) {
		struct ceph_x_ticket_handler *th;

		if (!(ac->want_keys & service))
			continue;

		/* already known to be missing */
		if (*pneed & service)
			continue;

		th = get_ticket_handler(ac, service);
		/*
		 * Failure is reported as an ERR_PTR() (the other callers in
		 * this file test IS_ERR()), so a NULL test would let an
		 * error pointer through to the dereferences below.
		 */
		if (IS_ERR(th)) {
			*pneed |= service;
			continue;
		}

		if (get_seconds() >= th->renew_after)
			*pneed |= service;
		if (get_seconds() >= th->expires)
			xi->have_keys &= ~service;
	}
}
static int ceph_x_build_request(struct ceph_auth_client *ac,
void *buf, void *end)
{
struct ceph_x_info *xi = ac->private;
int need;
struct ceph_x_request_header *head = buf;
int ret;
struct ceph_x_ticket_handler *th =
get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
ceph_x_validate_tickets(ac, &need);
dout("build_request want %x have %x need %x\n",
ac->want_keys, xi->have_keys, need);
if (need & CEPH_ENTITY_TYPE_AUTH) {
struct ceph_x_authenticate *auth = (void *)(head + 1);
void *p = auth + 1;
struct ceph_x_challenge_blob tmp;
char tmp_enc[40];
u64 *u;
if (p > end)
return -ERANGE;
dout(" get_auth_session_key\n");
head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
/* encrypt and hash */
get_random_bytes(&auth->client_challenge, sizeof(u64));
tmp.client_challenge = auth->client_challenge;
tmp.server_challenge = cpu_to_le64(xi->server_challenge);
ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
tmp_enc, sizeof(tmp_enc));
if (ret < 0)
return ret;
auth->struct_v = 1;
auth->key = 0;
for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
auth->key ^= *u;
dout(" server_challenge %llx client_challenge %llx key %llx\n",
xi->server_challenge, le64_to_cpu(auth->client_challenge),
le64_to_cpu(auth->key));
/* now encode the old ticket if exists */
ret = ceph_x_encode_ticket(th, &p, end);
if (ret < 0)
return ret;
return p - buf;
}
if (need) {
void *p = head + 1;
struct ceph_x_service_ticket_request *req;
if (p > end)
return -ERANGE;
head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
BUG_ON(!th);
ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
if (ret)
return ret;
ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
xi->auth_authorizer.buf->vec.iov_len);
req = p;
req->keys = cpu_to_le32(need);
p += sizeof(*req);
return p - buf;
}
return 0;
}
/*
 * Process a cephx reply found in [@buf, @end).
 *
 * A non-zero @result is returned to the caller unchanged.
 *
 * While xi->starting is set, the reply must be exactly a server challenge:
 * it is recorded, the auth key is marked as not held, and -EAGAIN is
 * returned.
 *
 * Otherwise the reply header selects how the ticket payload is decrypted:
 * with xi->secret for CEPHX_GET_AUTH_SESSION_KEY, or with the auth
 * service's session key for CEPHX_GET_PRINCIPAL_SESSION_KEY.
 *
 * Returns 0 once every wanted key is held, -EAGAIN if more keys are still
 * outstanding, -EINVAL for a malformed reply or unknown op, or a negative
 * errno from the ticket handler lookup or ticket processing.
 */
static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
			       void *buf, void *end)
{
	struct ceph_x_info *xi = ac->private;
	struct ceph_x_reply_header *head = buf;
	struct ceph_x_ticket_handler *th;
	int len = end - buf;
	int op;
	int ret;

	if (result)
		return result;  /* XXX hmm? */

	if (xi->starting) {
		/* it's a hello */
		struct ceph_x_server_challenge *sc = buf;

		if (len != sizeof(*sc))
			return -EINVAL;
		xi->server_challenge = le64_to_cpu(sc->server_challenge);
		dout("handle_reply got server challenge %llx\n",
		     xi->server_challenge);
		xi->starting = false;
		xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
		return -EAGAIN;
	}

	/* do not read the header fields from a reply too short to hold them */
	if (len < (int)sizeof(*head))
		return -EINVAL;

	/*
	 * NOTE(review): the request header's op is written with
	 * cpu_to_le16() in ceph_x_build_request(); confirm that the reply
	 * header's op really is a 32-bit field.
	 */
	op = le32_to_cpu(head->op);
	result = le32_to_cpu(head->result);
	dout("handle_reply op %d result %d\n", op, result);

	switch (op) {
	case CEPHX_GET_AUTH_SESSION_KEY:
		/* verify auth key */
		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
					       buf + sizeof(*head), end);
		break;

	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
		/*
		 * Failure is reported as an ERR_PTR() (the other callers in
		 * this file test IS_ERR()); a NULL test would not catch it
		 * before th->session_key is used.
		 */
		if (IS_ERR(th))
			return PTR_ERR(th);
		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
					       buf + sizeof(*head), end);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (ac->want_keys == xi->have_keys)
		return 0;
	return -EAGAIN;
}
/*
 * Allocate and build a new authorizer for a peer of type @peer_type.
 *
 * On success the new authorizer is stored in *@a, *@buf / *@len describe
 * the built authorizer blob, and *@reply_buf / *@reply_len describe the
 * reply area embedded in the authorizer.
 *
 * Returns 0 on success, -ENOMEM if the authorizer cannot be allocated, or
 * the error from the ticket handler lookup or ceph_x_build_authorizer().
 * Nothing is stored through the output pointers on failure.
 */
static int ceph_x_create_authorizer(
	struct ceph_auth_client *ac, int peer_type,
	struct ceph_authorizer **a,
	void **buf, size_t *len,
	void **reply_buf, size_t *reply_len)
{
	struct ceph_x_ticket_handler *handler;
	struct ceph_x_authorizer *xau;
	int err;

	handler = get_ticket_handler(ac, peer_type);
	if (IS_ERR(handler))
		return PTR_ERR(handler);

	xau = kzalloc(sizeof(*xau), GFP_NOFS);
	if (!xau)
		return -ENOMEM;

	err = ceph_x_build_authorizer(ac, handler, xau);
	if (err)
		goto out_free;

	/* hand the authorizer and its buffers back to the caller */
	*a = (struct ceph_authorizer *)xau;
	*buf = xau->buf->vec.iov_base;
	*len = xau->buf->vec.iov_len;
	*reply_buf = xau->reply_buf;
	*reply_len = sizeof(xau->reply_buf);
	return 0;

out_free:
	kfree(xau);
	return err;
}
static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
struct ceph_authorizer *a, size_t len)
{
struct ceph_x_au