original development tree for Linux kernel GTP module; now long in mainline.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
linux-gtp/include/linux/fs.h

2118 lines
73 KiB

#ifndef _LINUX_FS_H
#define _LINUX_FS_H
/*
* This file has definitions for some important file table
* structures etc.
*/
#include <linux/limits.h>
#include <linux/ioctl.h>
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
* the file limit at runtime and only root can increase the per-process
* nr_file rlimit, so it's safe to set up a ridiculously high absolute
* upper limit on files-per-process.
*
* Some programs (notably those using select()) may have to be
* recompiled to take full advantage of the new limits..
*/
/* Fixed constants first: */
#undef NR_OPEN
#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
#define INR_OPEN 1024 /* Initial setting for nfile rlimits */
#define BLOCK_SIZE_BITS 10
#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
#define SEEK_SET 0 /* seek relative to beginning of file */
#define SEEK_CUR 1 /* seek relative to current file position */
#define SEEK_END 2 /* seek relative to end of file */
#define SEEK_MAX SEEK_END
/* And dynamically-tunable limits and defaults: */
struct files_stat_struct {
int nr_files; /* read only */
int nr_free_files; /* read only */
int max_files; /* tunable */
};
extern struct files_stat_struct files_stat;
extern int get_max_files(void);
struct inodes_stat_t {
int nr_inodes;
int nr_unused;
int dummy[5]; /* padding for sysctl ABI compatibility */
};
extern struct inodes_stat_t inodes_stat;
extern int leases_enable, lease_break_time;
#ifdef CONFIG_DNOTIFY
extern int dir_notify_enable;
#endif
#define NR_FILE 8192 /* this can well be larger on a larger system */
#define MAY_EXEC 1
#define MAY_WRITE 2
#define MAY_READ 4
#define MAY_APPEND 8
#define FMODE_READ 1
#define FMODE_WRITE 2
/* Internal kernel extensions */
#define FMODE_LSEEK 4
#define FMODE_PREAD 8
#define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */
/* File is being opened for execution. Primary users of this flag are
distributed filesystems that can use it to achieve correct ETXTBUSY
behavior for cross-node execution/opening_for_writing of files */
#define FMODE_EXEC 16
#define RW_MASK 1
#define RWA_MASK 2
#define READ 0
#define WRITE 1
#define READA 2 /* read-ahead - don't block if no resources */
#define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */
#define READ_SYNC (READ | (1 << BIO_RW_SYNC))
#define READ_META (READ | (1 << BIO_RW_META))
#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC))
#define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
#define SEL_IN 1
#define SEL_OUT 2
#define SEL_EX 4
/* public flags for file_system_type */
#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
#define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move()
* during rename() internally.
*/
/*
* These are the fs-independent mount-flags: up to 32 flags are supported
*/
#define MS_RDONLY 1 /* Mount read-only */
#define MS_NOSUID 2 /* Ignore suid and sgid bits */
#define MS_NODEV 4 /* Disallow access to device special files */
#define MS_NOEXEC 8 /* Disallow program execution */
#define MS_SYNCHRONOUS 16 /* Writes are synced at once */
#define MS_REMOUNT 32 /* Alter flags of a mounted FS */
#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
#define MS_DIRSYNC 128 /* Directory modifications are synchronous */
#define MS_NOATIME 1024 /* Do not update access times. */
#define MS_NODIRATIME 2048 /* Do not update directory access times */
#define MS_BIND 4096
#define MS_MOVE 8192
#define MS_REC 16384
#define MS_VERBOSE 32768 /* War is peace. Verbosity is silence.
MS_VERBOSE is deprecated. */
#define MS_SILENT 32768
#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */
#define MS_UNBINDABLE (1<<17) /* change to unbindable */
#define MS_PRIVATE (1<<18) /* change to private */
#define MS_SLAVE (1<<19) /* change to slave */
#define MS_SHARED (1<<20) /* change to shared */
#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */
#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
#define MS_I_VERSION (1<<23) /* Update inode I_version field */
#define MS_ACTIVE (1<<30)
#define MS_NOUSER (1<<31)
/*
* Superblock flags that can be altered by MS_REMOUNT
*/
#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK)
/*
* Old magic mount flag and mask
*/
#define MS_MGC_VAL 0xC0ED0000
#define MS_MGC_MSK 0xffff0000
/* Inode flags - they have nothing to superblock flags now */
#define S_SYNC 1 /* Writes are synced at once */
#define S_NOATIME 2 /* Do not update access times */
#define S_APPEND 4 /* Append-only file */
#define S_IMMUTABLE 8 /* Immutable file */
#define S_DEAD 16 /* removed, but still open directory */
#define S_NOQUOTA 32 /* Inode is not counted to quota */
#define S_DIRSYNC 64 /* Directory modifications are synchronous */
#define S_NOCMTIME 128 /* Do not update file c/mtime */
#define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE 512 /* Inode is fs-internal */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
* flags just means all the inodes inherit those flags by default. It might be
* possible to override it selectively if you really wanted to with some
* ioctl() that is not currently implemented.
*
* Exception: MS_RDONLY is always applied to the entire file system.
*
* Unfortunately, it is possible to change a filesystems flags with it mounted
* with files in use. This means that all of the inodes will not have their
* i_flags updated. Hence, i_flags no longer inherit the superblock mount
* flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
*/
#define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg))
#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \
((inode)->i_flags & S_SYNC))
#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \
((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)
#define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME)
#define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION)
#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode) ((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE)
#define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL)
#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME)
#define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE)
#define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE)
/* the read-only stuff doesn't really belong here, but any other place is
probably as bad and I don't want to create yet another include file. */
#define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */
#define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */
#define BLKRRPART _IO(0x12,95) /* re-read partition table */
#define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */
#define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */
#define BLKRASET _IO(0x12,98) /* set read ahead for block device */
#define BLKRAGET _IO(0x12,99) /* get current read ahead setting */
#define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
#define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
#define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
#define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
#define BLKSSZGET _IO(0x12,104)/* get block device sector size */
#if 0
#define BLKPG _IO(0x12,105)/* See blkpg.h */
/* Some people are morons. Do not use sizeof! */
#define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */
#define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */
/* This was here just to show that the number is taken -
probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
#endif
/* A jump here: 108-111 have been used for various private purposes. */
#define BLKBSZGET _IOR(0x12,112,size_t)
#define BLKBSZSET _IOW(0x12,113,size_t)
#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */
#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
#define BLKTRACESTART _IO(0x12,116)
#define BLKTRACESTOP _IO(0x12,117)
#define BLKTRACETEARDOWN _IO(0x12,118)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */
#define FS_IOC_GETFLAGS _IOR('f', 1, long)
#define FS_IOC_SETFLAGS _IOW('f', 2, long)
#define FS_IOC_GETVERSION _IOR('v', 1, long)
#define FS_IOC_SETVERSION _IOW('v', 2, long)
#define FS_IOC32_GETFLAGS _IOR('f', 1, int)
#define FS_IOC32_SETFLAGS _IOW('f', 2, int)
#define FS_IOC32_GETVERSION _IOR('v', 1, int)
#define FS_IOC32_SETVERSION _IOW('v', 2, int)
/*
* Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
*/
#define FS_SECRM_FL 0x00000001 /* Secure deletion */
#define FS_UNRM_FL 0x00000002 /* Undelete */
#define FS_COMPR_FL 0x00000004 /* Compress file */
#define FS_SYNC_FL 0x00000008 /* Synchronous updates */
#define FS_IMMUTABLE_FL 0x00000010 /* Immutable file */
#define FS_APPEND_FL 0x00000020 /* writes to file may only append */
#define FS_NODUMP_FL 0x00000040 /* do not dump file */
#define FS_NOATIME_FL 0x00000080 /* do not update atime */
/* Reserved for compression usage... */
#define FS_DIRTY_FL 0x00000100
#define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
#define FS_NOCOMP_FL 0x00000400 /* Don't compress */
#define FS_ECOMPR_FL 0x00000800 /* Compression error */
/* End compression flags --- maybe not all used */
#define FS_BTREE_FL 0x00001000 /* btree format dir */
#define FS_INDEX_FL 0x00001000 /* hash-indexed directory */
#define FS_IMAGIC_FL 0x00002000 /* AFS directory */
#define FS_JOURNAL_DATA_FL 0x00004000 /* Reserved for ext3 */
#define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */
#define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
#define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define FS_EXTENT_FL 0x00080000 /* Extents */
#define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */
#define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */
#define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
#define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
#define SYNC_FILE_RANGE_WAIT_BEFORE 1
#define SYNC_FILE_RANGE_WRITE 2
#define SYNC_FILE_RANGE_WAIT_AFTER 4
#ifdef __KERNEL__
#include <linux/linkage.h>
#include <linux/wait.h>
#include <linux/types.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/prio_tree.h>
#include <linux/init.h>
#include <linux/pid.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
#include <asm/byteorder.h>
struct export_operations;
struct hd_geometry;
struct iovec;
struct nameidata;
struct kiocb;
struct pipe_inode_info;
struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
extern void __init mnt_init(void);
extern void __init files_init(unsigned long);
struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
ssize_t bytes, void *private);
/*
* Attribute flags. These should be or-ed together to figure out what
* has been changed!
*/
#define ATTR_MODE 1
#define ATTR_UID 2
#define ATTR_GID 4
#define ATTR_SIZE 8
#define ATTR_ATIME 16
#define ATTR_MTIME 32
#define ATTR_CTIME 64
#define ATTR_ATIME_SET 128
#define ATTR_MTIME_SET 256
#define ATTR_FORCE 512 /* Not a change, but a change it */
#define ATTR_ATTR_FLAG 1024
#define ATTR_KILL_SUID 2048
#define ATTR_KILL_SGID 4096
#define ATTR_FILE 8192
Implement file posix capabilities Implement file posix capabilities. This allows programs to be given a subset of root's powers regardless of who runs them, without having to use setuid and giving the binary all of root's powers. This version works with Kaigai Kohei's userspace tools, found at http://www.kaigai.gr.jp/index.php. For more information on how to use this patch, Chris Friedhoff has posted a nice page at http://www.friedhoff.org/fscaps.html. Changelog: Nov 27: Incorporate fixes from Andrew Morton (security-introduce-file-caps-tweaks and security-introduce-file-caps-warning-fix) Fix Kconfig dependency. Fix change signaling behavior when file caps are not compiled in. Nov 13: Integrate comments from Alexey: Remove CONFIG_ ifdef from capability.h, and use %zd for printing a size_t. Nov 13: Fix endianness warnings by sparse as suggested by Alexey Dobriyan. Nov 09: Address warnings of unused variables at cap_bprm_set_security when file capabilities are disabled, and simultaneously clean up the code a little, by pulling the new code into a helper function. Nov 08: For pointers to required userspace tools and how to use them, see http://www.friedhoff.org/fscaps.html. Nov 07: Fix the calculation of the highest bit checked in check_cap_sanity(). Nov 07: Allow file caps to be enabled without CONFIG_SECURITY, since capabilities are the default. Hook cap_task_setscheduler when !CONFIG_SECURITY. Move capable(TASK_KILL) to end of cap_task_kill to reduce audit messages. Nov 05: Add secondary calls in selinux/hooks.c to task_setioprio and task_setscheduler so that selinux and capabilities with file cap support can be stacked. Sep 05: As Seth Arnold points out, uid checks are out of place for capability code. Sep 01: Define task_setscheduler, task_setioprio, cap_task_kill, and task_setnice to make sure a user cannot affect a process in which they called a program with some fscaps. One remaining question is the note under task_setscheduler: are we ok with CAP_SYS_NICE being sufficient to confine a process to a cpuset? It is a semantic change, as without fsccaps, attach_task doesn't allow CAP_SYS_NICE to override the uid equivalence check. But since it uses security_task_setscheduler, which elsewhere is used where CAP_SYS_NICE can be used to override the uid equivalence check, fixing it might be tough. task_setscheduler note: this also controls cpuset:attach_task. Are we ok with CAP_SYS_NICE being used to confine to a cpuset? task_setioprio task_setnice sys_setpriority uses this (through set_one_prio) for another process. Need same checks as setrlimit Aug 21: Updated secureexec implementation to reflect the fact that euid and uid might be the same and nonzero, but the process might still have elevated caps. Aug 15: Handle endianness of xattrs. Enforce capability version match between kernel and disk. Enforce that no bits beyond the known max capability are set, else return -EPERM. With this extra processing, it may be worth reconsidering doing all the work at bprm_set_security rather than d_instantiate. Aug 10: Always call getxattr at bprm_set_security, rather than caching it at d_instantiate. [morgan@kernel.org: file-caps clean up for linux/capability.h] [bunk@kernel.org: unexport cap_inode_killpriv] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Cc: Stephen Smalley <sds@tycho.nsa.gov> Cc: James Morris <jmorris@namei.org> Cc: Chris Wright <chrisw@sous-sol.org> Cc: Andrew Morgan <morgan@kernel.org> Signed-off-by: Andrew Morgan <morgan@kernel.org> Signed-off-by: Adrian Bunk <bunk@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
15 years ago
#define ATTR_KILL_PRIV 16384
#define ATTR_OPEN 32768 /* Truncating from open(O_TRUNC) */
/*
* This is the Inode Attributes structure, used for notify_change(). It
* uses the above definitions as flags, to know which values have changed.
* Also, in this manner, a Filesystem can look at only the values it cares
* about. Basically, these are the attributes that the VFS layer can
* request to change from the FS layer.
*
* Derek Atkins <warlord@MIT.EDU> 94-10-20
*/
struct iattr {
unsigned int ia_valid;
umode_t ia_mode;
uid_t ia_uid;
gid_t ia_gid;
loff_t ia_size;
struct timespec ia_atime;
struct timespec ia_mtime;
struct timespec ia_ctime;
/*
* Not an attribute, but an auxilary info for filesystems wanting to
* implement an ftruncate() like method. NOTE: filesystem should
* check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
*/
struct file *ia_file;
};
/*
* Includes for diskquotas.
*/
#include <linux/quota.h>
/**
* enum positive_aop_returns - aop return codes with specific semantics
*
* @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
* completed, that the page is still locked, and
* should be considered active. The VM uses this hint
* to return the page to the active list -- it won't
* be a candidate for writeback again in the near
* future. Other callers must be careful to unlock
* the page if they get this return. Returned by
* writepage();
*
* @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
* unlocked it and the page might have been truncated.
* The caller should back up to acquiring a new page and
* trying again. The aop will be taking reasonable
* precautions not to livelock. If the caller held a page
* reference, it should drop it before retrying. Returned
* by readpage().
*
* address_space_operation functions return these large constants to indicate
* special semantics to the caller. These are much larger than the bytes in a
* page to allow for functions that return the number of bytes operated on in a
* given page.
*/
enum positive_aop_returns {
AOP_WRITEPAGE_ACTIVATE = 0x80000,
AOP_TRUNCATED_PAGE = 0x80001,
};
#define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */
#define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */
/*
* oh the beauties of C type declarations.
*/
struct page;
struct address_space;
struct writeback_control;
struct iov_iter {
const struct iovec *iov;
unsigned long nr_segs;
size_t iov_offset;
size_t count;
};
size_t iov_iter_copy_from_user_atomic(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes);
size_t iov_iter_copy_from_user(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
size_t iov_iter_single_seg_count(struct iov_iter *i);
static inline void iov_iter_init(struct iov_iter *i,
const struct iovec *iov, unsigned long nr_segs,
size_t count, size_t written)
{
i->iov = iov;
i->nr_segs = nr_segs;
i->iov_offset = 0;
i->count = count + written;
iov_iter_advance(i, written);
}
static inline size_t iov_iter_count(struct iov_iter *i)
{
return i->count;
}
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
void (*sync_page)(struct page *);
/* Write back some dirty pages from this mapping. */
int (*writepages)(struct address_space *, struct writeback_control *);
/* Set a page dirty. Return true if this dirtied it */
int (*set_page_dirty)(struct page *page);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
/*
* ext3 requires that a successful prepare_write() call be followed
* by a commit_write() call - they must be balanced
*/
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
int (*write_end)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, gfp_t);
ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
struct page* (*get_xip_page)(struct address_space *, sector_t,
int);
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct address_space *,
struct page *, struct page *);
int (*launder_page) (struct page *);
};
/*
* pagecache_write_begin/pagecache_write_end must be used by general code
* to write into the pagecache.
*/
int pagecache_write_begin(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
int pagecache_write_end(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
struct backing_dev_info;
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
rwlock_t tree_lock; /* and rwlock protecting it */
unsigned int i_mmap_writable;/* count VM_SHARED mappings */
struct prio_tree_root i_mmap; /* tree of private and shared mappings */
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
spinlock_t i_mmap_lock; /* protect tree, count, list */
unsigned int truncate_count; /* Cover race condition with truncate */
unsigned long nrpages; /* number of total pages */
pgoff_t writeback_index;/* writeback starts here */
const struct address_space_operations *a_ops; /* methods */
unsigned long flags; /* error bits/gfp mask */
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
spinlock_t private_lock; /* for use by the address_space */
struct list_head private_list; /* ditto */
struct address_space *assoc_mapping; /* ditto */
} __attribute__((aligned(sizeof(long))));
/*
* On most architectures that alignment is already the case; but
* must be enforced here for CRIS, to let the least signficant bit
* of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
*/
struct block_device {
dev_t bd_dev; /* not a kdev_t - it's a search key */
struct inode * bd_inode; /* will die */
int bd_openers;
struct mutex bd_mutex; /* open/close mutex */
struct semaphore bd_mount_sem;
struct list_head bd_inodes;
void * bd_holder;
int bd_holders;
#ifdef CONFIG_SYSFS
struct list_head bd_holder_list;
#endif
struct block_device * bd_contains;
unsigned bd_block_size;
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
int bd_invalidated;
struct gendisk * bd_disk;
struct list_head bd_list;
struct backing_dev_info *bd_inode_backing_dev_info;
/*
* Private data. You must have bd_claim'ed the block_device
* to use this. NOTE: bd_claim allows an owner to claim
* the same device multiple times, the owner must take special
* care to not mess up bd_private for that case.
*/
unsigned long bd_private;
};
/*
* Radix-tree tags, for tagging dirty and writeback pages within the pagecache
* radix trees
*/
#define PAGECACHE_TAG_DIRTY 0
#define PAGECACHE_TAG_WRITEBACK 1
int mapping_tagged(struct address_space *mapping, int tag);
/*
* Might pages of this file be mapped into userspace?
*/
static inline int mapping_mapped(struct address_space *mapping)
{
return !prio_tree_empty(&mapping->i_mmap) ||
!list_empty(&mapping->i_mmap_nonlinear);
}
/*
* Might pages of this file have been modified in userspace?
* Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff
* marks vma as VM_SHARED if it is shared, and the file was opened for
* writing i.e. vma may be mprotected writable even if now readonly.
*/
static inline int mapping_writably_mapped(struct address_space *mapping)
{
return mapping->i_mmap_writable != 0;
}
/*
* Use sequence counter to get consistent i_size on 32-bit processors.
*/
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
#define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif
struct inode {
struct hlist_node i_hash;
struct list_head i_list;
struct list_head i_sb_list;
struct list_head i_dentry;
unsigned long i_ino;
atomic_t i_count;
unsigned int i_nlink;
uid_t i_uid;
gid_t i_gid;
dev_t i_rdev;
u64 i_version;
loff_t i_size;
#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif
struct timespec i_atime;
struct timespec i_mtime;
struct timespec i_ctime;
unsigned int i_blkbits;
blkcnt_t i_blocks;
unsigned short i_bytes;
[PATCH] Save some bytes in struct inode [acme@newtoy net-2.6.20]$ pahole --cacheline 64 fs/inode.o inode /* /pub/scm/linux/kernel/git/acme/net-2.6.20/include/linux/dcache.h:86 */ struct inode { struct hlist_node i_hash; /* 0 8 */ struct list_head i_list; /* 8 8 */ struct list_head i_sb_list; /* 16 8 */ struct list_head i_dentry; /* 24 8 */ long unsigned int i_ino; /* 32 4 */ atomic_t i_count; /* 36 4 */ umode_t i_mode; /* 40 2 */ /* XXX 2 bytes hole, try to pack */ unsigned int i_nlink; /* 44 4 */ uid_t i_uid; /* 48 4 */ gid_t i_gid; /* 52 4 */ dev_t i_rdev; /* 56 4 */ loff_t i_size; /* 60 8 */ struct timespec i_atime; /* 68 8 */ struct timespec i_mtime; /* 76 8 */ struct timespec i_ctime; /* 84 8 */ unsigned int i_blkbits; /* 92 4 */ long unsigned int i_version; /* 96 4 */ blkcnt_t i_blocks; /* 100 4 */ short unsigned int i_bytes; /* 104 2 */ /* XXX 2 bytes hole, try to pack */ spinlock_t i_lock; /* 108 40 */ struct mutex i_mutex; /* 148 76 */ struct rw_semaphore i_alloc_sem; /* 224 64 */ struct inode_operations * i_op; /* 288 4 */ const struct file_operations * i_fop; /* 292 4 */ struct super_block * i_sb; /* 296 4 */ struct file_lock * i_flock; /* 300 4 */ struct address_space * i_mapping; /* 304 4 */ struct address_space i_data; /* 308 188 */ struct list_head i_devices; /* 496 8 */ union ; /* 504 4 */ int i_cindex; /* 508 4 */ __u32 i_generation; /* 512 4 */ /* ---------- cacheline 8 boundary ---------- */ long unsigned int i_dnotify_mask; /* 516 4 */ struct dnotify_struct * i_dnotify; /* 520 4 */ struct list_head inotify_watches; /* 524 8 */ struct mutex inotify_mutex; /* 532 76 */ long unsigned int i_state; /* 608 4 */ long unsigned int dirtied_when; /* 612 4 */ unsigned int i_flags; /* 616 4 */ atomic_t i_writecount; /* 620 4 */ void * i_security; /* 624 4 */ void * i_private; /* 628 4 */ }; /* size: 632, sum members: 628, holes: 2, sum holes: 4 */ [acme@newtoy net-2.6.20]$ So just moving i_mode to after i_bytes we save 4 bytes by nuking both holes: [acme@newtoy net-2.6.20]$ codiff -V /tmp/inode.o.before fs/inode.o /pub/scm/linux/kernel/git/acme/net-2.6.20/fs/inode.c: struct inode | -4 i_mode; from: umode_t /* 40(0) 2(0) */ to: umode_t /* 102(0) 2(0) */ 1 struct changed [acme@newtoy net-2.6.20]$ I've prunned all the other offset changes, only this one is of interest here. So now we have: [acme@newtoy net-2.6.20]$ pahole --cacheline 64 ../OUTPUT/qemu/net-2.6.20/fs/inode.o inode /* /pub/scm/linux/kernel/git/acme/net-2.6.20/include/linux/dcache.h:86 */ struct inode { struct hlist_node i_hash; /* 0 8 */ struct list_head i_list; /* 8 8 */ struct list_head i_sb_list; /* 16 8 */ struct list_head i_dentry; /* 24 8 */ long unsigned int i_ino; /* 32 4 */ atomic_t i_count; /* 36 4 */ unsigned int i_nlink; /* 40 4 */ uid_t i_uid; /* 44 4 */ gid_t i_gid; /* 48 4 */ dev_t i_rdev; /* 52 4 */ loff_t i_size; /* 56 8 */ /* ---------- cacheline 1 boundary ---------- */ struct timespec i_atime; /* 64 8 */ struct timespec i_mtime; /* 72 8 */ struct timespec i_ctime; /* 80 8 */ unsigned int i_blkbits; /* 88 4 */ long unsigned int i_version; /* 92 4 */ blkcnt_t i_blocks; /* 96 4 */ short unsigned int i_bytes; /* 100 2 */ umode_t i_mode; /* 102 2 */ spinlock_t i_lock; /* 104 40 */ struct mutex i_mutex; /* 144 76 */ struct rw_semaphore i_alloc_sem; /* 220 64 */ struct inode_operations * i_op; /* 284 4 */ const struct file_operations * i_fop; /* 288 4 */ struct super_block * i_sb; /* 292 4 */ struct file_lock * i_flock; /* 296 4 */ struct address_space * i_mapping; /* 300 4 */ struct address_space i_data; /* 304 188 */ struct list_head i_devices; /* 492 8 */ union ; /* 500 4 */ int i_cindex; /* 504 4 */ __u32 i_generation; /* 508 4 */ /* ---------- cacheline 8 boundary ---------- */ long unsigned int i_dnotify_mask; /* 512 4 */ struct dnotify_struct * i_dnotify; /* 516 4 */ struct list_head inotify_watches; /* 520 8 */ struct mutex inotify_mutex; /* 528 76 */ long unsigned int i_state; /* 604 4 */ long unsigned int dirtied_when; /* 608 4 */ unsigned int i_flags; /* 612 4 */ atomic_t i_writecount; /* 616 4 */ void * i_security; /* 620 4 */ void * i_private; /* 624 4 */ }; /* size: 628 */ [acme@newtoy net-2.6.20]$ Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
16 years ago
umode_t i_mode;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
struct mutex i_mutex;
struct rw_semaphore i_alloc_sem;
const struct inode_operations *i_op;
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct super_block *i_sb;
struct file_lock *i_flock;
struct address_space *i_mapping;
struct address_space i_data;
#ifdef CONFIG_QUOTA
struct dquot *i_dquot[MAXQUOTAS];
#endif
struct list_head i_devices;
union {
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;
struct cdev *i_cdev;
};
int i_cindex;
__u32 i_generation;
#ifdef CONFIG_DNOTIFY
unsigned long i_dnotify_mask; /* Directory notify events */
struct dnotify_struct *i_dnotify; /* for directory notifications */
#endif
#ifdef CONFIG_INOTIFY
struct list_head inotify_watches; /* watches on this inode */
struct mutex inotify_mutex; /* protects the watches list */
#endif
unsigned long i_state;
unsigned long dirtied_when; /* jiffies of first dirtying */
unsigned int i_flags;
atomic_t i_writecount;
#ifdef CONFIG_SECURITY
void *i_security;
#endif
void *i_private; /* fs or device private pointer */
};
/*
* inode->i_mutex nesting subclasses for the lock validator:
*
* 0: the object of the current VFS operation
* 1: parent
* 2: child/target
* 3: quota file
*
* The locking order between these classes is
* parent -> child -> normal -> xattr -> quota
*/
enum inode_i_mutex_lock_class
{
I_MUTEX_NORMAL,
I_MUTEX_PARENT,
I_MUTEX_CHILD,
I_MUTEX_XATTR,
I_MUTEX_QUOTA
};
extern void inode_double_lock(struct inode *inode1, struct inode *inode2);
extern void inode_double_unlock(struct inode *inode1, struct inode *inode2);
/*
* NOTE: in a 32bit arch with a preemptable kernel and
* an UP compile the i_size_read/write must be atomic
* with respect to the local cpu (unlike with preempt disabled),
* but they don't need to be atomic with respect to other cpus like in
* true SMP (so they need either to either locally disable irq around
* the read or for example on x86 they can be still implemented as a
* cmpxchg8b without the need of the lock prefix). For SMP compiles
* and 64bit archs it makes no difference if preempt is enabled or not.
*/
static inline loff_t i_size_read(const struct inode *inode)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
loff_t i_size;
unsigned int seq;
do {
seq = read_seqcount_begin(&inode->i_size_seqcount);
i_size = inode->i_size;
} while (read_seqcount_retry(&inode->i_size_seqcount, seq));
return i_size;
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
loff_t i_size;
preempt_disable();
i_size = inode->i_size;
preempt_enable();
return i_size;
#else
return inode->i_size;
#endif
}
/*
* NOTE: unlike i_size_read(), i_size_write() does need locking around it
* (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
* can be lost, resulting in subsequent i_size_read() calls spinning forever.
*/
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
write_seqcount_begin(&inode->i_size_seqcount);
inode->i_size = i_size;
write_seqcount_end(&inode->i_size_seqcount);
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
preempt_disable();
inode->i_size = i_size;
preempt_enable();
#else
inode->i_size = i_size;
#endif
}
static inline unsigned iminor(const struct inode *inode)
{
return MINOR(inode->i_rdev);
}
static inline unsigned imajor(const struct inode *inode)
{
return MAJOR(inode->i_rdev);
}
extern struct block_device *I_BDEV(struct inode *inode);
struct fown_struct {
rwlock_t lock; /* protects pid, uid, euid fields */
struct pid *pid; /* pid or -pgrp where SIGIO should be sent */
enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
uid_t uid, euid; /* uid/euid of process setting the owner */
int signum; /* posix.1b rt signal to be delivered on IO */
};
/*
* Track a single file's readahead state
*/
struct file_ra_state {