Merge tag 'x86-urgent-2023-09-10' of git://git.kernel.org/pub/scm/linux/kernel/git...
[platform/kernel/linux-rpi.git] / fs / ocfs2 / file.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * file.c
4  *
5  * File open, close, extend, truncate
6  *
7  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
8  */
9
10 #include <linux/capability.h>
11 #include <linux/fs.h>
12 #include <linux/types.h>
13 #include <linux/slab.h>
14 #include <linux/highmem.h>
15 #include <linux/pagemap.h>
16 #include <linux/uio.h>
17 #include <linux/sched.h>
18 #include <linux/splice.h>
19 #include <linux/mount.h>
20 #include <linux/writeback.h>
21 #include <linux/falloc.h>
22 #include <linux/quotaops.h>
23 #include <linux/blkdev.h>
24 #include <linux/backing-dev.h>
25
26 #include <cluster/masklog.h>
27
28 #include "ocfs2.h"
29
30 #include "alloc.h"
31 #include "aops.h"
32 #include "dir.h"
33 #include "dlmglue.h"
34 #include "extent_map.h"
35 #include "file.h"
36 #include "sysfile.h"
37 #include "inode.h"
38 #include "ioctl.h"
39 #include "journal.h"
40 #include "locks.h"
41 #include "mmap.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "xattr.h"
45 #include "acl.h"
46 #include "quota.h"
47 #include "refcounttree.h"
48 #include "ocfs2_trace.h"
49
50 #include "buffer_head_io.h"
51
52 static int ocfs2_init_file_private(struct inode *inode, struct file *file)
53 {
54         struct ocfs2_file_private *fp;
55
56         fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
57         if (!fp)
58                 return -ENOMEM;
59
60         fp->fp_file = file;
61         mutex_init(&fp->fp_mutex);
62         ocfs2_file_lock_res_init(&fp->fp_flock, fp);
63         file->private_data = fp;
64
65         return 0;
66 }
67
68 static void ocfs2_free_file_private(struct inode *inode, struct file *file)
69 {
70         struct ocfs2_file_private *fp = file->private_data;
71         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
72
73         if (fp) {
74                 ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
75                 ocfs2_lock_res_free(&fp->fp_flock);
76                 kfree(fp);
77                 file->private_data = NULL;
78         }
79 }
80
81 static int ocfs2_file_open(struct inode *inode, struct file *file)
82 {
83         int status;
84         int mode = file->f_flags;
85         struct ocfs2_inode_info *oi = OCFS2_I(inode);
86
87         trace_ocfs2_file_open(inode, file, file->f_path.dentry,
88                               (unsigned long long)oi->ip_blkno,
89                               file->f_path.dentry->d_name.len,
90                               file->f_path.dentry->d_name.name, mode);
91
92         if (file->f_mode & FMODE_WRITE) {
93                 status = dquot_initialize(inode);
94                 if (status)
95                         goto leave;
96         }
97
98         spin_lock(&oi->ip_lock);
99
100         /* Check that the inode hasn't been wiped from disk by another
101          * node. If it hasn't then we're safe as long as we hold the
102          * spin lock until our increment of open count. */
103         if (oi->ip_flags & OCFS2_INODE_DELETED) {
104                 spin_unlock(&oi->ip_lock);
105
106                 status = -ENOENT;
107                 goto leave;
108         }
109
110         if (mode & O_DIRECT)
111                 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
112
113         oi->ip_open_count++;
114         spin_unlock(&oi->ip_lock);
115
116         status = ocfs2_init_file_private(inode, file);
117         if (status) {
118                 /*
119                  * We want to set open count back if we're failing the
120                  * open.
121                  */
122                 spin_lock(&oi->ip_lock);
123                 oi->ip_open_count--;
124                 spin_unlock(&oi->ip_lock);
125         }
126
127         file->f_mode |= FMODE_NOWAIT;
128
129 leave:
130         return status;
131 }
132
133 static int ocfs2_file_release(struct inode *inode, struct file *file)
134 {
135         struct ocfs2_inode_info *oi = OCFS2_I(inode);
136
137         spin_lock(&oi->ip_lock);
138         if (!--oi->ip_open_count)
139                 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
140
141         trace_ocfs2_file_release(inode, file, file->f_path.dentry,
142                                  oi->ip_blkno,
143                                  file->f_path.dentry->d_name.len,
144                                  file->f_path.dentry->d_name.name,
145                                  oi->ip_open_count);
146         spin_unlock(&oi->ip_lock);
147
148         ocfs2_free_file_private(inode, file);
149
150         return 0;
151 }
152
/*
 * Open handler for directories: only the per-open private data is set up.
 * Returns whatever ocfs2_init_file_private() returns.
 */
static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
        int status = ocfs2_init_file_private(inode, file);

        return status;
}
157
/*
 * Release handler for directories: frees the per-open private data
 * attached by ocfs2_dir_open().  Always returns 0.
 */
static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
        ocfs2_free_file_private(inode, file);

        return 0;
}
163
164 static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
165                            int datasync)
166 {
167         int err = 0;
168         struct inode *inode = file->f_mapping->host;
169         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
170         struct ocfs2_inode_info *oi = OCFS2_I(inode);
171         journal_t *journal = osb->journal->j_journal;
172         int ret;
173         tid_t commit_tid;
174         bool needs_barrier = false;
175
176         trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
177                               oi->ip_blkno,
178                               file->f_path.dentry->d_name.len,
179                               file->f_path.dentry->d_name.name,
180                               (unsigned long long)datasync);
181
182         if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
183                 return -EROFS;
184
185         err = file_write_and_wait_range(file, start, end);
186         if (err)
187                 return err;
188
189         commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
190         if (journal->j_flags & JBD2_BARRIER &&
191             !jbd2_trans_will_send_data_barrier(journal, commit_tid))
192                 needs_barrier = true;
193         err = jbd2_complete_transaction(journal, commit_tid);
194         if (needs_barrier) {
195                 ret = blkdev_issue_flush(inode->i_sb->s_bdev);
196                 if (!err)
197                         err = ret;
198         }
199
200         if (err)
201                 mlog_errno(err);
202
203         return (err < 0) ? -EIO : 0;
204 }
205
206 int ocfs2_should_update_atime(struct inode *inode,
207                               struct vfsmount *vfsmnt)
208 {
209         struct timespec64 now;
210         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
211
212         if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
213                 return 0;
214
215         if ((inode->i_flags & S_NOATIME) ||
216             ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
217                 return 0;
218
219         /*
220          * We can be called with no vfsmnt structure - NFSD will
221          * sometimes do this.
222          *
223          * Note that our action here is different than touch_atime() -
224          * if we can't tell whether this is a noatime mount, then we
225          * don't know whether to trust the value of s_atime_quantum.
226          */
227         if (vfsmnt == NULL)
228                 return 0;
229
230         if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
231             ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
232                 return 0;
233
234         if (vfsmnt->mnt_flags & MNT_RELATIME) {
235                 struct timespec64 ctime = inode_get_ctime(inode);
236
237                 if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
238                     (timespec64_compare(&inode->i_atime, &ctime) <= 0))
239                         return 1;
240
241                 return 0;
242         }
243
244         now = current_time(inode);
245         if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
246                 return 0;
247         else
248                 return 1;
249 }
250
251 int ocfs2_update_inode_atime(struct inode *inode,
252                              struct buffer_head *bh)
253 {
254         int ret;
255         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
256         handle_t *handle;
257         struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
258
259         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
260         if (IS_ERR(handle)) {
261                 ret = PTR_ERR(handle);
262                 mlog_errno(ret);
263                 goto out;
264         }
265
266         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
267                                       OCFS2_JOURNAL_ACCESS_WRITE);
268         if (ret) {
269                 mlog_errno(ret);
270                 goto out_commit;
271         }
272
273         /*
274          * Don't use ocfs2_mark_inode_dirty() here as we don't always
275          * have i_rwsem to guard against concurrent changes to other
276          * inode fields.
277          */
278         inode->i_atime = current_time(inode);
279         di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
280         di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
281         ocfs2_update_inode_fsync_trans(handle, inode, 0);
282         ocfs2_journal_dirty(handle, bh);
283
284 out_commit:
285         ocfs2_commit_trans(osb, handle);
286 out:
287         return ret;
288 }
289
290 int ocfs2_set_inode_size(handle_t *handle,
291                                 struct inode *inode,
292                                 struct buffer_head *fe_bh,
293                                 u64 new_i_size)
294 {
295         int status;
296
297         i_size_write(inode, new_i_size);
298         inode->i_blocks = ocfs2_inode_sector_count(inode);
299         inode->i_mtime = inode_set_ctime_current(inode);
300
301         status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
302         if (status < 0) {
303                 mlog_errno(status);
304                 goto bail;
305         }
306
307 bail:
308         return status;
309 }
310
311 int ocfs2_simple_size_update(struct inode *inode,
312                              struct buffer_head *di_bh,
313                              u64 new_i_size)
314 {
315         int ret;
316         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
317         handle_t *handle = NULL;
318
319         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
320         if (IS_ERR(handle)) {
321                 ret = PTR_ERR(handle);
322                 mlog_errno(ret);
323                 goto out;
324         }
325
326         ret = ocfs2_set_inode_size(handle, inode, di_bh,
327                                    new_i_size);
328         if (ret < 0)
329                 mlog_errno(ret);
330
331         ocfs2_update_inode_fsync_trans(handle, inode, 0);
332         ocfs2_commit_trans(osb, handle);
333 out:
334         return ret;
335 }
336
337 static int ocfs2_cow_file_pos(struct inode *inode,
338                               struct buffer_head *fe_bh,
339                               u64 offset)
340 {
341         int status;
342         u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
343         unsigned int num_clusters = 0;
344         unsigned int ext_flags = 0;
345
346         /*
347          * If the new offset is aligned to the range of the cluster, there is
348          * no space for ocfs2_zero_range_for_truncate to fill, so no need to
349          * CoW either.
350          */
351         if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
352                 return 0;
353
354         status = ocfs2_get_clusters(inode, cpos, &phys,
355                                     &num_clusters, &ext_flags);
356         if (status) {
357                 mlog_errno(status);
358                 goto out;
359         }
360
361         if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
362                 goto out;
363
364         return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
365
366 out:
367         return status;
368 }
369
370 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
371                                      struct inode *inode,
372                                      struct buffer_head *fe_bh,
373                                      u64 new_i_size)
374 {
375         int status;
376         handle_t *handle;
377         struct ocfs2_dinode *di;
378         u64 cluster_bytes;
379
380         /*
381          * We need to CoW the cluster contains the offset if it is reflinked
382          * since we will call ocfs2_zero_range_for_truncate later which will
383          * write "0" from offset to the end of the cluster.
384          */
385         status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
386         if (status) {
387                 mlog_errno(status);
388                 return status;
389         }
390
391         /* TODO: This needs to actually orphan the inode in this
392          * transaction. */
393
394         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
395         if (IS_ERR(handle)) {
396                 status = PTR_ERR(handle);
397                 mlog_errno(status);
398                 goto out;
399         }
400
401         status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
402                                          OCFS2_JOURNAL_ACCESS_WRITE);
403         if (status < 0) {
404                 mlog_errno(status);
405                 goto out_commit;
406         }
407
408         /*
409          * Do this before setting i_size.
410          */
411         cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
412         status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
413                                                cluster_bytes);
414         if (status) {
415                 mlog_errno(status);
416                 goto out_commit;
417         }
418
419         i_size_write(inode, new_i_size);
420         inode->i_mtime = inode_set_ctime_current(inode);
421
422         di = (struct ocfs2_dinode *) fe_bh->b_data;
423         di->i_size = cpu_to_le64(new_i_size);
424         di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
425         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
426         ocfs2_update_inode_fsync_trans(handle, inode, 0);
427
428         ocfs2_journal_dirty(handle, fe_bh);
429
430 out_commit:
431         ocfs2_commit_trans(osb, handle);
432 out:
433         return status;
434 }
435
436 int ocfs2_truncate_file(struct inode *inode,
437                                struct buffer_head *di_bh,
438                                u64 new_i_size)
439 {
440         int status = 0;
441         struct ocfs2_dinode *fe = NULL;
442         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
443
444         /* We trust di_bh because it comes from ocfs2_inode_lock(), which
445          * already validated it */
446         fe = (struct ocfs2_dinode *) di_bh->b_data;
447
448         trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
449                                   (unsigned long long)le64_to_cpu(fe->i_size),
450                                   (unsigned long long)new_i_size);
451
452         mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
453                         "Inode %llu, inode i_size = %lld != di "
454                         "i_size = %llu, i_flags = 0x%x\n",
455                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
456                         i_size_read(inode),
457                         (unsigned long long)le64_to_cpu(fe->i_size),
458                         le32_to_cpu(fe->i_flags));
459
460         if (new_i_size > le64_to_cpu(fe->i_size)) {
461                 trace_ocfs2_truncate_file_error(
462                         (unsigned long long)le64_to_cpu(fe->i_size),
463                         (unsigned long long)new_i_size);
464                 status = -EINVAL;
465                 mlog_errno(status);
466                 goto bail;
467         }
468
469         down_write(&OCFS2_I(inode)->ip_alloc_sem);
470
471         ocfs2_resv_discard(&osb->osb_la_resmap,
472                            &OCFS2_I(inode)->ip_la_data_resv);
473
474         /*
475          * The inode lock forced other nodes to sync and drop their
476          * pages, which (correctly) happens even if we have a truncate
477          * without allocation change - ocfs2 cluster sizes can be much
478          * greater than page size, so we have to truncate them
479          * anyway.
480          */
481
482         if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
483                 unmap_mapping_range(inode->i_mapping,
484                                     new_i_size + PAGE_SIZE - 1, 0, 1);
485                 truncate_inode_pages(inode->i_mapping, new_i_size);
486                 status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
487                                                i_size_read(inode), 1);
488                 if (status)
489                         mlog_errno(status);
490
491                 goto bail_unlock_sem;
492         }
493
494         /* alright, we're going to need to do a full blown alloc size
495          * change. Orphan the inode so that recovery can complete the
496          * truncate if necessary. This does the task of marking
497          * i_size. */
498         status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
499         if (status < 0) {
500                 mlog_errno(status);
501                 goto bail_unlock_sem;
502         }
503
504         unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
505         truncate_inode_pages(inode->i_mapping, new_i_size);
506
507         status = ocfs2_commit_truncate(osb, inode, di_bh);
508         if (status < 0) {
509                 mlog_errno(status);
510                 goto bail_unlock_sem;
511         }
512
513         /* TODO: orphan dir cleanup here. */
514 bail_unlock_sem:
515         up_write(&OCFS2_I(inode)->ip_alloc_sem);
516
517 bail:
518         if (!status && OCFS2_I(inode)->ip_clusters == 0)
519                 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
520
521         return status;
522 }
523
524 /*
525  * extend file allocation only here.
526  * we'll update all the disk stuff, and oip->alloc_size
527  *
528  * expect stuff to be locked, a transaction started and enough data /
529  * metadata reservations in the contexts.
530  *
531  * Will return -EAGAIN, and a reason if a restart is needed.
532  * If passed in, *reason will always be set, even in error.
533  */
534 int ocfs2_add_inode_data(struct ocfs2_super *osb,
535                          struct inode *inode,
536                          u32 *logical_offset,
537                          u32 clusters_to_add,
538                          int mark_unwritten,
539                          struct buffer_head *fe_bh,
540                          handle_t *handle,
541                          struct ocfs2_alloc_context *data_ac,
542                          struct ocfs2_alloc_context *meta_ac,
543                          enum ocfs2_alloc_restarted *reason_ret)
544 {
545         struct ocfs2_extent_tree et;
546
547         ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
548         return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
549                                            clusters_to_add, mark_unwritten,
550                                            data_ac, meta_ac, reason_ret);
551 }
552
/*
 * Add up to @clusters_to_add clusters to @inode starting at logical
 * cluster @logical_start, optionally marking them unwritten.
 *
 * The function reads the inode block and then loops:
 *   - restart_all: lock the allocators and start a transaction;
 *   - restarted_transaction: reserve quota for the clusters still to be
 *     added, get journal write access to the inode block, and call
 *     ocfs2_add_inode_data().
 * After each attempt the quota for clusters that were not added is
 * released.  If ocfs2_add_inode_data() reports RESTART_TRANS the
 * transaction is extended and the inner loop repeats; if it reports
 * RESTART_META the transaction and allocation contexts are torn down at
 * "leave" and everything restarts from restart_all.
 *
 * Returns 0 on success or a negative error.
 */
553 static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
554                                    u32 clusters_to_add, int mark_unwritten)
555 {
556         int status = 0;
557         int restart_func = 0;
558         int credits;
559         u32 prev_clusters;
560         struct buffer_head *bh = NULL;
561         struct ocfs2_dinode *fe = NULL;
562         handle_t *handle = NULL;
563         struct ocfs2_alloc_context *data_ac = NULL;
564         struct ocfs2_alloc_context *meta_ac = NULL;
565         enum ocfs2_alloc_restarted why = RESTART_NONE;
566         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
567         struct ocfs2_extent_tree et;
568         int did_quota = 0;
569
570         /*
571          * Unwritten extent only exists for file systems which
572          * support holes.
573          */
574         BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
575
576         status = ocfs2_read_inode_block(inode, &bh);
577         if (status < 0) {
578                 mlog_errno(status);
579                 goto leave;
580         }
581         fe = (struct ocfs2_dinode *) bh->b_data;
582
583 restart_all:
            /* On-disk and in-memory cluster counts must agree on every pass. */
584         BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
585
586         ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
587         status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
588                                        &data_ac, &meta_ac);
589         if (status) {
590                 mlog_errno(status);
591                 goto leave;
592         }
593
594         credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
595         handle = ocfs2_start_trans(osb, credits);
596         if (IS_ERR(handle)) {
597                 status = PTR_ERR(handle);
                    /* Reset so the cleanup at "leave" does not commit an ERR_PTR. */
598                 handle = NULL;
599                 mlog_errno(status);
600                 goto leave;
601         }
602
603 restarted_transaction:
604         trace_ocfs2_extend_allocation(
605                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
606                 (unsigned long long)i_size_read(inode),
607                 le32_to_cpu(fe->i_clusters), clusters_to_add,
608                 why, restart_func);
609
            /* Charge quota for everything still to be added; excess is returned below. */
610         status = dquot_alloc_space_nodirty(inode,
611                         ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
612         if (status)
613                 goto leave;
614         did_quota = 1;
615
616         /* reserve a write to the file entry early on - that way if we
617          * run out of credits in the allocation path, we can still
618          * update i_size. */
619         status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
620                                          OCFS2_JOURNAL_ACCESS_WRITE);
621         if (status < 0) {
622                 mlog_errno(status);
623                 goto leave;
624         }
625
            /* Remember the cluster count so we can tell how many were added. */
626         prev_clusters = OCFS2_I(inode)->ip_clusters;
627
628         status = ocfs2_add_inode_data(osb,
629                                       inode,
630                                       &logical_start,
631                                       clusters_to_add,
632                                       mark_unwritten,
633                                       bh,
634                                       handle,
635                                       data_ac,
636                                       meta_ac,
637                                       &why);
            /* -EAGAIN is not fatal here; it is handled via "why" below. -ENOSPC is not logged. */
638         if ((status < 0) && (status != -EAGAIN)) {
639                 if (status != -ENOSPC)
640                         mlog_errno(status);
641                 goto leave;
642         }
643         ocfs2_update_inode_fsync_trans(handle, inode, 1);
644         ocfs2_journal_dirty(handle, bh);
645
646         spin_lock(&OCFS2_I(inode)->ip_lock);
647         clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
648         spin_unlock(&OCFS2_I(inode)->ip_lock);
649         /* Release unused quota reservation */
650         dquot_free_space(inode,
651                         ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
652         did_quota = 0;
653
654         if (why != RESTART_NONE && clusters_to_add) {
655                 if (why == RESTART_META) {
                            /* Full restart: fall through to "leave", which jumps to restart_all. */
656                         restart_func = 1;
657                         status = 0;
658                 } else {
659                         BUG_ON(why != RESTART_TRANS);
660
661                         status = ocfs2_allocate_extend_trans(handle, 1);
662                         if (status < 0) {
663                                 /* handle still has to be committed at
664                                  * this point. */
                                    /* NOTE(review): the original error code is replaced by -ENOMEM here. */
665                                 status = -ENOMEM;
666                                 mlog_errno(status);
667                                 goto leave;
668                         }
669                         goto restarted_transaction;
670                 }
671         }
672
673         trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
674              le32_to_cpu(fe->i_clusters),
675              (unsigned long long)le64_to_cpu(fe->i_size),
676              OCFS2_I(inode)->ip_clusters,
677              (unsigned long long)i_size_read(inode));
678
679 leave:
            /* Undo a quota charge that was made but not yet settled above. */
680         if (status < 0 && did_quota)
681                 dquot_free_space(inode,
682                         ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
683         if (handle) {
684                 ocfs2_commit_trans(osb, handle);
685                 handle = NULL;
686         }
687         if (data_ac) {
688                 ocfs2_free_alloc_context(data_ac);
689                 data_ac = NULL;
690         }
691         if (meta_ac) {
692                 ocfs2_free_alloc_context(meta_ac);
693                 meta_ac = NULL;
694         }
            /* RESTART_META path: everything was released above, so start over. */
695         if ((!status) && restart_func) {
696                 restart_func = 0;
697                 goto restart_all;
698         }
699         brelse(bh);
700         bh = NULL;
701
702         return status;
703 }
704
705 /*
706  * While a write will already be ordering the data, a truncate will not.
707  * Thus, we need to explicitly order the zeroed pages.
708  */
709 static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
710                                                       struct buffer_head *di_bh,
711                                                       loff_t start_byte,
712                                                       loff_t length)
713 {
714         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
715         handle_t *handle = NULL;
716         int ret = 0;
717
718         if (!ocfs2_should_order_data(inode))
719                 goto out;
720
721         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
722         if (IS_ERR(handle)) {
723                 ret = -ENOMEM;
724                 mlog_errno(ret);
725                 goto out;
726         }
727
728         ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
729         if (ret < 0) {
730                 mlog_errno(ret);
731                 goto out;
732         }
733
734         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
735                                       OCFS2_JOURNAL_ACCESS_WRITE);
736         if (ret)
737                 mlog_errno(ret);
738         ocfs2_update_inode_fsync_trans(handle, inode, 1);
739
740 out:
741         if (ret) {
742                 if (!IS_ERR(handle))
743                         ocfs2_commit_trans(osb, handle);
744                 handle = ERR_PTR(ret);
745         }
746         return handle;
747 }
748
749 /* Some parts of this taken from generic_cont_expand, which turned out
750  * to be too fragile to do exactly what we need without us having to
751  * worry about recursive locking in ->write_begin() and ->write_end(). */
752 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
753                                  u64 abs_to, struct buffer_head *di_bh)
754 {
755         struct address_space *mapping = inode->i_mapping;
756         struct page *page;
757         unsigned long index = abs_from >> PAGE_SHIFT;
758         handle_t *handle;
759         int ret = 0;
760         unsigned zero_from, zero_to, block_start, block_end;
761         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
762
763         BUG_ON(abs_from >= abs_to);
764         BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
765         BUG_ON(abs_from & (inode->i_blkbits - 1));
766
767         handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
768                                                       abs_from,
769                                                       abs_to - abs_from);
770         if (IS_ERR(handle)) {
771                 ret = PTR_ERR(handle);
772                 goto out;
773         }
774
775         page = find_or_create_page(mapping, index, GFP_NOFS);
776         if (!page) {
777                 ret = -ENOMEM;
778                 mlog_errno(ret);
779                 goto out_commit_trans;
780         }
781
782         /* Get the offsets within the page that we want to zero */
783         zero_from = abs_from & (PAGE_SIZE - 1);
784         zero_to = abs_to & (PAGE_SIZE - 1);
785         if (!zero_to)
786                 zero_to = PAGE_SIZE;
787
788         trace_ocfs2_write_zero_page(
789                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
790                         (unsigned long long)abs_from,
791                         (unsigned long long)abs_to,
792                         index, zero_from, zero_to);
793
794         /* We know that zero_from is block aligned */
795         for (block_start = zero_from; block_start < zero_to;
796              block_start = block_end) {
797                 block_end = block_start + i_blocksize(inode);
798
799                 /*
800                  * block_start is block-aligned.  Bump it by one to force
801                  * __block_write_begin and block_commit_write to zero the
802                  * whole block.
803                  */
804                 ret = __block_write_begin(page, block_start + 1, 0,
805                                           ocfs2_get_block);
806                 if (ret < 0) {
807                         mlog_errno(ret);
808                         goto out_unlock;
809                 }
810
811
812                 /* must not update i_size! */
813                 block_commit_write(page, block_start + 1, block_start + 1);
814         }
815
816         /*
817          * fs-writeback will release the dirty pages without page lock
818          * whose offset are over inode size, the release happens at
819          * block_write_full_page().
820          */
821         i_size_write(inode, abs_to);
822         inode->i_blocks = ocfs2_inode_sector_count(inode);
823         di->i_size = cpu_to_le64((u64)i_size_read(inode));
824         inode->i_mtime = inode_set_ctime_current(inode);
825         di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
826         di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
827         di->i_mtime_nsec = di->i_ctime_nsec;
828         if (handle) {
829                 ocfs2_journal_dirty(handle, di_bh);
830                 ocfs2_update_inode_fsync_trans(handle, inode, 1);
831         }
832
833 out_unlock:
834         unlock_page(page);
835         put_page(page);
836 out_commit_trans:
837         if (handle)
838                 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
839 out:
840         return ret;
841 }
842
843 /*
844  * Find the next range to zero.  We do this in terms of bytes because
845  * that's what ocfs2_zero_extend() wants, and it is dealing with the
846  * pagecache.  We may return multiple extents.
847  *
848  * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
849  * needs to be zeroed.  range_start and range_end return the next zeroing
850  * range.  A subsequent call should pass the previous range_end as its
851  * zero_start.  If range_end is 0, there's nothing to do.
852  *
853  * Unwritten extents are skipped over.  Refcounted extents are CoWd.
854  */
855 static int ocfs2_zero_extend_get_range(struct inode *inode,
856                                        struct buffer_head *di_bh,
857                                        u64 zero_start, u64 zero_end,
858                                        u64 *range_start, u64 *range_end)
859 {
860         int rc = 0, needs_cow = 0;
861         u32 p_cpos, zero_clusters = 0;
862         u32 zero_cpos =
863                 zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
864         u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
865         unsigned int num_clusters = 0;
866         unsigned int ext_flags = 0;
867
868         while (zero_cpos < last_cpos) {
869                 rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
870                                         &num_clusters, &ext_flags);
871                 if (rc) {
872                         mlog_errno(rc);
873                         goto out;
874                 }
875
876                 if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
877                         zero_clusters = num_clusters;
878                         if (ext_flags & OCFS2_EXT_REFCOUNTED)
879                                 needs_cow = 1;
880                         break;
881                 }
882
883                 zero_cpos += num_clusters;
884         }
885         if (!zero_clusters) {
886                 *range_end = 0;
887                 goto out;
888         }
889
890         while ((zero_cpos + zero_clusters) < last_cpos) {
891                 rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
892                                         &p_cpos, &num_clusters,
893                                         &ext_flags);
894                 if (rc) {
895                         mlog_errno(rc);
896                         goto out;
897                 }
898
899                 if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
900                         break;
901                 if (ext_flags & OCFS2_EXT_REFCOUNTED)
902                         needs_cow = 1;
903                 zero_clusters += num_clusters;
904         }
905         if ((zero_cpos + zero_clusters) > last_cpos)
906                 zero_clusters = last_cpos - zero_cpos;
907
908         if (needs_cow) {
909                 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
910                                         zero_clusters, UINT_MAX);
911                 if (rc) {
912                         mlog_errno(rc);
913                         goto out;
914                 }
915         }
916
917         *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
918         *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
919                                              zero_cpos + zero_clusters);
920
921 out:
922         return rc;
923 }
924
925 /*
926  * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
927  * has made sure that the entire range needs zeroing.
928  */
929 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
930                                    u64 range_end, struct buffer_head *di_bh)
931 {
932         int rc = 0;
933         u64 next_pos;
934         u64 zero_pos = range_start;
935
936         trace_ocfs2_zero_extend_range(
937                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
938                         (unsigned long long)range_start,
939                         (unsigned long long)range_end);
940         BUG_ON(range_start >= range_end);
941
942         while (zero_pos < range_end) {
943                 next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
944                 if (next_pos > range_end)
945                         next_pos = range_end;
946                 rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
947                 if (rc < 0) {
948                         mlog_errno(rc);
949                         break;
950                 }
951                 zero_pos = next_pos;
952
953                 /*
954                  * Very large extends have the potential to lock up
955                  * the cpu for extended periods of time.
956                  */
957                 cond_resched();
958         }
959
960         return rc;
961 }
962
963 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
964                       loff_t zero_to_size)
965 {
966         int ret = 0;
967         u64 zero_start, range_start = 0, range_end = 0;
968         struct super_block *sb = inode->i_sb;
969
970         zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
971         trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
972                                 (unsigned long long)zero_start,
973                                 (unsigned long long)i_size_read(inode));
974         while (zero_start < zero_to_size) {
975                 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
976                                                   zero_to_size,
977                                                   &range_start,
978                                                   &range_end);
979                 if (ret) {
980                         mlog_errno(ret);
981                         break;
982                 }
983                 if (!range_end)
984                         break;
985                 /* Trim the ends */
986                 if (range_start < zero_start)
987                         range_start = zero_start;
988                 if (range_end > zero_to_size)
989                         range_end = zero_to_size;
990
991                 ret = ocfs2_zero_extend_range(inode, range_start,
992                                               range_end, di_bh);
993                 if (ret) {
994                         mlog_errno(ret);
995                         break;
996                 }
997                 zero_start = range_end;
998         }
999
1000         return ret;
1001 }
1002
1003 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
1004                           u64 new_i_size, u64 zero_to)
1005 {
1006         int ret;
1007         u32 clusters_to_add;
1008         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1009
1010         /*
1011          * Only quota files call this without a bh, and they can't be
1012          * refcounted.
1013          */
1014         BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
1015         BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1016
1017         clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
1018         if (clusters_to_add < oi->ip_clusters)
1019                 clusters_to_add = 0;
1020         else
1021                 clusters_to_add -= oi->ip_clusters;
1022
1023         if (clusters_to_add) {
1024                 ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
1025                                               clusters_to_add, 0);
1026                 if (ret) {
1027                         mlog_errno(ret);
1028                         goto out;
1029                 }
1030         }
1031
1032         /*
1033          * Call this even if we don't add any clusters to the tree. We
1034          * still need to zero the area between the old i_size and the
1035          * new i_size.
1036          */
1037         ret = ocfs2_zero_extend(inode, di_bh, zero_to);
1038         if (ret < 0)
1039                 mlog_errno(ret);
1040
1041 out:
1042         return ret;
1043 }
1044
1045 static int ocfs2_extend_file(struct inode *inode,
1046                              struct buffer_head *di_bh,
1047                              u64 new_i_size)
1048 {
1049         int ret = 0;
1050         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1051
1052         BUG_ON(!di_bh);
1053
1054         /* setattr sometimes calls us like this. */
1055         if (new_i_size == 0)
1056                 goto out;
1057
1058         if (i_size_read(inode) == new_i_size)
1059                 goto out;
1060         BUG_ON(new_i_size < i_size_read(inode));
1061
1062         /*
1063          * The alloc sem blocks people in read/write from reading our
1064          * allocation until we're done changing it. We depend on
1065          * i_rwsem to block other extend/truncate calls while we're
1066          * here.  We even have to hold it for sparse files because there
1067          * might be some tail zeroing.
1068          */
1069         down_write(&oi->ip_alloc_sem);
1070
1071         if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1072                 /*
1073                  * We can optimize small extends by keeping the inodes
1074                  * inline data.
1075                  */
1076                 if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
1077                         up_write(&oi->ip_alloc_sem);
1078                         goto out_update_size;
1079                 }
1080
1081                 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1082                 if (ret) {
1083                         up_write(&oi->ip_alloc_sem);
1084                         mlog_errno(ret);
1085                         goto out;
1086                 }
1087         }
1088
1089         if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1090                 ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1091         else
1092                 ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1093                                             new_i_size);
1094
1095         up_write(&oi->ip_alloc_sem);
1096
1097         if (ret < 0) {
1098                 mlog_errno(ret);
1099                 goto out;
1100         }
1101
1102 out_update_size:
1103         ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1104         if (ret < 0)
1105                 mlog_errno(ret);
1106
1107 out:
1108         return ret;
1109 }
1110
1111 int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
1112                   struct iattr *attr)
1113 {
1114         int status = 0, size_change;
1115         int inode_locked = 0;
1116         struct inode *inode = d_inode(dentry);
1117         struct super_block *sb = inode->i_sb;
1118         struct ocfs2_super *osb = OCFS2_SB(sb);
1119         struct buffer_head *bh = NULL;
1120         handle_t *handle = NULL;
1121         struct dquot *transfer_to[MAXQUOTAS] = { };
1122         int qtype;
1123         int had_lock;
1124         struct ocfs2_lock_holder oh;
1125
1126         trace_ocfs2_setattr(inode, dentry,
1127                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1128                             dentry->d_name.len, dentry->d_name.name,
1129                             attr->ia_valid, attr->ia_mode,
1130                             from_kuid(&init_user_ns, attr->ia_uid),
1131                             from_kgid(&init_user_ns, attr->ia_gid));
1132
1133         /* ensuring we don't even attempt to truncate a symlink */
1134         if (S_ISLNK(inode->i_mode))
1135                 attr->ia_valid &= ~ATTR_SIZE;
1136
1137 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1138                            | ATTR_GID | ATTR_UID | ATTR_MODE)
1139         if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1140                 return 0;
1141
1142         status = setattr_prepare(&nop_mnt_idmap, dentry, attr);
1143         if (status)
1144                 return status;
1145
1146         if (is_quota_modification(&nop_mnt_idmap, inode, attr)) {
1147                 status = dquot_initialize(inode);
1148                 if (status)
1149                         return status;
1150         }
1151         size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1152         if (size_change) {
1153                 /*
1154                  * Here we should wait dio to finish before inode lock
1155                  * to avoid a deadlock between ocfs2_setattr() and
1156                  * ocfs2_dio_end_io_write()
1157                  */
1158                 inode_dio_wait(inode);
1159
1160                 status = ocfs2_rw_lock(inode, 1);
1161                 if (status < 0) {
1162                         mlog_errno(status);
1163                         goto bail;
1164                 }
1165         }
1166
1167         had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
1168         if (had_lock < 0) {
1169                 status = had_lock;
1170                 goto bail_unlock_rw;
1171         } else if (had_lock) {
1172                 /*
1173                  * As far as we know, ocfs2_setattr() could only be the first
1174                  * VFS entry point in the call chain of recursive cluster
1175                  * locking issue.
1176                  *
1177                  * For instance:
1178                  * chmod_common()
1179                  *  notify_change()
1180                  *   ocfs2_setattr()
1181                  *    posix_acl_chmod()
1182                  *     ocfs2_iop_get_acl()
1183                  *
1184                  * But, we're not 100% sure if it's always true, because the
1185                  * ordering of the VFS entry points in the call chain is out
1186                  * of our control. So, we'd better dump the stack here to
1187                  * catch the other cases of recursive locking.
1188                  */
1189                 mlog(ML_ERROR, "Another case of recursive locking:\n");
1190                 dump_stack();
1191         }
1192         inode_locked = 1;
1193
1194         if (size_change) {
1195                 status = inode_newsize_ok(inode, attr->ia_size);
1196                 if (status)
1197                         goto bail_unlock;
1198
1199                 if (i_size_read(inode) >= attr->ia_size) {
1200                         if (ocfs2_should_order_data(inode)) {
1201                                 status = ocfs2_begin_ordered_truncate(inode,
1202                                                                       attr->ia_size);
1203                                 if (status)
1204                                         goto bail_unlock;
1205                         }
1206                         status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1207                 } else
1208                         status = ocfs2_extend_file(inode, bh, attr->ia_size);
1209                 if (status < 0) {
1210                         if (status != -ENOSPC)
1211                                 mlog_errno(status);
1212                         status = -ENOSPC;
1213                         goto bail_unlock;
1214                 }
1215         }
1216
1217         if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
1218             (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
1219                 /*
1220                  * Gather pointers to quota structures so that allocation /
1221                  * freeing of quota structures happens here and not inside
1222                  * dquot_transfer() where we have problems with lock ordering
1223                  */
1224                 if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
1225                     && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1226                     OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1227                         transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
1228                         if (IS_ERR(transfer_to[USRQUOTA])) {
1229                                 status = PTR_ERR(transfer_to[USRQUOTA]);
1230                                 transfer_to[USRQUOTA] = NULL;
1231                                 goto bail_unlock;
1232                         }
1233                 }
1234                 if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
1235                     && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1236                     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1237                         transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
1238                         if (IS_ERR(transfer_to[GRPQUOTA])) {
1239                                 status = PTR_ERR(transfer_to[GRPQUOTA]);
1240                                 transfer_to[GRPQUOTA] = NULL;
1241                                 goto bail_unlock;
1242                         }
1243                 }
1244                 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1245                 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1246                                            2 * ocfs2_quota_trans_credits(sb));
1247                 if (IS_ERR(handle)) {
1248                         status = PTR_ERR(handle);
1249                         mlog_errno(status);
1250                         goto bail_unlock_alloc;
1251                 }
1252                 status = __dquot_transfer(inode, transfer_to);
1253                 if (status < 0)
1254                         goto bail_commit;
1255         } else {
1256                 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1257                 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1258                 if (IS_ERR(handle)) {
1259                         status = PTR_ERR(handle);
1260                         mlog_errno(status);
1261                         goto bail_unlock_alloc;
1262                 }
1263         }
1264
1265         setattr_copy(&nop_mnt_idmap, inode, attr);
1266         mark_inode_dirty(inode);
1267
1268         status = ocfs2_mark_inode_dirty(handle, inode, bh);
1269         if (status < 0)
1270                 mlog_errno(status);
1271
1272 bail_commit:
1273         ocfs2_commit_trans(osb, handle);
1274 bail_unlock_alloc:
1275         up_write(&OCFS2_I(inode)->ip_alloc_sem);
1276 bail_unlock:
1277         if (status && inode_locked) {
1278                 ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
1279                 inode_locked = 0;
1280         }
1281 bail_unlock_rw:
1282         if (size_change)
1283                 ocfs2_rw_unlock(inode, 1);
1284 bail:
1285
1286         /* Release quota pointers in case we acquired them */
1287         for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
1288                 dqput(transfer_to[qtype]);
1289
1290         if (!status && attr->ia_valid & ATTR_MODE) {
1291                 status = ocfs2_acl_chmod(inode, bh);
1292                 if (status < 0)
1293                         mlog_errno(status);
1294         }
1295         if (inode_locked)
1296                 ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
1297
1298         brelse(bh);
1299         return status;
1300 }
1301
1302 int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path,
1303                   struct kstat *stat, u32 request_mask, unsigned int flags)
1304 {
1305         struct inode *inode = d_inode(path->dentry);
1306         struct super_block *sb = path->dentry->d_sb;
1307         struct ocfs2_super *osb = sb->s_fs_info;
1308         int err;
1309
1310         err = ocfs2_inode_revalidate(path->dentry);
1311         if (err) {
1312                 if (err != -ENOENT)
1313                         mlog_errno(err);
1314                 goto bail;
1315         }
1316
1317         generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
1318         /*
1319          * If there is inline data in the inode, the inode will normally not
1320          * have data blocks allocated (it may have an external xattr block).
1321          * Report at least one sector for such files, so tools like tar, rsync,
1322          * others don't incorrectly think the file is completely sparse.
1323          */
1324         if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1325                 stat->blocks += (stat->size + 511)>>9;
1326
1327         /* We set the blksize from the cluster size for performance */
1328         stat->blksize = osb->s_clustersize;
1329
1330 bail:
1331         return err;
1332 }
1333
1334 int ocfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
1335                      int mask)
1336 {
1337         int ret, had_lock;
1338         struct ocfs2_lock_holder oh;
1339
1340         if (mask & MAY_NOT_BLOCK)
1341                 return -ECHILD;
1342
1343         had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
1344         if (had_lock < 0) {
1345                 ret = had_lock;
1346                 goto out;
1347         } else if (had_lock) {
1348                 /* See comments in ocfs2_setattr() for details.
1349                  * The call chain of this case could be:
1350                  * do_sys_open()
1351                  *  may_open()
1352                  *   inode_permission()
1353                  *    ocfs2_permission()
1354                  *     ocfs2_iop_get_acl()
1355                  */
1356                 mlog(ML_ERROR, "Another case of recursive locking:\n");
1357                 dump_stack();
1358         }
1359
1360         ret = generic_permission(&nop_mnt_idmap, inode, mask);
1361
1362         ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
1363 out:
1364         return ret;
1365 }
1366
1367 static int __ocfs2_write_remove_suid(struct inode *inode,
1368                                      struct buffer_head *bh)
1369 {
1370         int ret;
1371         handle_t *handle;
1372         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1373         struct ocfs2_dinode *di;
1374
1375         trace_ocfs2_write_remove_suid(
1376                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1377                         inode->i_mode);
1378
1379         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1380         if (IS_ERR(handle)) {
1381                 ret = PTR_ERR(handle);
1382                 mlog_errno(ret);
1383                 goto out;
1384         }
1385
1386         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1387                                       OCFS2_JOURNAL_ACCESS_WRITE);
1388         if (ret < 0) {
1389                 mlog_errno(ret);
1390                 goto out_trans;
1391         }
1392
1393         inode->i_mode &= ~S_ISUID;
1394         if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1395                 inode->i_mode &= ~S_ISGID;
1396
1397         di = (struct ocfs2_dinode *) bh->b_data;
1398         di->i_mode = cpu_to_le16(inode->i_mode);
1399         ocfs2_update_inode_fsync_trans(handle, inode, 0);
1400
1401         ocfs2_journal_dirty(handle, bh);
1402
1403 out_trans:
1404         ocfs2_commit_trans(osb, handle);
1405 out:
1406         return ret;
1407 }
1408
1409 static int ocfs2_write_remove_suid(struct inode *inode)
1410 {
1411         int ret;
1412         struct buffer_head *bh = NULL;
1413
1414         ret = ocfs2_read_inode_block(inode, &bh);
1415         if (ret < 0) {
1416                 mlog_errno(ret);
1417                 goto out;
1418         }
1419
1420         ret =  __ocfs2_write_remove_suid(inode, bh);
1421 out:
1422         brelse(bh);
1423         return ret;
1424 }
1425
1426 /*
1427  * Allocate enough extents to cover the region starting at byte offset
1428  * start for len bytes. Existing extents are skipped, any extents
1429  * added are marked as "unwritten".
1430  */
1431 static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1432                                             u64 start, u64 len)
1433 {
1434         int ret;
1435         u32 cpos, phys_cpos, clusters, alloc_size;
1436         u64 end = start + len;
1437         struct buffer_head *di_bh = NULL;
1438
1439         if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1440                 ret = ocfs2_read_inode_block(inode, &di_bh);
1441                 if (ret) {
1442                         mlog_errno(ret);
1443                         goto out;
1444                 }
1445
1446                 /*
1447                  * Nothing to do if the requested reservation range
1448                  * fits within the inode.
1449                  */
1450                 if (ocfs2_size_fits_inline_data(di_bh, end))
1451                         goto out;
1452
1453                 ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1454                 if (ret) {
1455                         mlog_errno(ret);
1456                         goto out;
1457                 }
1458         }
1459
1460         /*
1461          * We consider both start and len to be inclusive.
1462          */
1463         cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1464         clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1465         clusters -= cpos;
1466
1467         while (clusters) {
1468                 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1469                                          &alloc_size, NULL);
1470                 if (ret) {
1471                         mlog_errno(ret);
1472                         goto out;
1473                 }
1474
1475                 /*
1476                  * Hole or existing extent len can be arbitrary, so
1477                  * cap it to our own allocation request.
1478                  */
1479                 if (alloc_size > clusters)
1480                         alloc_size = clusters;
1481
1482                 if (phys_cpos) {
1483                         /*
1484                          * We already have an allocation at this
1485                          * region so we can safely skip it.
1486                          */
1487                         goto next;
1488                 }
1489
1490                 ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1491                 if (ret) {
1492                         if (ret != -ENOSPC)
1493                                 mlog_errno(ret);
1494                         goto out;
1495                 }
1496
1497 next:
1498                 cpos += alloc_size;
1499                 clusters -= alloc_size;
1500         }
1501
1502         ret = 0;
1503 out:
1504
1505         brelse(di_bh);
1506         return ret;
1507 }
1508
1509 /*
1510  * Truncate a byte range, avoiding pages within partial clusters. This
1511  * preserves those pages for the zeroing code to write to.
1512  */
1513 static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1514                                          u64 byte_len)
1515 {
1516         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1517         loff_t start, end;
1518         struct address_space *mapping = inode->i_mapping;
1519
1520         start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1521         end = byte_start + byte_len;
1522         end = end & ~(osb->s_clustersize - 1);
1523
1524         if (start < end) {
1525                 unmap_mapping_range(mapping, start, end - start, 0);
1526                 truncate_inode_pages_range(mapping, start, end - 1);
1527         }
1528 }
1529
1530 /*
1531  * zero out partial blocks of one cluster.
1532  *
1533  * start: file offset where zero starts, will be made upper block aligned.
1534  * len: it will be trimmed to the end of current cluster if "start + len"
1535  *      is bigger than it.
1536  */
1537 static int ocfs2_zeroout_partial_cluster(struct inode *inode,
1538                                         u64 start, u64 len)
1539 {
1540         int ret;
1541         u64 start_block, end_block, nr_blocks;
1542         u64 p_block, offset;
1543         u32 cluster, p_cluster, nr_clusters;
1544         struct super_block *sb = inode->i_sb;
1545         u64 end = ocfs2_align_bytes_to_clusters(sb, start);
1546
1547         if (start + len < end)
1548                 end = start + len;
1549
1550         start_block = ocfs2_blocks_for_bytes(sb, start);
1551         end_block = ocfs2_blocks_for_bytes(sb, end);
1552         nr_blocks = end_block - start_block;
1553         if (!nr_blocks)
1554                 return 0;
1555
1556         cluster = ocfs2_bytes_to_clusters(sb, start);
1557         ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
1558                                 &nr_clusters, NULL);
1559         if (ret)
1560                 return ret;
1561         if (!p_cluster)
1562                 return 0;
1563
1564         offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
1565         p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
1566         return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
1567 }
1568
1569 static int ocfs2_zero_partial_clusters(struct inode *inode,
1570                                        u64 start, u64 len)
1571 {
1572         int ret = 0;
1573         u64 tmpend = 0;
1574         u64 end = start + len;
1575         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1576         unsigned int csize = osb->s_clustersize;
1577         handle_t *handle;
1578         loff_t isize = i_size_read(inode);
1579
1580         /*
1581          * The "start" and "end" values are NOT necessarily part of
1582          * the range whose allocation is being deleted. Rather, this
1583          * is what the user passed in with the request. We must zero
1584          * partial clusters here. There's no need to worry about
1585          * physical allocation - the zeroing code knows to skip holes.
1586          */
1587         trace_ocfs2_zero_partial_clusters(
1588                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1589                 (unsigned long long)start, (unsigned long long)end);
1590
1591         /*
1592          * If both edges are on a cluster boundary then there's no
1593          * zeroing required as the region is part of the allocation to
1594          * be truncated.
1595          */
1596         if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1597                 goto out;
1598
1599         /* No page cache for EOF blocks, issue zero out to disk. */
1600         if (end > isize) {
1601                 /*
1602                  * zeroout eof blocks in last cluster starting from
1603                  * "isize" even "start" > "isize" because it is
1604                  * complicated to zeroout just at "start" as "start"
1605                  * may be not aligned with block size, buffer write
1606                  * would be required to do that, but out of eof buffer
1607                  * write is not supported.
1608                  */
1609                 ret = ocfs2_zeroout_partial_cluster(inode, isize,
1610                                         end - isize);
1611                 if (ret) {
1612                         mlog_errno(ret);
1613                         goto out;
1614                 }
1615                 if (start >= isize)
1616                         goto out;
1617                 end = isize;
1618         }
1619         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1620         if (IS_ERR(handle)) {
1621                 ret = PTR_ERR(handle);
1622                 mlog_errno(ret);
1623                 goto out;
1624         }
1625
1626         /*
1627          * If start is on a cluster boundary and end is somewhere in another
1628          * cluster, we have not COWed the cluster starting at start, unless
1629          * end is also within the same cluster. So, in this case, we skip this
1630          * first call to ocfs2_zero_range_for_truncate() truncate and move on
1631          * to the next one.
1632          */
1633         if ((start & (csize - 1)) != 0) {
1634                 /*
1635                  * We want to get the byte offset of the end of the 1st
1636                  * cluster.
1637                  */
1638                 tmpend = (u64)osb->s_clustersize +
1639                         (start & ~(osb->s_clustersize - 1));
1640                 if (tmpend > end)
1641                         tmpend = end;
1642
1643                 trace_ocfs2_zero_partial_clusters_range1(
1644                         (unsigned long long)start,
1645                         (unsigned long long)tmpend);
1646
1647                 ret = ocfs2_zero_range_for_truncate(inode, handle, start,
1648                                                     tmpend);
1649                 if (ret)
1650                         mlog_errno(ret);
1651         }
1652
1653         if (tmpend < end) {
1654                 /*
1655                  * This may make start and end equal, but the zeroing
1656                  * code will skip any work in that case so there's no
1657                  * need to catch it up here.
1658                  */
1659                 start = end & ~(osb->s_clustersize - 1);
1660
1661                 trace_ocfs2_zero_partial_clusters_range2(
1662                         (unsigned long long)start, (unsigned long long)end);
1663
1664                 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1665                 if (ret)
1666                         mlog_errno(ret);
1667         }
1668         ocfs2_update_inode_fsync_trans(handle, inode, 1);
1669
1670         ocfs2_commit_trans(osb, handle);
1671 out:
1672         return ret;
1673 }
1674
1675 static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1676 {
1677         int i;
1678         struct ocfs2_extent_rec *rec = NULL;
1679
1680         for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1681
1682                 rec = &el->l_recs[i];
1683
1684                 if (le32_to_cpu(rec->e_cpos) < pos)
1685                         break;
1686         }
1687
1688         return i;
1689 }
1690
1691 /*
1692  * Helper to calculate the punching pos and length in one run, we handle the
1693  * following three cases in order:
1694  *
1695  * - remove the entire record
1696  * - remove a partial record
1697  * - no record needs to be removed (hole-punching completed)
1698 */
1699 static void ocfs2_calc_trunc_pos(struct inode *inode,
1700                                  struct ocfs2_extent_list *el,
1701                                  struct ocfs2_extent_rec *rec,
1702                                  u32 trunc_start, u32 *trunc_cpos,
1703                                  u32 *trunc_len, u32 *trunc_end,
1704                                  u64 *blkno, int *done)
1705 {
1706         int ret = 0;
1707         u32 coff, range;
1708
1709         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1710
1711         if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1712                 /*
1713                  * remove an entire extent record.
1714                  */
1715                 *trunc_cpos = le32_to_cpu(rec->e_cpos);
1716                 /*
1717                  * Skip holes if any.
1718                  */
1719                 if (range < *trunc_end)
1720                         *trunc_end = range;
1721                 *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1722                 *blkno = le64_to_cpu(rec->e_blkno);
1723                 *trunc_end = le32_to_cpu(rec->e_cpos);
1724         } else if (range > trunc_start) {
1725                 /*
1726                  * remove a partial extent record, which means we're
1727                  * removing the last extent record.
1728                  */
1729                 *trunc_cpos = trunc_start;
1730                 /*
1731                  * skip hole if any.
1732                  */
1733                 if (range < *trunc_end)
1734                         *trunc_end = range;
1735                 *trunc_len = *trunc_end - trunc_start;
1736                 coff = trunc_start - le32_to_cpu(rec->e_cpos);
1737                 *blkno = le64_to_cpu(rec->e_blkno) +
1738                                 ocfs2_clusters_to_blocks(inode->i_sb, coff);
1739                 *trunc_end = trunc_start;
1740         } else {
1741                 /*
1742                  * It may have two following possibilities:
1743                  *
1744                  * - last record has been removed
1745                  * - trunc_start was within a hole
1746                  *
1747                  * both two cases mean the completion of hole punching.
1748                  */
1749                 ret = 1;
1750         }
1751
1752         *done = ret;
1753 }
1754
1755 int ocfs2_remove_inode_range(struct inode *inode,
1756                              struct buffer_head *di_bh, u64 byte_start,
1757                              u64 byte_len)
1758 {
1759         int ret = 0, flags = 0, done = 0, i;
1760         u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1761         u32 cluster_in_el;
1762         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1763         struct ocfs2_cached_dealloc_ctxt dealloc;
1764         struct address_space *mapping = inode->i_mapping;
1765         struct ocfs2_extent_tree et;
1766         struct ocfs2_path *path = NULL;
1767         struct ocfs2_extent_list *el = NULL;
1768         struct ocfs2_extent_rec *rec = NULL;
1769         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1770         u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1771
1772         ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1773         ocfs2_init_dealloc_ctxt(&dealloc);
1774
1775         trace_ocfs2_remove_inode_range(
1776                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1777                         (unsigned long long)byte_start,
1778                         (unsigned long long)byte_len);
1779
1780         if (byte_len == 0)
1781                 return 0;
1782
1783         if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1784                 ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1785                                             byte_start + byte_len, 0);
1786                 if (ret) {
1787                         mlog_errno(ret);
1788                         goto out;
1789                 }
1790                 /*
1791                  * There's no need to get fancy with the page cache
1792                  * truncate of an inline-data inode. We're talking
1793                  * about less than a page here, which will be cached
1794                  * in the dinode buffer anyway.
1795                  */
1796                 unmap_mapping_range(mapping, 0, 0, 0);
1797                 truncate_inode_pages(mapping, 0);
1798                 goto out;
1799         }
1800
1801         /*
1802          * For reflinks, we may need to CoW 2 clusters which might be
1803          * partially zero'd later, if hole's start and end offset were
1804          * within one cluster(means is not exactly aligned to clustersize).
1805          */
1806
1807         if (ocfs2_is_refcount_inode(inode)) {
1808                 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1809                 if (ret) {
1810                         mlog_errno(ret);
1811                         goto out;
1812                 }
1813
1814                 ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1815                 if (ret) {
1816                         mlog_errno(ret);
1817                         goto out;
1818                 }
1819         }
1820
1821         trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1822         trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1823         cluster_in_el = trunc_end;
1824
1825         ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1826         if (ret) {
1827                 mlog_errno(ret);
1828                 goto out;
1829         }
1830
1831         path = ocfs2_new_path_from_et(&et);
1832         if (!path) {
1833                 ret = -ENOMEM;
1834                 mlog_errno(ret);
1835                 goto out;
1836         }
1837
1838         while (trunc_end > trunc_start) {
1839
1840                 ret = ocfs2_find_path(INODE_CACHE(inode), path,
1841                                       cluster_in_el);
1842                 if (ret) {
1843                         mlog_errno(ret);
1844                         goto out;
1845                 }
1846
1847                 el = path_leaf_el(path);
1848
1849                 i = ocfs2_find_rec(el, trunc_end);
1850                 /*
1851                  * Need to go to previous extent block.
1852                  */
1853                 if (i < 0) {
1854                         if (path->p_tree_depth == 0)
1855                                 break;
1856
1857                         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1858                                                             path,
1859                                                             &cluster_in_el);
1860                         if (ret) {
1861                                 mlog_errno(ret);
1862                                 goto out;
1863                         }
1864
1865                         /*
1866                          * We've reached the leftmost extent block,
1867                          * it's safe to leave.
1868                          */
1869                         if (cluster_in_el == 0)
1870                                 break;
1871
1872                         /*
1873                          * The 'pos' searched for previous extent block is
1874                          * always one cluster less than actual trunc_end.
1875                          */
1876                         trunc_end = cluster_in_el + 1;
1877
1878                         ocfs2_reinit_path(path, 1);
1879
1880                         continue;
1881
1882                 } else
1883                         rec = &el->l_recs[i];
1884
1885                 ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1886                                      &trunc_len, &trunc_end, &blkno, &done);
1887                 if (done)
1888                         break;
1889
1890                 flags = rec->e_flags;
1891                 phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1892
1893                 ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1894                                                phys_cpos, trunc_len, flags,
1895                                                &dealloc, refcount_loc, false);
1896                 if (ret < 0) {
1897                         mlog_errno(ret);
1898                         goto out;
1899                 }
1900
1901                 cluster_in_el = trunc_end;
1902
1903                 ocfs2_reinit_path(path, 1);
1904         }
1905
1906         ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1907
1908 out:
1909         ocfs2_free_path(path);
1910         ocfs2_schedule_truncate_log_flush(osb, 1);
1911         ocfs2_run_deallocs(osb, &dealloc);
1912
1913         return ret;
1914 }
1915
1916 /*
1917  * Parts of this function taken from xfs_change_file_space()
1918  */
1919 static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1920                                      loff_t f_pos, unsigned int cmd,
1921                                      struct ocfs2_space_resv *sr,
1922                                      int change_size)
1923 {
1924         int ret;
1925         s64 llen;
1926         loff_t size, orig_isize;
1927         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1928         struct buffer_head *di_bh = NULL;
1929         handle_t *handle;
1930         unsigned long long max_off = inode->i_sb->s_maxbytes;
1931
1932         if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1933                 return -EROFS;
1934
1935         inode_lock(inode);
1936
1937         /*
1938          * This prevents concurrent writes on other nodes
1939          */
1940         ret = ocfs2_rw_lock(inode, 1);
1941         if (ret) {
1942                 mlog_errno(ret);
1943                 goto out;
1944         }
1945
1946         ret = ocfs2_inode_lock(inode, &di_bh, 1);
1947         if (ret) {
1948                 mlog_errno(ret);
1949                 goto out_rw_unlock;
1950         }
1951
1952         if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1953                 ret = -EPERM;
1954                 goto out_inode_unlock;
1955         }
1956
1957         switch (sr->l_whence) {
1958         case 0: /*SEEK_SET*/
1959                 break;
1960         case 1: /*SEEK_CUR*/
1961                 sr->l_start += f_pos;
1962                 break;
1963         case 2: /*SEEK_END*/
1964                 sr->l_start += i_size_read(inode);
1965                 break;
1966         default:
1967                 ret = -EINVAL;
1968                 goto out_inode_unlock;
1969         }
1970         sr->l_whence = 0;
1971
1972         llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1973
1974         if (sr->l_start < 0
1975             || sr->l_start > max_off
1976             || (sr->l_start + llen) < 0
1977             || (sr->l_start + llen) > max_off) {
1978                 ret = -EINVAL;
1979                 goto out_inode_unlock;
1980         }
1981         size = sr->l_start + sr->l_len;
1982
1983         if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
1984             cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
1985                 if (sr->l_len <= 0) {
1986                         ret = -EINVAL;
1987                         goto out_inode_unlock;
1988                 }
1989         }
1990
1991         if (file && setattr_should_drop_suidgid(&nop_mnt_idmap, file_inode(file))) {
1992                 ret = __ocfs2_write_remove_suid(inode, di_bh);
1993                 if (ret) {
1994                         mlog_errno(ret);
1995                         goto out_inode_unlock;
1996                 }
1997         }
1998
1999         down_write(&OCFS2_I(inode)->ip_alloc_sem);
2000         switch (cmd) {
2001         case OCFS2_IOC_RESVSP:
2002         case OCFS2_IOC_RESVSP64:
2003                 /*
2004                  * This takes unsigned offsets, but the signed ones we
2005                  * pass have been checked against overflow above.
2006                  */
2007                 ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
2008                                                        sr->l_len);
2009                 break;
2010         case OCFS2_IOC_UNRESVSP:
2011         case OCFS2_IOC_UNRESVSP64:
2012                 ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
2013                                                sr->l_len);
2014                 break;
2015         default:
2016                 ret = -EINVAL;
2017         }
2018
2019         orig_isize = i_size_read(inode);
2020         /* zeroout eof blocks in the cluster. */
2021         if (!ret && change_size && orig_isize < size) {
2022                 ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
2023                                         size - orig_isize);
2024                 if (!ret)
2025                         i_size_write(inode, size);
2026         }
2027         up_write(&OCFS2_I(inode)->ip_alloc_sem);
2028         if (ret) {
2029                 mlog_errno(ret);
2030                 goto out_inode_unlock;
2031         }
2032
2033         /*
2034          * We update c/mtime for these changes
2035          */
2036         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
2037         if (IS_ERR(handle)) {
2038                 ret = PTR_ERR(handle);
2039                 mlog_errno(ret);
2040                 goto out_inode_unlock;
2041         }
2042
2043         inode->i_mtime = inode_set_ctime_current(inode);
2044         ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
2045         if (ret < 0)
2046                 mlog_errno(ret);
2047
2048         if (file && (file->f_flags & O_SYNC))
2049                 handle->h_sync = 1;
2050
2051         ocfs2_commit_trans(osb, handle);
2052
2053 out_inode_unlock:
2054         brelse(di_bh);
2055         ocfs2_inode_unlock(inode, 1);
2056 out_rw_unlock:
2057         ocfs2_rw_unlock(inode, 1);
2058
2059 out:
2060         inode_unlock(inode);
2061         return ret;
2062 }
2063
2064 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
2065                             struct ocfs2_space_resv *sr)
2066 {
2067         struct inode *inode = file_inode(file);
2068         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2069         int ret;
2070
2071         if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
2072             !ocfs2_writes_unwritten_extents(osb))
2073                 return -ENOTTY;
2074         else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
2075                  !ocfs2_sparse_alloc(osb))
2076                 return -ENOTTY;
2077
2078         if (!S_ISREG(inode->i_mode))
2079                 return -EINVAL;
2080
2081         if (!(file->f_mode & FMODE_WRITE))
2082                 return -EBADF;
2083
2084         ret = mnt_want_write_file(file);
2085         if (ret)
2086                 return ret;
2087         ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
2088         mnt_drop_write_file(file);
2089         return ret;
2090 }
2091
2092 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
2093                             loff_t len)
2094 {
2095         struct inode *inode = file_inode(file);
2096         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2097         struct ocfs2_space_resv sr;
2098         int change_size = 1;
2099         int cmd = OCFS2_IOC_RESVSP64;
2100         int ret = 0;
2101
2102         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2103                 return -EOPNOTSUPP;
2104         if (!ocfs2_writes_unwritten_extents(osb))
2105                 return -EOPNOTSUPP;
2106
2107         if (mode & FALLOC_FL_KEEP_SIZE) {
2108                 change_size = 0;
2109         } else {
2110                 ret = inode_newsize_ok(inode, offset + len);
2111                 if (ret)
2112                         return ret;
2113         }
2114
2115         if (mode & FALLOC_FL_PUNCH_HOLE)
2116                 cmd = OCFS2_IOC_UNRESVSP64;
2117
2118         sr.l_whence = 0;
2119         sr.l_start = (s64)offset;
2120         sr.l_len = (s64)len;
2121
2122         return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2123                                          change_size);
2124 }
2125
2126 int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2127                                    size_t count)
2128 {
2129         int ret = 0;
2130         unsigned int extent_flags;
2131         u32 cpos, clusters, extent_len, phys_cpos;
2132         struct super_block *sb = inode->i_sb;
2133
2134         if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2135             !ocfs2_is_refcount_inode(inode) ||
2136             OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2137                 return 0;
2138
2139         cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2140         clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2141
2142         while (clusters) {
2143                 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2144                                          &extent_flags);
2145                 if (ret < 0) {
2146                         mlog_errno(ret);
2147                         goto out;
2148                 }
2149
2150                 if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2151                         ret = 1;
2152                         break;
2153                 }
2154
2155                 if (extent_len > clusters)
2156                         extent_len = clusters;
2157
2158                 clusters -= extent_len;
2159                 cpos += extent_len;
2160         }
2161 out:
2162         return ret;
2163 }
2164
2165 static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2166 {
2167         int blockmask = inode->i_sb->s_blocksize - 1;
2168         loff_t final_size = pos + count;
2169
2170         if ((pos & blockmask) || (final_size & blockmask))
2171                 return 1;
2172         return 0;
2173 }
2174
2175 static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
2176                                             struct buffer_head **di_bh,
2177                                             int meta_level,
2178                                             int write_sem,
2179                                             int wait)
2180 {
2181         int ret = 0;
2182
2183         if (wait)
2184                 ret = ocfs2_inode_lock(inode, di_bh, meta_level);
2185         else
2186                 ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
2187         if (ret < 0)
2188                 goto out;
2189
2190         if (wait) {
2191                 if (write_sem)
2192                         down_write(&OCFS2_I(inode)->ip_alloc_sem);
2193                 else
2194                         down_read(&OCFS2_I(inode)->ip_alloc_sem);
2195         } else {
2196                 if (write_sem)
2197                         ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2198                 else
2199                         ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2200
2201                 if (!ret) {
2202                         ret = -EAGAIN;
2203                         goto out_unlock;
2204                 }
2205         }
2206
2207         return ret;
2208
2209 out_unlock:
2210         brelse(*di_bh);
2211         *di_bh = NULL;
2212         ocfs2_inode_unlock(inode, meta_level);
2213 out:
2214         return ret;
2215 }
2216
2217 static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
2218                                                struct buffer_head **di_bh,
2219                                                int meta_level,
2220                                                int write_sem)
2221 {
2222         if (write_sem)
2223                 up_write(&OCFS2_I(inode)->ip_alloc_sem);
2224         else
2225                 up_read(&OCFS2_I(inode)->ip_alloc_sem);
2226
2227         brelse(*di_bh);
2228         *di_bh = NULL;
2229
2230         if (meta_level >= 0)
2231                 ocfs2_inode_unlock(inode, meta_level);
2232 }
2233
2234 static int ocfs2_prepare_inode_for_write(struct file *file,
2235                                          loff_t pos, size_t count, int wait)
2236 {
2237         int ret = 0, meta_level = 0, overwrite_io = 0;
2238         int write_sem = 0;
2239         struct dentry *dentry = file->f_path.dentry;
2240         struct inode *inode = d_inode(dentry);
2241         struct buffer_head *di_bh = NULL;
2242         u32 cpos;
2243         u32 clusters;
2244
2245         /*
2246          * We start with a read level meta lock and only jump to an ex
2247          * if we need to make modifications here.
2248          */
2249         for(;;) {
2250                 ret = ocfs2_inode_lock_for_extent_tree(inode,
2251                                                        &di_bh,
2252                                                        meta_level,
2253                                                        write_sem,
2254                                                        wait);
2255                 if (ret < 0) {
2256                         if (ret != -EAGAIN)
2257                                 mlog_errno(ret);
2258                         goto out;
2259                 }
2260
2261                 /*
2262                  * Check if IO will overwrite allocated blocks in case
2263                  * IOCB_NOWAIT flag is set.
2264                  */
2265                 if (!wait && !overwrite_io) {
2266                         overwrite_io = 1;
2267
2268                         ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
2269                         if (ret < 0) {
2270                                 if (ret != -EAGAIN)
2271                                         mlog_errno(ret);
2272                                 goto out_unlock;
2273                         }
2274                 }
2275
2276                 /* Clear suid / sgid if necessary. We do this here
2277                  * instead of later in the write path because
2278                  * remove_suid() calls ->setattr without any hint that
2279                  * we may have already done our cluster locking. Since
2280                  * ocfs2_setattr() *must* take cluster locks to
2281                  * proceed, this will lead us to recursively lock the
2282                  * inode. There's also the dinode i_size state which
2283                  * can be lost via setattr during extending writes (we
2284                  * set inode->i_size at the end of a write. */
2285                 if (setattr_should_drop_suidgid(&nop_mnt_idmap, inode)) {
2286                         if (meta_level == 0) {
2287                                 ocfs2_inode_unlock_for_extent_tree(inode,
2288                                                                    &di_bh,
2289                                                                    meta_level,
2290                                                                    write_sem);
2291                                 meta_level = 1;
2292                                 continue;
2293                         }
2294
2295                         ret = ocfs2_write_remove_suid(inode);
2296                         if (ret < 0) {
2297                                 mlog_errno(ret);
2298                                 goto out_unlock;
2299                         }
2300                 }
2301
2302                 ret = ocfs2_check_range_for_refcount(inode, pos, count);
2303                 if (ret == 1) {
2304                         ocfs2_inode_unlock_for_extent_tree(inode,
2305                                                            &di_bh,
2306                                                            meta_level,
2307                                                            write_sem);
2308                         meta_level = 1;
2309                         write_sem = 1;
2310                         ret = ocfs2_inode_lock_for_extent_tree(inode,
2311                                                                &di_bh,
2312                                                                meta_level,
2313                                                                write_sem,
2314                                                                wait);
2315                         if (ret < 0) {
2316                                 if (ret != -EAGAIN)
2317                                         mlog_errno(ret);
2318                                 goto out;
2319                         }
2320
2321                         cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2322                         clusters =
2323                                 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2324                         ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
2325                 }
2326
2327                 if (ret < 0) {
2328                         if (ret != -EAGAIN)
2329                                 mlog_errno(ret);
2330                         goto out_unlock;
2331                 }
2332
2333                 break;
2334         }
2335
2336 out_unlock:
2337         trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2338                                             pos, count, wait);
2339
2340         ocfs2_inode_unlock_for_extent_tree(inode,
2341                                            &di_bh,
2342                                            meta_level,
2343                                            write_sem);
2344
2345 out:
2346         return ret;
2347 }
2348
2349 static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2350                                     struct iov_iter *from)
2351 {
2352         int rw_level;
2353         ssize_t written = 0;
2354         ssize_t ret;
2355         size_t count = iov_iter_count(from);
2356         struct file *file = iocb->ki_filp;
2357         struct inode *inode = file_inode(file);
2358         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2359         int full_coherency = !(osb->s_mount_opt &
2360                                OCFS2_MOUNT_COHERENCY_BUFFERED);
2361         void *saved_ki_complete = NULL;
2362         int append_write = ((iocb->ki_pos + count) >=
2363                         i_size_read(inode) ? 1 : 0);
2364         int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2365         int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2366
2367         trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
2368                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2369                 file->f_path.dentry->d_name.len,
2370                 file->f_path.dentry->d_name.name,
2371                 (unsigned int)from->nr_segs);   /* GRRRRR */
2372
2373         if (!direct_io && nowait)
2374                 return -EOPNOTSUPP;
2375
2376         if (count == 0)
2377                 return 0;
2378
2379         if (nowait) {
2380                 if (!inode_trylock(inode))
2381                         return -EAGAIN;
2382         } else
2383                 inode_lock(inode);
2384
2385         /*
2386          * Concurrent O_DIRECT writes are allowed with
2387          * mount_option "coherency=buffered".
2388          * For append write, we must take rw EX.
2389          */
2390         rw_level = (!direct_io || full_coherency || append_write);
2391
2392         if (nowait)
2393                 ret = ocfs2_try_rw_lock(inode, rw_level);
2394         else
2395                 ret = ocfs2_rw_lock(inode, rw_level);
2396         if (ret < 0) {
2397                 if (ret != -EAGAIN)
2398                         mlog_errno(ret);
2399                 goto out_mutex;
2400         }
2401
2402         /*
2403          * O_DIRECT writes with "coherency=full" need to take EX cluster
2404          * inode_lock to guarantee coherency.
2405          */
2406         if (direct_io && full_coherency) {
2407                 /*
2408                  * We need to take and drop the inode lock to force
2409                  * other nodes to drop their caches.  Buffered I/O
2410                  * already does this in write_begin().
2411                  */
2412                 if (nowait)
2413                         ret = ocfs2_try_inode_lock(inode, NULL, 1);
2414                 else
2415                         ret = ocfs2_inode_lock(inode, NULL, 1);
2416                 if (ret < 0) {
2417                         if (ret != -EAGAIN)
2418                                 mlog_errno(ret);
2419                         goto out;
2420                 }
2421
2422                 ocfs2_inode_unlock(inode, 1);
2423         }
2424
2425         ret = generic_write_checks(iocb, from);
2426         if (ret <= 0) {
2427                 if (ret)
2428                         mlog_errno(ret);
2429                 goto out;
2430         }
2431         count = ret;
2432
2433         ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
2434         if (ret < 0) {
2435                 if (ret != -EAGAIN)
2436                         mlog_errno(ret);
2437                 goto out;
2438         }
2439
2440         if (direct_io && !is_sync_kiocb(iocb) &&
2441             ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
2442                 /*
2443                  * Make it a sync io if it's an unaligned aio.
2444                  */
2445                 saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2446         }
2447
2448         /* communicate with ocfs2_dio_end_io */
2449         ocfs2_iocb_set_rw_locked(iocb, rw_level);
2450
2451         written = __generic_file_write_iter(iocb, from);
2452         /* buffered aio wouldn't have proper lock coverage today */
2453         BUG_ON(written == -EIOCBQUEUED && !direct_io);
2454
2455         /*
2456          * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2457          * function pointer which is called when o_direct io completes so that
2458          * it can unlock our rw lock.
2459          * Unfortunately there are error cases which call end_io and others
2460          * that don't.  so we don't have to unlock the rw_lock if either an
2461          * async dio is going to do it in the future or an end_io after an
2462          * error has already done it.
2463          */
2464         if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2465                 rw_level = -1;
2466         }
2467
2468         if (unlikely(written <= 0))
2469                 goto out;
2470
2471         if (((file->f_flags & O_DSYNC) && !direct_io) ||
2472             IS_SYNC(inode)) {
2473                 ret = filemap_fdatawrite_range(file->f_mapping,
2474                                                iocb->ki_pos - written,
2475                                                iocb->ki_pos - 1);
2476                 if (ret < 0)
2477                         written = ret;
2478
2479                 if (!ret) {
2480                         ret = jbd2_journal_force_commit(osb->journal->j_journal);
2481                         if (ret < 0)
2482                                 written = ret;
2483                 }
2484
2485                 if (!ret)
2486                         ret = filemap_fdatawait_range(file->f_mapping,
2487                                                       iocb->ki_pos - written,
2488                                                       iocb->ki_pos - 1);
2489         }
2490
2491 out:
2492         if (saved_ki_complete)
2493                 xchg(&iocb->ki_complete, saved_ki_complete);
2494
2495         if (rw_level != -1)
2496                 ocfs2_rw_unlock(inode, rw_level);
2497
2498 out_mutex:
2499         inode_unlock(inode);
2500
2501         if (written)
2502                 ret = written;
2503         return ret;
2504 }
2505
2506 static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2507                                    struct iov_iter *to)
2508 {
2509         int ret = 0, rw_level = -1, lock_level = 0;
2510         struct file *filp = iocb->ki_filp;
2511         struct inode *inode = file_inode(filp);
2512         int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2513         int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2514
2515         trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
2516                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
2517                         filp->f_path.dentry->d_name.len,
2518                         filp->f_path.dentry->d_name.name,
2519                         to->nr_segs);   /* GRRRRR */
2520
2521
2522         if (!inode) {
2523                 ret = -EINVAL;
2524                 mlog_errno(ret);
2525                 goto bail;
2526         }
2527
2528         if (!direct_io && nowait)
2529                 return -EOPNOTSUPP;
2530
2531         /*
2532          * buffered reads protect themselves in ->read_folio().  O_DIRECT reads
2533          * need locks to protect pending reads from racing with truncate.
2534          */
2535         if (direct_io) {
2536                 if (nowait)
2537                         ret = ocfs2_try_rw_lock(inode, 0);
2538                 else
2539                         ret = ocfs2_rw_lock(inode, 0);
2540
2541                 if (ret < 0) {
2542                         if (ret != -EAGAIN)
2543                                 mlog_errno(ret);
2544                         goto bail;
2545                 }
2546                 rw_level = 0;
2547                 /* communicate with ocfs2_dio_end_io */
2548                 ocfs2_iocb_set_rw_locked(iocb, rw_level);
2549         }
2550
2551         /*
2552          * We're fine letting folks race truncates and extending
2553          * writes with read across the cluster, just like they can
2554          * locally. Hence no rw_lock during read.
2555          *
2556          * Take and drop the meta data lock to update inode fields
2557          * like i_size. This allows the checks down below
2558          * copy_splice_read() a chance of actually working.
2559          */
2560         ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
2561                                      !nowait);
2562         if (ret < 0) {
2563                 if (ret != -EAGAIN)
2564                         mlog_errno(ret);
2565                 goto bail;
2566         }
2567         ocfs2_inode_unlock(inode, lock_level);
2568
2569         ret = generic_file_read_iter(iocb, to);
2570         trace_generic_file_read_iter_ret(ret);
2571
2572         /* buffered aio wouldn't have proper lock coverage today */
2573         BUG_ON(ret == -EIOCBQUEUED && !direct_io);
2574
2575         /* see ocfs2_file_write_iter */
2576         if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2577                 rw_level = -1;
2578         }
2579
2580 bail:
2581         if (rw_level != -1)
2582                 ocfs2_rw_unlock(inode, rw_level);
2583
2584         return ret;
2585 }
2586
2587 static ssize_t ocfs2_file_splice_read(struct file *in, loff_t *ppos,
2588                                       struct pipe_inode_info *pipe,
2589                                       size_t len, unsigned int flags)
2590 {
2591         struct inode *inode = file_inode(in);
2592         ssize_t ret = 0;
2593         int lock_level = 0;
2594
2595         trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
2596                                      (unsigned long long)OCFS2_I(inode)->ip_blkno,
2597                                      in->f_path.dentry->d_name.len,
2598                                      in->f_path.dentry->d_name.name,
2599                                      flags);
2600
2601         /*
2602          * We're fine letting folks race truncates and extending writes with
2603          * read across the cluster, just like they can locally.  Hence no
2604          * rw_lock during read.
2605          *
2606          * Take and drop the meta data lock to update inode fields like i_size.
2607          * This allows the checks down below filemap_splice_read() a chance of
2608          * actually working.
2609          */
2610         ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level, 1);
2611         if (ret < 0) {
2612                 if (ret != -EAGAIN)
2613                         mlog_errno(ret);
2614                 goto bail;
2615         }
2616         ocfs2_inode_unlock(inode, lock_level);
2617
2618         ret = filemap_splice_read(in, ppos, pipe, len, flags);
2619         trace_filemap_splice_read_ret(ret);
2620 bail:
2621         return ret;
2622 }
2623
2624 /* Refer generic_file_llseek_unlocked() */
2625 static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2626 {
2627         struct inode *inode = file->f_mapping->host;
2628         int ret = 0;
2629
2630         inode_lock(inode);
2631
2632         switch (whence) {
2633         case SEEK_SET:
2634                 break;
2635         case SEEK_END:
2636                 /* SEEK_END requires the OCFS2 inode lock for the file
2637                  * because it references the file's size.
2638                  */
2639                 ret = ocfs2_inode_lock(inode, NULL, 0);
2640                 if (ret < 0) {
2641                         mlog_errno(ret);
2642                         goto out;
2643                 }
2644                 offset += i_size_read(inode);
2645                 ocfs2_inode_unlock(inode, 0);
2646                 break;
2647         case SEEK_CUR:
2648                 if (offset == 0) {
2649                         offset = file->f_pos;
2650                         goto out;
2651                 }
2652                 offset += file->f_pos;
2653                 break;
2654         case SEEK_DATA:
2655         case SEEK_HOLE:
2656                 ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2657                 if (ret)
2658                         goto out;
2659                 break;
2660         default:
2661                 ret = -EINVAL;
2662                 goto out;
2663         }
2664
2665         offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2666
2667 out:
2668         inode_unlock(inode);
2669         if (ret)
2670                 return ret;
2671         return offset;
2672 }
2673
2674 static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
2675                                      struct file *file_out, loff_t pos_out,
2676                                      loff_t len, unsigned int remap_flags)
2677 {
2678         struct inode *inode_in = file_inode(file_in);
2679         struct inode *inode_out = file_inode(file_out);
2680         struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
2681         struct buffer_head *in_bh = NULL, *out_bh = NULL;
2682         bool same_inode = (inode_in == inode_out);
2683         loff_t remapped = 0;
2684         ssize_t ret;
2685
2686         if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
2687                 return -EINVAL;
2688         if (!ocfs2_refcount_tree(osb))
2689                 return -EOPNOTSUPP;
2690         if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
2691                 return -EROFS;
2692
2693         /* Lock both files against IO */
2694         ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
2695         if (ret)
2696                 return ret;
2697
2698         /* Check file eligibility and prepare for block sharing. */
2699         ret = -EINVAL;
2700         if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
2701             (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
2702                 goto out_unlock;
2703
2704         ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
2705                         &len, remap_flags);
2706         if (ret < 0 || len == 0)
2707                 goto out_unlock;
2708
2709         /* Lock out changes to the allocation maps and remap. */
2710         down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2711         if (!same_inode)
2712                 down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
2713                                   SINGLE_DEPTH_NESTING);
2714
2715         /* Zap any page cache for the destination file's range. */
2716         truncate_inode_pages_range(&inode_out->i_data,
2717                                    round_down(pos_out, PAGE_SIZE),
2718                                    round_up(pos_out + len, PAGE_SIZE) - 1);
2719
2720         remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
2721                         inode_out, out_bh, pos_out, len);
2722         up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2723         if (!same_inode)
2724                 up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
2725         if (remapped < 0) {
2726                 ret = remapped;
2727                 mlog_errno(ret);
2728                 goto out_unlock;
2729         }
2730
2731         /*
2732          * Empty the extent map so that we may get the right extent
2733          * record from the disk.
2734          */
2735         ocfs2_extent_map_trunc(inode_in, 0);
2736         ocfs2_extent_map_trunc(inode_out, 0);
2737
2738         ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
2739         if (ret) {
2740                 mlog_errno(ret);
2741                 goto out_unlock;
2742         }
2743
2744 out_unlock:
2745         ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
2746         return remapped > 0 ? remapped : ret;
2747 }
2748
2749 const struct inode_operations ocfs2_file_iops = {
2750         .setattr        = ocfs2_setattr,
2751         .getattr        = ocfs2_getattr,
2752         .permission     = ocfs2_permission,
2753         .listxattr      = ocfs2_listxattr,
2754         .fiemap         = ocfs2_fiemap,
2755         .get_inode_acl  = ocfs2_iop_get_acl,
2756         .set_acl        = ocfs2_iop_set_acl,
2757         .fileattr_get   = ocfs2_fileattr_get,
2758         .fileattr_set   = ocfs2_fileattr_set,
2759 };
2760
2761 const struct inode_operations ocfs2_special_file_iops = {
2762         .setattr        = ocfs2_setattr,
2763         .getattr        = ocfs2_getattr,
2764         .permission     = ocfs2_permission,
2765         .get_inode_acl  = ocfs2_iop_get_acl,
2766         .set_acl        = ocfs2_iop_set_acl,
2767 };
2768
2769 /*
2770  * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2771  * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2772  */
2773 const struct file_operations ocfs2_fops = {
2774         .llseek         = ocfs2_file_llseek,
2775         .mmap           = ocfs2_mmap,
2776         .fsync          = ocfs2_sync_file,
2777         .release        = ocfs2_file_release,
2778         .open           = ocfs2_file_open,
2779         .read_iter      = ocfs2_file_read_iter,
2780         .write_iter     = ocfs2_file_write_iter,
2781         .unlocked_ioctl = ocfs2_ioctl,
2782 #ifdef CONFIG_COMPAT
2783         .compat_ioctl   = ocfs2_compat_ioctl,
2784 #endif
2785         .lock           = ocfs2_lock,
2786         .flock          = ocfs2_flock,
2787         .splice_read    = ocfs2_file_splice_read,
2788         .splice_write   = iter_file_splice_write,
2789         .fallocate      = ocfs2_fallocate,
2790         .remap_file_range = ocfs2_remap_file_range,
2791 };
2792
2793 WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
2794 const struct file_operations ocfs2_dops = {
2795         .llseek         = generic_file_llseek,
2796         .read           = generic_read_dir,
2797         .iterate_shared = shared_ocfs2_readdir,
2798         .fsync          = ocfs2_sync_file,
2799         .release        = ocfs2_dir_release,
2800         .open           = ocfs2_dir_open,
2801         .unlocked_ioctl = ocfs2_ioctl,
2802 #ifdef CONFIG_COMPAT
2803         .compat_ioctl   = ocfs2_compat_ioctl,
2804 #endif
2805         .lock           = ocfs2_lock,
2806         .flock          = ocfs2_flock,
2807 };
2808
2809 /*
2810  * POSIX-lockless variants of our file_operations.
2811  *
2812  * These will be used if the underlying cluster stack does not support
2813  * posix file locking, if the user passes the "localflocks" mount
2814  * option, or if we have a local-only fs.
2815  *
2816  * ocfs2_flock is in here because all stacks handle UNIX file locks,
2817  * so we still want it in the case of no stack support for
2818  * plocks. Internally, it will do the right thing when asked to ignore
2819  * the cluster.
2820  */
2821 const struct file_operations ocfs2_fops_no_plocks = {
2822         .llseek         = ocfs2_file_llseek,
2823         .mmap           = ocfs2_mmap,
2824         .fsync          = ocfs2_sync_file,
2825         .release        = ocfs2_file_release,
2826         .open           = ocfs2_file_open,
2827         .read_iter      = ocfs2_file_read_iter,
2828         .write_iter     = ocfs2_file_write_iter,
2829         .unlocked_ioctl = ocfs2_ioctl,
2830 #ifdef CONFIG_COMPAT
2831         .compat_ioctl   = ocfs2_compat_ioctl,
2832 #endif
2833         .flock          = ocfs2_flock,
2834         .splice_read    = filemap_splice_read,
2835         .splice_write   = iter_file_splice_write,
2836         .fallocate      = ocfs2_fallocate,
2837         .remap_file_range = ocfs2_remap_file_range,
2838 };
2839
2840 const struct file_operations ocfs2_dops_no_plocks = {
2841         .llseek         = generic_file_llseek,
2842         .read           = generic_read_dir,
2843         .iterate_shared = shared_ocfs2_readdir,
2844         .fsync          = ocfs2_sync_file,
2845         .release        = ocfs2_dir_release,
2846         .open           = ocfs2_dir_open,
2847         .unlocked_ioctl = ocfs2_ioctl,
2848 #ifdef CONFIG_COMPAT
2849         .compat_ioctl   = ocfs2_compat_ioctl,
2850 #endif
2851         .flock          = ocfs2_flock,
2852 };