unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
atomic_t frontswap_pages; /* frontswap pages in-use counter */
#endif
+ spinlock_t lock; /*
+ * protects map-scan related fields like
+ * swap_map, lowest_bit, highest_bit,
+ * inuse_pages, cluster_next,
+ * cluster_nr, lowest_alloc and
+ * highest_alloc. Other fields are only
+ * changed at swapon/swapoff, so are
+ * protected by swap_lock. Changing
+ * flags requires holding both this lock
+ * and swap_lock; when both are needed,
+ * take swap_lock first.
+ */
};
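
For reference, a minimal sketch of the lock ordering documented above, for a path that must change si->flags (illustrative only, not part of this patch; the helper name is made up):

/* Illustrative only: flags changes need both locks, swap_lock first. */
static void example_clear_writeok(struct swap_info_struct *si)
{
	spin_lock(&swap_lock);		/* global swapon/swapoff-level lock */
	spin_lock(&si->lock);		/* per-partition scan lock */
	si->flags &= ~SWP_WRITEOK;	/* flags may only change under both */
	spin_unlock(&si->lock);
	spin_unlock(&swap_lock);
}
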
struct swap_list_t {
int next; /* swapfile to be used next */
};
-/* Swap 50% full? Release swapcache more aggressively.. */
-#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
-
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern unsigned long totalreserve_pages;
struct vm_area_struct *vma, unsigned long addr);
/* linux/mm/swapfile.c */
-extern long nr_swap_pages;
+extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
+
+/* Swap 50% full? Release swapcache more aggressively.. */
+static inline bool vm_swap_full(void)
+{
+ return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
+}
+
+static inline long get_nr_swap_pages(void)
+{
+ return atomic_long_read(&nr_swap_pages);
+}
+
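
As a usage sketch (the caller name and threshold are hypothetical, not taken from this patch), code that previously read nr_swap_pages directly now goes through the helper and needs no lock:

/* Hypothetical caller: lockless check for nearly-exhausted swap. */
static inline bool swap_nearly_exhausted(void)
{
	return get_nr_swap_pages() < 256;	/* arbitrary threshold */
}
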
extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(void);
extern swp_entry_t get_swap_page_of_type(int);
#else /* CONFIG_SWAP */
-#define nr_swap_pages 0L
+#define get_nr_swap_pages() 0L
#define total_swap_pages 0L
#define total_swapcache_pages() 0UL
+#define vm_swap_full() 0
#define si_swapinfo(val) \
do { (val)->freeswap = (val)->totalswap = 0; } while (0)
DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
-long nr_swap_pages;
+atomic_long_t nr_swap_pages;
+/* protected by swap_lock; reading it in vm_swap_full() needs no lock */
long total_swap_pages;
static int least_priority;
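+/*
+ * Hint for get_swap_page(): the highest-priority swap type that most
+ * recently freed a swap entry, or -1 when there is no hint. Read and
+ * updated locklessly (see set_highest_priority_index()).
+ */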
+static atomic_t highest_priority_index = ATOMIC_INIT(-1);
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
si->lowest_alloc = si->max;
si->highest_alloc = 0;
}
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
/*
* If seek is expensive, start searching for new cluster from
if (si->swap_map[offset])
last_in_cluster = offset + SWAPFILE_CLUSTER;
else if (offset == last_in_cluster) {
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
offset -= SWAPFILE_CLUSTER - 1;
si->cluster_next = offset;
si->cluster_nr = SWAPFILE_CLUSTER - 1;
if (si->swap_map[offset])
last_in_cluster = offset + SWAPFILE_CLUSTER;
else if (offset == last_in_cluster) {
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
offset -= SWAPFILE_CLUSTER - 1;
si->cluster_next = offset;
si->cluster_nr = SWAPFILE_CLUSTER - 1;
}
offset = scan_base;
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
si->cluster_nr = SWAPFILE_CLUSTER - 1;
si->lowest_alloc = 0;
}
/* reuse swap entry of cache-only swap if not busy. */
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed)
goto checks;
si->lowest_alloc <= last_in_cluster)
last_in_cluster = si->lowest_alloc - 1;
si->flags |= SWP_DISCARDING;
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
if (offset < last_in_cluster)
discard_swap_cluster(si, offset,
last_in_cluster - offset + 1);
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
si->lowest_alloc = 0;
si->flags &= ~SWP_DISCARDING;
* could defer that delay until swap_writepage,
* but it's easier to keep this self-contained.
*/
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
wait_for_discard, TASK_UNINTERRUPTIBLE);
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
} else {
/*
* Note pages allocated by racing tasks while
return offset;
scan:
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
while (++offset <= si->highest_bit) {
if (!si->swap_map[offset]) {
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
goto checks;
}
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
goto checks;
}
if (unlikely(--latency_ration < 0)) {
offset = si->lowest_bit;
while (++offset < scan_base) {
if (!si->swap_map[offset]) {
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
goto checks;
}
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
goto checks;
}
if (unlikely(--latency_ration < 0)) {
latency_ration = LATENCY_LIMIT;
}
}
- spin_lock(&swap_lock);
+ spin_lock(&si->lock);
no_page:
si->flags -= SWP_SCANNING;
pgoff_t offset;
int type, next;
int wrapped = 0;
+ int hp_index;
spin_lock(&swap_lock);
- if (nr_swap_pages <= 0)
+ if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
- nr_swap_pages--;
+ atomic_long_dec(&nr_swap_pages);
for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+ hp_index = atomic_xchg(&highest_priority_index, -1);
+ /*
+ * highest_priority_index records the highest-priority swap
+ * type that recently freed swap entries. If its priority is
+ * higher than that of the swap_list.next swap type, use it
+ * instead. It isn't protected by swap_lock, so it may hold a
+ * stale value if the corresponding swap type has been swapped
+ * off, hence the SWP_WRITEOK recheck below. The type could
+ * even have been swapped off and on again with a different
+ * priority. In such a rare case a lower-priority swap type
+ * might be used, but the higher-priority one will win after a
+ * few more rounds of swapping.
+ */
+ if (hp_index != -1 && hp_index != type &&
+ swap_info[type]->prio < swap_info[hp_index]->prio &&
+ (swap_info[hp_index]->flags & SWP_WRITEOK)) {
+ type = hp_index;
+ swap_list.next = type;
+ }
+
si = swap_info[type];
next = si->next;
if (next < 0 ||
wrapped++;
}
- if (!si->highest_bit)
+ spin_lock(&si->lock);
+ if (!si->highest_bit) {
+ spin_unlock(&si->lock);
continue;
- if (!(si->flags & SWP_WRITEOK))
+ }
+ if (!(si->flags & SWP_WRITEOK)) {
+ spin_unlock(&si->lock);
continue;
+ }
swap_list.next = next;
+
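+ /*
+ * scan_swap_map() only needs si->lock, so drop the global
+ * swap_lock and let other CPUs pick their own swap device.
+ */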
+ spin_unlock(&swap_lock);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
- if (offset) {
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
+ if (offset)
return swp_entry(type, offset);
- }
+ spin_lock(&swap_lock);
next = swap_list.next;
}
- nr_swap_pages++;
+ atomic_long_inc(&nr_swap_pages);
noswap:
spin_unlock(&swap_lock);
return (swp_entry_t) {0};
struct swap_info_struct *si;
pgoff_t offset;
- spin_lock(&swap_lock);
si = swap_info[type];
+ spin_lock(&si->lock);
if (si && (si->flags & SWP_WRITEOK)) {
- nr_swap_pages--;
+ atomic_long_dec(&nr_swap_pages);
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, 1);
if (offset) {
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
return swp_entry(type, offset);
}
- nr_swap_pages++;
+ atomic_long_inc(&nr_swap_pages);
}
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
return (swp_entry_t) {0};
}
goto bad_offset;
if (!p->swap_map[offset])
goto bad_free;
- spin_lock(&swap_lock);
+ spin_lock(&p->lock);
return p;
bad_free:
return NULL;
}
+/*
+ * This swap type just freed a swap entry; record it if it is now the
+ * highest-priority type to have recently freed entries. get_swap_page()
+ * uses highest_priority_index to pick the highest-priority swap type.
+ * swap_info_struct.lock can't protect us when multiple swap types are
+ * active, so use atomic_cmpxchg to update the shared index.
+ */
+static void set_highest_priority_index(int type)
+{
+ int old_hp_index, new_hp_index;
+
+ do {
+ old_hp_index = atomic_read(&highest_priority_index);
+ if (old_hp_index != -1 &&
+ swap_info[old_hp_index]->prio >= swap_info[type]->prio)
+ break;
+ new_hp_index = type;
+ } while (atomic_cmpxchg(&highest_priority_index,
+ old_hp_index, new_hp_index) != old_hp_index);
+}
+
static unsigned char swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
p->lowest_bit = offset;
if (offset > p->highest_bit)
p->highest_bit = offset;
- if (swap_list.next >= 0 &&
- p->prio > swap_info[swap_list.next]->prio)
- swap_list.next = p->type;
- nr_swap_pages++;
+ set_highest_priority_index(p->type);
+ atomic_long_inc(&nr_swap_pages);
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
if (p->flags & SWP_BLKDEV) {
p = swap_info_get(entry);
if (p) {
swap_entry_free(p, entry, 1);
- spin_unlock(&swap_lock);
+ spin_unlock(&p->lock);
}
}
count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
if (page)
mem_cgroup_uncharge_swapcache(page, entry, count != 0);
- spin_unlock(&swap_lock);
+ spin_unlock(&p->lock);
}
}
p = swap_info_get(entry);
if (p) {
count = swap_count(p->swap_map[swp_offset(entry)]);
- spin_unlock(&swap_lock);
+ spin_unlock(&p->lock);
}
return count;
}
page = NULL;
}
}
- spin_unlock(&swap_lock);
+ spin_unlock(&p->lock);
}
if (page) {
/*
if ((unsigned int)type < nr_swapfiles) {
struct swap_info_struct *sis = swap_info[type];
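+ /* inuse_pages is now updated under sis->lock, so sample it under that lock */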
+ spin_lock(&sis->lock);
if (sis->flags & SWP_WRITEOK) {
n = sis->pages;
if (free)
n -= sis->inuse_pages;
}
+ spin_unlock(&sis->lock);
}
spin_unlock(&swap_lock);
return n;
p->swap_map = swap_map;
frontswap_map_set(p, frontswap_map);
p->flags |= SWP_WRITEOK;
- nr_swap_pages += p->pages;
+ atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
/* insert swap space into swap_list: */
unsigned long *frontswap_map)
{
spin_lock(&swap_lock);
+ spin_lock(&p->lock);
_enable_swap_info(p, prio, swap_map, frontswap_map);
frontswap_init(p->type);
+ spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
static void reinsert_swap_info(struct swap_info_struct *p)
{
spin_lock(&swap_lock);
+ spin_lock(&p->lock);
_enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
+ spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
/* just pick something that's safe... */
swap_list.next = swap_list.head;
}
+ spin_lock(&p->lock);
if (p->prio < 0) {
for (i = p->next; i >= 0; i = swap_info[i]->next)
swap_info[i]->prio = p->prio--;
least_priority++;
}
- nr_swap_pages -= p->pages;
+ atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
+ spin_unlock(&p->lock);
spin_unlock(&swap_lock);
set_current_oom_origin();
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
+ spin_lock(&p->lock);
drain_mmlist();
/* wait for anyone still in scan_swap_map */
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
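+ /* drop both locks so a concurrent scan_swap_map() can finish */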
+ spin_unlock(&p->lock);
spin_unlock(&swap_lock);
schedule_timeout_uninterruptible(1);
spin_lock(&swap_lock);
+ spin_lock(&p->lock);
}
swap_file = p->swap_file;
p->swap_map = NULL;
p->flags = 0;
frontswap_invalidate_area(type);
+ spin_unlock(&p->lock);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
p->flags = SWP_USED;
p->next = -1;
spin_unlock(&swap_lock);
+ spin_lock_init(&p->lock);
return p;
}
if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
nr_to_be_unused += si->inuse_pages;
}
- val->freeswap = nr_swap_pages + nr_to_be_unused;
+ val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
spin_unlock(&swap_lock);
}
p = swap_info[type];
offset = swp_offset(entry);
- spin_lock(&swap_lock);
+ spin_lock(&p->lock);
if (unlikely(offset >= p->max))
goto unlock_out;
p->swap_map[offset] = count | has_cache;
unlock_out:
- spin_unlock(&swap_lock);
+ spin_unlock(&p->lock);
out:
return err;
}
if (!page) {
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
return -ENOMEM;
}
list_add_tail(&page->lru, &head->lru);
page = NULL; /* now it's attached, don't free it */
out:
- spin_unlock(&swap_lock);
+ spin_unlock(&si->lock);
outer:
if (page)
__free_page(page);