/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Copyright (c) 2001 Lineo Inc., David McCullough <davidm@lineo.com>
 *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> ref uClinux 2.0
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/module.h>

#ifdef CONFIG_SD_CHECKERROR
#include <linux/proc_fs.h>
#endif

/*
 * VM-wide state defined by this file.
 *
 * In the visible part of this file nr_active_pages and nr_inactive_pages
 * are only read (they are printed by the free-area dump routines), and
 * pgdat_list is the head of the node list that show_free_areas() walks via
 * ->node_next.  nr_swap_pages and the two LRU list heads are not referenced
 * here; presumably they are maintained by the swap/vmscan code -- confirm
 * against their users.
 */
int nr_swap_pages;
int nr_active_pages;
int nr_inactive_pages;
LIST_HEAD(inactive_list);
LIST_HEAD(active_list);
pg_data_t *pgdat_list;

/*
 * The zone_table array is used to look up the address of the zone_t
 * that a page belongs to.  free_area_init_core() fills it so that the
 * entry for zone number 'z' (ZONE_DMA, ZONE_NORMAL, or ZONE_HIGHMEM)
 * on node 'nid' lives at index nid * MAX_NR_ZONES + z, which is the same
 * value it stores in each page with set_page_zone().
 */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);

/*
 * Per-zone tables, indexed by zone number.
 *
 * zone_names supplies zone->name; with CONFIG_SD_LARGEMEM the third zone
 * is reported as "LargeMem" instead of "HighMem".
 *
 * The balance tables feed the watermark computation in
 * free_area_init_core(): pages_min is (zone pages / zone_balance_ratio)
 * clamped to [zone_balance_min, zone_balance_max].  The ratios can be
 * overridden from the command line with "memfrac=" (setup_mem_frac()).
 */
#ifdef CONFIG_SD_LARGEMEM
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "LargeMem" };
#else
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
#endif
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };

#ifdef CONFIG_SD_LARGEMEM
// Large Memory support : Use buddy allocator for memory zone 0 and 1,
// and non-power-of-two memory allocator for memory zone 2 which is originally
// for highmem but newly defined as largemem. Most of large memory implementation
// comes from non-power-of-two memory allocator (page_alloc2.c)
                                                                                
/*
 * ALIGN_ORDER(x): alignment order requested from the LargeMem allocator
 * for an order-x allocation.  The allocator only accepts a start page
 * whose address is a multiple of PAGE_SIZE << ALIGN_ORDER(x), so order-1
 * requests are aligned to two pages and every other order to one page.
 * On m68k no extra alignment is ever requested.
 */
#ifdef __mc68000__
#define ALIGN_ORDER(x)    0
#else
#define ALIGN_ORDER(x)    ((x) == 1 ? 1 : 0)
#endif
                                                                                
/*
 * Requests of at most this many pages are "small": the LargeMem allocator
 * starts looking for them near the end of the bitmap rather than at
 * first_usable_page (see __lgmem_get_contiguous_pages()).
 */
#define SMALL_ALLOC_PAGES 2
#endif

/*
 * Temporary debugging check.
 *
 * BAD_RANGE(zone, page) is true when 'page' cannot belong to 'zone':
 * its index in mem_map lies outside
 * [zone_start_mapnr, zone_start_mapnr + size), or the zone recorded in
 * the page itself (page_zone()) is a different one.  Both arguments are
 * evaluated more than once.
 */
#define BAD_RANGE(zone, page)                        \
(                                    \
    (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size))    \
    || (((page) - mem_map) < (zone)->zone_start_mapnr)        \
    || ((zone) != page_zone(page))                    \
)

#ifdef CONFIG_SD_LARGEMEM

/*
 * Return a run of pages to the LargeMem linear (bitmap) allocator.
 *
 * page:         first page of the run; its zone is taken from page_zone().
 * num_adjpages: number of adjacent pages to release.
 *
 * For every page in the run the referenced/dirty flags are dropped, the
 * page's bit in the zone bitmap is cleared, its count is reset to zero and
 * the zone's free-page counter is bumped.  first_usable_page is lowered if
 * a freed page lies below it.  kswapd is woken afterwards if it is waiting.
 *
 * This function is called only when the page reference count is zero.
 *
 * The whole run must lie inside the bitmap: a request whose first page or
 * whose end falls outside it is reported and ignored instead of clearing
 * bits (and touching page structs) beyond the zone.
 */
void lgmem_free_contiguous_pages(struct page *page, unsigned int num_adjpages)
{
    zone_t *zone = page_zone(page);
    linear_free_area_t *pfreearea = &zone->linear_free_area;
    unsigned long addr = (unsigned long) page_address(page);
    unsigned long map_nr = MAP_NR(addr);
    unsigned long flags;
    mem_map_t *p, *ep;

    // adjust map_nr relative to current zone
    // (an address below the zone wraps to a huge value and is rejected below)
    map_nr -= zone->zone_start_mapnr;

    // reject runs that start outside the bitmap or run off its end; the
    // second test is written as a subtraction so it cannot overflow
    if (map_nr >= pfreearea->bit_map_size ||
        num_adjpages > pfreearea->bit_map_size - map_nr) {
        printk("LARGEMEM: Tries to free non-existed page (addr:0x%08lx, %d pages)\n",
            addr, num_adjpages);
        return;
    }

    p = zone->zone_mem_map + map_nr;

    save_flags(flags);
    cli();

    // clear corresponding bits in bit_map, and adjust first_usable_page
    // if freed pages comes before current first_usable_page
    // increase number of free pages
    for (ep = p + num_adjpages; p < ep; p++) {
        p->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
        if (p - zone->zone_mem_map < pfreearea->first_usable_page)
            pfreearea->first_usable_page = p - zone->zone_mem_map;
        clear_bit(p - zone->zone_mem_map, pfreearea->bit_map);
        set_page_count(p, 0);
        pfreearea->nr_free_pages++;
    }
    restore_flags(flags);

    if (waitqueue_active(&kswapd_wait))
        wake_up_interruptible(&kswapd_wait);
}

/*
 * Make 'n' reclaim passes over the LargeMem zone of every node: each pass
 * flags the zone as needing balance and calls try_to_free_pages_zone() on
 * it with GFP_KSWAPD.  Does nothing when called from interrupt context.
 */
static void find_some_memory(int n)
{
    pg_data_t *node;
    int pass = 0;

    /* sorry, you lose */
    if (in_interrupt())
        return;

    while (pass < n) {
        for_each_pgdat(node) {
            zone_t *lgzone = &node->node_zones[ZONE_LARGEMEM];

            lgzone->need_balance = 1;
            try_to_free_pages_zone(lgzone, GFP_KSWAPD);
        }
        pass++;
    }
}

/*
 * Allocate 'num_adjpages' adjacent pages from the LargeMem zone of any
 * node, using that zone's allocation bitmap (a set bit means "in use").
 *
 * gfp_mask:     not consulted by the search itself in this function.
 * num_adjpages: number of contiguous pages wanted (at most 0xffff; larger
 *               successful requests hit BUG()).
 * align_order:  the first page's address must be a multiple of
 *               PAGE_SIZE << align_order.
 *
 * Returns the address (page_address()) of the first page, or 0 on failure.
 *
 * Search strategy:
 *   - "Small" requests (<= SMALL_ALLOC_PAGES) first probe backwards from
 *     the end of the bitmap in 16-page steps, testing the first free page
 *     at or after each probe offset.  Once the probe offset reaches 0 the
 *     search continues as a plain forward scan.
 *   - Other requests scan forward from first_usable_page.
 *
 * The backward probe used to keep subtracting 16 past zero and hand a
 * negative offset to find_next_zero_bit(); the offset is now clamped at 0
 * and the scan falls through to the forward search instead.
 *
 * On failure, unless the caller has PF_MEMALLOC set, find_some_memory() is
 * asked to reclaim and the search is retried a bounded number of times.
 * Interrupts are disabled (cli) for the duration of each search.
 *
 * NOTE(review): set_page_zone() is given plain ZONE_LARGEMEM, whereas
 * free_area_init_core() stores nid * MAX_NR_ZONES + zone -- the two agree
 * only for node 0; confirm this allocator is single-node only.
 */
unsigned long
__lgmem_get_contiguous_pages(
    unsigned int gfp_mask,
    unsigned long num_adjpages,
    unsigned int align_order)
{
    unsigned long flags;
    mem_map_t *p;
    int repeats = 0;
    pg_data_t *pgdat;
    zone_t *zone;
    linear_free_area_t *pfreearea;

    save_flags(flags);

    if (waitqueue_active(&kswapd_wait))
        wake_up_interruptible(&kswapd_wait);

repeat:
    cli();

    for_each_pgdat(pgdat) {
        zone = pgdat->node_zones + ZONE_LARGEMEM;
        pfreearea = &zone->linear_free_area;
        if (num_adjpages <= pfreearea->nr_free_pages) {
            /*
             * n:            length of the free run found at the candidate
             * little_alloc: current backward-probe offset; 0 means the
             *               search is in forward-scan mode
             * ff:           candidate start index in the bitmap
             */
            int n = 0, little_alloc = 0, ff;

            // looks for adjacent free pages which fulfills the memory request
            p = NULL;
            if (num_adjpages <= SMALL_ALLOC_PAGES) {
                little_alloc = pfreearea->bit_map_size - 16;
                if (little_alloc < 0)
                    little_alloc = 0;
                ff = find_next_zero_bit(pfreearea->bit_map, pfreearea->bit_map_size,
                        little_alloc);
            } else {
                ff = find_next_zero_bit(pfreearea->bit_map, pfreearea->bit_map_size,
                        pfreearea->first_usable_page);
            }

            while (ff + num_adjpages <= pfreearea->bit_map_size || little_alloc > 0) {
                if (ff + num_adjpages <= pfreearea->bit_map_size) {
                    p = zone->zone_mem_map + ff;
                    if (((unsigned long) page_address(p)) &
                            ((PAGE_SIZE << align_order) - 1))
                        n = 0;    /* misaligned start: not a candidate */
                    else
                        for (n = 0; n < num_adjpages; n++, p++) {
                            if (test_bit(p - zone->zone_mem_map, pfreearea->bit_map))
                                break;
                        }
                    if (n >= num_adjpages)
                        break;
                }

                if (little_alloc > 0) {
                    /* backward probe, never below offset 0 */
                    little_alloc -= 16;
                    if (little_alloc < 0)
                        little_alloc = 0;
                    ff = find_next_zero_bit(pfreearea->bit_map, pfreearea->bit_map_size,
                            little_alloc);
                } else {
                    /*
                     * forward scan: skip the run just examined and the
                     * used page that ended it.  This branch is only
                     * reached after the candidate test above ran, so n
                     * is current.
                     */
                    ff = find_next_zero_bit(pfreearea->bit_map, pfreearea->bit_map_size,
                            ff + n + 1);
                }
            }

            // if found the enough big free blocks, marks the pages allocated,
            // sets corresponding bits in bit_map, and decreases free page counter
            if (p && n >= num_adjpages) {
                /* the run length is packed into the low 16 bits of ->index */
                if (num_adjpages > 0xffff)
                    BUG();

                pfreearea->nr_free_pages -= num_adjpages;
                /* p is one past the run; walk back to its first page */
                while (n-- > 0) {
                    p--;
                    set_page_count(p, 1);
                    set_bit(p - zone->zone_mem_map, pfreearea->bit_map);
                    p->index = 0xa1c20000 | num_adjpages;
                    set_page_zone(p, ZONE_LARGEMEM);
                }
                restore_flags(flags);
                return((unsigned long)page_address(p));
            }
        }
    }
    restore_flags(flags);
    if ((current->flags & PF_MEMALLOC) == 0) {
        find_some_memory(3);
        if (repeats++ < 3)
            goto repeat;
    }
    return(0);
}

#endif

/*
 * Freeing function for a buddy system allocator.
 * Contrary to prior comments, this is *NOT* hairy, and there
 * is no reason for anyone not to understand it.
 *
 * The concept of a buddy system is to maintain direct-mapped tables
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep one bit for each pair of blocks, which
 * is set to 1 iff only one of the pair is allocated.  So when we
 * are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were   
 * free, the remainder of the region must be split into blocks.   
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.            
 *
 * -- wli
 */

/*
 * __free_pages_ok - give a block of 2^order pages back to its zone.
 *
 * page:  first page of the block.
 * order: log2 of the number of pages in the block.
 *
 * Reserved pages are ignored.  With CONFIG_SD_LARGEMEM, pages whose zone
 * field says ZONE_LARGEMEM are handed to lgmem_free_contiguous_pages()
 * instead of the buddy lists.  When the current task has PF_FREE_PAGES
 * set, the block may be parked on the task's local_pages list rather than
 * being merged into the zone (see the local_freelist label).
 */
static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
static void __free_pages_ok (struct page *page, unsigned int order)
{
	unsigned long index, page_idx, mask, flags;
	free_area_t *area;
	struct page *base;
	zone_t *zone;

	/*
	 * Subtle. We do not want to test this in the inlined part of
	 * __free_page() - it's a rare condition and just increases
	 * cache footprint unnecesserily. So we do an 'incorrect'
	 * decrement on page->count for reserved pages, but this part
	 * makes it safe.
	 */
	if (PageReserved(page))
		return;

	/*
	 * Yes, think what happens when other parts of the kernel take 
	 * a reference to a page in order to pin it for io. -ben
	 */
	if (PageLRU(page)) {
		if (unlikely(in_interrupt()))
			BUG();
		lru_cache_del(page);
	}

	/* Sanity checks: a page being freed must not still be in use. */
	if (page->buffers)
		BUG();
	if (page->mapping)
		BUG();
	if (!VALID_PAGE(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageActive(page))
		BUG();

#ifdef CONFIG_SD_LARGEMEM
	/* LargeMem pages are managed by the linear bitmap allocator. */
//	if ((page->index & ~0xffff) == 0xa1c20000) {
	if (((page->flags >> ZONE_SHIFT) % MAX_NR_ZONES) == ZONE_LARGEMEM) {
		lgmem_free_contiguous_pages(page, 1 << order);
		return;
	}
#endif

	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));

	/* de-reference all the pages for this order */
	for (page_idx = 1; page_idx < (1 << order); page_idx++)
		set_page_count(&page[page_idx], 0);

	if (current->flags & PF_FREE_PAGES)
		goto local_freelist;
 back_local_freelist:

	zone = page_zone(page);

	/* mask == -(1 << order): low 'order' bits clear, the rest set */
	mask = (~0UL) << order;
	base = zone->zone_mem_map;
	page_idx = page - base;
	/* the block must be naturally aligned to its own size */
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	/* subtracting the negative mask adds 1 << order free pages */
	zone->free_pages -= mask;

	/* coalesce upward until the top order is reached or a buddy is busy */
	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!__test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 * This code is taking advantage of the identity:
		 * 	-mask = 1+~mask
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		/* the free buddy leaves its list; the merged block moves up */
		list_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	list_add(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);
	return;

 local_freelist:
	/*
	 * Park at most one block on the task-local list, and never from
	 * interrupt context; otherwise take the normal path above.  The
	 * block's order is remembered in page->index.
	 */
	if (current->nr_local_pages)
		goto back_local_freelist;
	if (in_interrupt())
		goto back_local_freelist;		

	list_add(&page->list, &current->local_pages);
	page->index = order;
	current->nr_local_pages++;
}

/*
 * Toggle the buddy-pair bit for the order-'order' block that contains page
 * number 'index' (relative to the zone) in 'area's bitmap.  One bit covers
 * a pair of blocks, hence the shift by order + 1.
 */
#define MARK_USED(index, order, area) \
	__change_bit((index) >> (1+(order)), (area)->map)

/*
 * Split a free block of order 'high' down to order 'low'.
 *
 * On each step the block is halved: the lower half is put on the free list
 * of the next smaller order (and its buddy bit toggled), and splitting
 * continues with the upper half.  'index' is the zone-relative page number
 * of 'page', 'area' the free area of order 'high'.  Returns the first page
 * of the remaining order-'low' block.
 */
static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long half;

	for (half = 1 << high; high > low; index += half, page += half) {
		if (BAD_RANGE(zone,page))
			BUG();
		--area;
		--high;
		half >>= 1;
		/* lower half becomes a free block one order down */
		list_add(&page->list, &area->free_list);
		MARK_USED(index, high, area);
	}

	if (BAD_RANGE(zone,page))
		BUG();
	return page;
}

/*
 * rmqueue - take a block of 2^order pages off a zone's buddy free lists.
 *
 * zone:  zone to allocate from.
 * order: log2 of the number of pages wanted.
 *
 * Starting at 'order', the first non-empty free list is used; a larger
 * block is split with expand().  Every page of the returned block gets a
 * reference count of 1.  Returns the first page, or NULL when no free
 * list of a sufficient order has an entry.
 *
 * Orders of MAX_ORDER and above can never be satisfied from the buddy
 * lists.  __alloc_pages() in this file anticipates such orders, so they
 * are rejected here up front rather than indexing past zone->free_area[].
 */
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * rmqueue(zone_t *zone, unsigned int order)
{
	free_area_t * area;
	unsigned int curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;
	int i;

	if (order >= MAX_ORDER)
		return NULL;

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = head->next;

		if (curr != head) {
			unsigned int index;

			page = list_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			list_del(curr);
			index = page - zone->zone_mem_map;
			/* the top order has no buddy bitmap */
			if (curr_order != MAX_ORDER-1)
				MARK_USED(index, curr_order, area);
			zone->free_pages -= 1UL << order;

			/* hand back the surplus if a larger block was taken */
			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			if (PageLRU(page))
				BUG();
			if (PageActive(page))
				BUG();

			/*
			 * we need to reference all the pages for this order,
			 * otherwise if anyone accesses one of the pages with
			 * (get/put) it * will be freed :-(
			 */
			for (i = 1; i < (1 << order); i++)
				set_page_count(&page[i], 1);

			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
}

#ifndef CONFIG_DISCONTIGMEM
/*
 * Contiguous-memory entry point: allocate 2^order pages using the zonelist
 * of the single node (contig_page_data) selected by the zone bits of
 * gfp_mask.  Returns whatever __alloc_pages() returns.
 */
struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
	return __alloc_pages(gfp_mask, order,
		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif

/*
 * balance_classzone - synchronously reclaim memory for an allocation.
 *
 * classzone: zone class the allocation is targeting.
 * gfp_mask:  allocation flags; nothing is done without __GFP_WAIT.
 * order:     order of the allocation being satisfied.
 * freed:     out parameter, receives the result of
 *            try_to_free_pages_zone() (0 if reclaim was not attempted).
 *
 * While reclaim runs the task has PF_MEMALLOC | PF_FREE_PAGES set, so
 * __free_pages_ok() may park a freed block on current->local_pages.
 * Afterwards a parked block of exactly the requested order that fits the
 * classzone is claimed for the caller; everything else on the local list
 * is released to the zones.
 *
 * Returns the claimed page block, or NULL.  Must not be called from
 * interrupt context with __GFP_WAIT set (BUG()).
 */
static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
	struct page * page = NULL;
	int __freed = 0, i;

	if (!(gfp_mask & __GFP_WAIT))
		goto out;
	if (in_interrupt())
		BUG();

	current->allocation_order = order;
	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;

	__freed = try_to_free_pages_zone(classzone, gfp_mask);

	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);

	if (current->nr_local_pages) {
		struct list_head * entry, * local_pages;
		struct page * tmp;
		int nr_pages;

		local_pages = &current->local_pages;

		if (likely(__freed)) {
			/* pick from the last inserted so we're lifo */
			entry = local_pages->next;
			do {
				tmp = list_entry(entry, struct page, list);
				/* page->index holds the order the block was freed with */
				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
					list_del(entry);
					current->nr_local_pages--;
					set_page_count(tmp, 1);

					page = tmp;

					/*
					 * we need to reference all the pages for this order,
					 * otherwise if anyone accesses one of the pages with
					 * (get/put) it * will be freed :-(
					 */
					for (i = 1; i < (1 << order); i++)
						set_page_count(&page[i], 1);

					if (page->buffers)
						BUG();
					if (page->mapping)
						BUG();
					if (!VALID_PAGE(page))
						BUG();
					if (PageLocked(page))
						BUG();
					if (PageLRU(page))
						BUG();
					if (PageActive(page))
						BUG();
					if (PageDirty(page))
						BUG();

					break;
				}
			} while ((entry = entry->next) != local_pages);
		}

		nr_pages = current->nr_local_pages;
		/* free in reverse order so that the global order will be lifo */
		while ((entry = local_pages->prev) != local_pages) {
			list_del(entry);
			tmp = list_entry(entry, struct page, list);
			__free_pages_ok(tmp, tmp->index);
			/* more entries on the list than the counter claimed */
			if (!nr_pages--)
				BUG();
		}
		current->nr_local_pages = 0;
	}
 out:
	*freed = __freed;
	return page;
}

#ifdef CONFIG_SD_LARGEMEM
/*
 * Try to satisfy an order-'order' request from the LargeMem linear
 * allocator.  Returns the first page of the run, or NULL.
 *
 * The address is kept in an unsigned long, matching the return type of
 * __lgmem_get_contiguous_pages(); storing it in a plain 'unsigned' would
 * truncate it wherever long is wider than int.
 */
static struct page * __alloc_pages_largemem(unsigned int gfp_mask, unsigned int order)
{
	unsigned long addr;

	addr = __lgmem_get_contiguous_pages(gfp_mask, 1UL << order, ALIGN_ORDER(order));
	if (addr == 0)
		return NULL;
	return virt_to_page(addr);
}
#endif

/*
 * This is the 'heart' of the zoned buddy allocator:
 *
 * gfp_mask: allocation flags.
 * order:    log2 of the number of pages wanted.
 * zonelist: NULL-terminated list of zones to try, in order of preference;
 *           its first entry is the "classzone" used for balancing.
 *
 * Returns the first page of the allocated block, or NULL.
 *
 * Outline:
 *   0. (CONFIG_SD_LARGEMEM) non-DMA requests of at least
 *      CONFIG_SD_LARGEMEM_MINORDER_PAGE go to the LargeMem allocator first.
 *   1. Try each zone while it stays above its pages_low watermark.
 *   2. Flag the classzone for balancing, wake kswapd, and retry against
 *      the pages_min watermark (a quarter of it for requests that cannot
 *      wait).
 *   3. Tasks with PF_MEMALLOC/PF_MEMDIE ignore the watermarks and then
 *      give up.
 *   4. Requests that may wait reclaim synchronously through
 *      balance_classzone(), retry against pages_min, and loop (yielding
 *      to kswapd) until the configured give-up condition is met.
 *
 * NOTE(review): the LargeMem fallbacks in steps 3 and 4 do not repeat the
 * __GFP_DMA exclusion applied in step 0 -- confirm that is intended.
 */
struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
	unsigned long min;
	zone_t **zone, * classzone;
	struct page * page;
	int freed;
#ifdef CONFIG_SD
	int count = 0;
#endif

#ifdef CONFIG_SD_LARGEMEM
	/* Try to allocate from LargeMem if criteria is met */
	if ((gfp_mask & __GFP_DMA) == 0 && order >= CONFIG_SD_LARGEMEM_MINORDER_PAGE) {
		page = __alloc_pages_largemem(gfp_mask, order);
		if (page)
			return page;
	}
#endif

	zone = zonelist->zones;
	classzone = *zone;
	if (classzone == NULL)
		return NULL;
	/* 'min' accumulates the watermarks of all zones visited so far */
	min = 1UL << order;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		min += z->pages_low;
		if (z->free_pages > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	classzone->need_balance = 1;
	mb();
	if (waitqueue_active(&kswapd_wait))
		wake_up_interruptible(&kswapd_wait);

	zone = zonelist->zones;
	min = 1UL << order;
	for (;;) {
		unsigned long local_min;
		zone_t *z = *(zone++);
		if (!z)
			break;

		local_min = z->pages_min;
		/* callers that cannot wait may dig deeper into the reserve */
		if (!(gfp_mask & __GFP_WAIT))
			local_min >>= 2;
		min += local_min;
		if (z->free_pages > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/* here we're in the low on memory slow path */

rebalance:
	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
		zone = zonelist->zones;
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			page = rmqueue(z, order);
			if (page)
				return page;
		}
#ifdef CONFIG_SD_LARGEMEM
		/* Give it another try from LargeMem zone */
		return __alloc_pages_largemem(gfp_mask, order);
#else
		return NULL;
#endif
	}

	/* Atomic allocations - we can't balance anything */
	if (!(gfp_mask & __GFP_WAIT))
		return NULL;

	page = balance_classzone(classzone, gfp_mask, order, &freed);
	if (page)
		return page;

	zone = zonelist->zones;
	min = 1UL << order;
	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		min += z->pages_min;
		if (z->free_pages > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

#ifdef CONFIG_SD
	if ((order >= MAX_ORDER) || (++count > 512)) {	/* Try to rebalance up to some time */
#ifdef CONFIG_SD_LARGEMEM
		/* Give it another try from LargeMem zone */
		return __alloc_pages_largemem(gfp_mask, order);
#else
		return NULL;	/*  before give up */
#endif
	}
#else
	/* Don't let big-order allocations loop */
	if (order > 3)
		return NULL;
#endif

	/* Yield for kswapd, and try again */
	yield();
	goto rebalance;
}

/*
 * Common helper functions.
 */
unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
	struct page * page;

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	{
		unsigned long addr = (unsigned long) page_address(page);
		return (gfp_mask & GFP_DMA) ? em86xx_to_ncaddr(addr) : addr;
	}
}

/*
 * Allocate a single page, clear it, and return its address (0 on failure).
 * As in __get_free_pages(), GFP_DMA requests get the address translated by
 * em86xx_to_ncaddr(); the page is cleared through the untranslated address.
 */
unsigned long get_zeroed_page(unsigned int gfp_mask)
{
	struct page *pg = alloc_pages(gfp_mask, 0);
	unsigned long vaddr;
	void *mem;

	if (!pg)
		return 0;

	mem = page_address(pg);
	clear_page(mem);

	vaddr = (unsigned long) mem;
	if (gfp_mask & GFP_DMA)
		return em86xx_to_ncaddr(vaddr);
	return vaddr;
}

/*
 * Drop one reference on a 2^order block; when that was the last reference
 * the block is released through __free_pages_ok().  Reserved pages are
 * left untouched (their count is not even decremented).
 */
void __free_pages(struct page *page, unsigned int order)
{
	if (PageReserved(page))
		return;

	if (put_page_testzero(page))
		__free_pages_ok(page, order);
}

/*
 * Address-based counterpart of __free_pages(): the address is mapped back
 * with em86xx_to_caddr() before its page is looked up.  An address of 0 is
 * silently ignored.
 */
void free_pages(unsigned long addr, unsigned int order)
{
	if (addr == 0)
		return;

	__free_pages(virt_to_page(em86xx_to_caddr(addr)), order);
}

/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
	unsigned int sum = 0;
	zone_t *zone;

	for_each_zone(zone)
		sum += zone->free_pages;

	return sum;
}

/*
 * Amount of RAM usable as buffer memory, in pages: for every zone on each
 * node's GFP_USER zonelist, the part of the zone's size that lies above
 * its pages_high watermark.
 */
unsigned int nr_free_buffer_pages (void)
{
	unsigned int total = 0;
	pg_data_t *node;

	for_each_pgdat(node) {
		zonelist_t *list = node->node_zonelists + (GFP_USER & GFP_ZONEMASK);
		zone_t **cursor = list->zones;
		zone_t *z;

		/* the zonelist is NULL-terminated */
		while ((z = *cursor++) != NULL) {
			unsigned long pages = z->size;
			unsigned long watermark = z->pages_high;

			if (pages > watermark)
				total += pages - watermark;
		}
	}

	return total;
}

#if CONFIG_HIGHMEM
/*
 * Number of free pages in the ZONE_HIGHMEM zones of all nodes.
 */
unsigned int nr_free_highpages (void)
{
	unsigned int total = 0;
	pg_data_t *node;

	for_each_pgdat(node) {
		total += node->node_zones[ZONE_HIGHMEM].free_pages;
	}

	return total;
}
#endif

/* Convert a page count to kilobytes (shift by PAGE_SHIFT - 10). */
#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 *
 * Prints, via printk():
 *   - the global free-page totals,
 *   - one watermark line per zone for every node reachable from 'pgdat',
 *   - the active/inactive/free page counters,
 *   - for each zone of 'pgdat' itself, the number of free blocks of every
 *     order and the zone total.
 *
 * With CONFIG_SD_LARGEMEM a populated LargeMem zone is not managed by the
 * buddy allocator and free_area_init_core() never initialises its
 * free_area[] list heads, so they must not be walked.  For that zone the
 * free-page counter of the linear allocator is printed instead, the same
 * way dump_free_areas_core() special-cases it.
 */
void show_free_areas_core(pg_data_t *pgdat)
{
	unsigned int order;
	unsigned type;
	pg_data_t *tmpdat = pgdat;

	printk("Free pages:      %6dkB (%6dkB HighMem)\n",
		K(nr_free_pages()),
		K(nr_free_highpages()));

	while (tmpdat) {
		zone_t *zone;
		for (zone = tmpdat->node_zones;
			       	zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
			printk("Zone:%s freepages:%6lukB min:%6lukB low:%6lukB " 
				       "high:%6lukB\n", 
					zone->name,
					K(zone->free_pages),
					K(zone->pages_min),
					K(zone->pages_low),
					K(zone->pages_high));
			
		tmpdat = tmpdat->node_next;
	}

	printk("( Active: %d, inactive: %d, free: %d )\n",
	       nr_active_pages,
	       nr_inactive_pages,
	       nr_free_pages());

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;

#ifdef CONFIG_SD_LARGEMEM
		if (type == ZONE_LARGEMEM && zone->size) {
			/* linear allocator: there are no buddy lists to walk */
			printk("Zone:%s freepages:%6lukB\n", zone->name,
				K((unsigned long) zone->linear_free_area.nr_free_pages));
			continue;
		}
#endif

		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				/* count the blocks on this order's free list */
				for (;;) {
					if ((curr = curr->next) == head)
						break;
					nr++;
				}
				total += nr * (1 << order);
				printk("%lu*%lukB ", nr, K(1UL) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}
		printk("= %lukB)\n", K(total));
	}

#ifndef NO_MM
#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif	
#endif	
}

/* Print the free-area report starting from the head of the node list. */
void show_free_areas(void)
{
	show_free_areas_core(pgdat_list);
}

/*
 * Builds allocation fallback zone lists.
 *
 * For every possible value of the zone bits of a gfp mask
 * (0..GFP_ZONEMASK) the node gets one NULL-terminated zonelist.  The
 * list starts at the highest zone the mask allows and falls back to the
 * lower ones (HighMem -> Normal -> DMA); zones with no pages are left out.
 *
 * With CONFIG_SD_LARGEMEM the __GFP_HIGHMEM bit is ignored here, so the
 * starting zone is only ever ZONE_NORMAL or ZONE_DMA and the
 * ZONE_HIGHMEM slot never enters a zonelist through this function.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i <= GFP_ZONEMASK; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		/* j: next free slot in the list, k: highest zone allowed */
		j = 0;
		k = ZONE_NORMAL;
#ifndef CONFIG_SD_LARGEMEM
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
#endif
		/* __GFP_DMA takes precedence over the choices above */
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			/* fallthrough: Normal is the fallback for HighMem */
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			/* fallthrough: DMA is the last resort */
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	} 
}

/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages:
 * wait_table_size() divides the zone's page count by it before rounding
 * up to a power of two.
 */
#define PAGES_PER_WAITQUEUE	256

/*
 * Number of hashed waitqueues for a zone of 'pages' pages: one queue per
 * PAGES_PER_WAITQUEUE pages, rounded up to a power of two (minimum 1) and
 * capped at 4096.
 */
static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long wanted = pages / PAGES_PER_WAITQUEUE;
	unsigned long size;

	/* smallest power of two that is not below 'wanted' */
	for (size = 1; size < wanted; size <<= 1)
		;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	return min(size, 4096UL);
}

/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 *
 * 'size' is the value produced by wait_table_size(), i.e. a power of
 * two.  ffz() is applied to the complement of size; assuming ffz()
 * returns the index of the lowest clear bit, that is the index of the
 * single set bit of size, i.e. log2(size).
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	return ffz(~size);
}

/* Round x up to the next multiple of sizeof(long). */
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * nid:              node id; also selects the node's slice of zone_table[].
 * pgdat:            node descriptor to initialise.
 * gmap:             out parameter, receives the node's mem_map pointer.
 * zones_size:       number of pages in each zone (MAX_NR_ZONES entries).
 * zone_start_paddr: physical address of the first page; must be page
 *                   aligned.
 * zholes_size:      pages missing (holes) in each zone, or NULL when there
 *                   are none -- free_area_init() passes NULL.
 * lmem_map:         preallocated struct page array, or NULL to allocate
 *                   one from bootmem.
 *
 * With CONFIG_SD_LARGEMEM the LargeMem zone gets no buddy free lists; it
 * is described by an allocation bitmap in which every page starts out
 * marked as used.
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr, 
	unsigned long *zholes_size, struct page *lmem_map)
{
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);

	if (zone_start_paddr & ~PAGE_MASK)
		BUG();

	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];
			
	printk("On node %d totalpages: %lu\n", nid, realtotalpages);

	/*
	 * Some architectures (with lots of mem and discontinous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from 
	 * PAGE_OFFSET, we need to align the actual array onto a mem map 
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
		lmem_map = (struct page *)(PAGE_OFFSET + 
			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	}
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	pgdat->nr_zones = 0;

#ifdef CONFIG_SD_LARGEMEM
	// exclude largemem from counting
	totalpages -= zones_size[ZONE_LARGEMEM];
	realtotalpages -= zones_size[ZONE_LARGEMEM];
	// zholes_size is optional (NULL means "no holes"), so only add the
	// LargeMem holes back when the table was actually supplied
	if (zholes_size)
		realtotalpages += zholes_size[ZONE_LARGEMEM];
#endif

	offset = lmem_map - mem_map;	
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;

		zone_table[nid * MAX_NR_ZONES + j] = zone;
		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->need_balance = 0;
		if (!size)
			continue;

		/*
		 * The per-page waitqueue mechanism uses hashed waitqueues
		 * per zone.
		 */
		zone->wait_table_size = wait_table_size(size);
		zone->wait_table_shift =
			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
		zone->wait_table = (wait_queue_head_t *)
			alloc_bootmem_node(pgdat, zone->wait_table_size
						* sizeof(wait_queue_head_t));

		for(i = 0; i < zone->wait_table_size; ++i)
			init_waitqueue_head(zone->wait_table + i);

		pgdat->nr_zones = j+1;

		/*
		 * Watermarks: pages_min is realsize / ratio clamped to
		 * [zone_balance_min, zone_balance_max]; low and high are
		 * two and three times that.
		 */
		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->pages_min = mask;
		zone->pages_low = mask*2;
		zone->pages_high = mask*3;

#ifdef CONFIG_SD_MINFREEPAGES
#ifdef CONFIG_SD_LARGEMEM
		if (j != ZONE_LARGEMEM) {
#endif
		/*
		 * Override high/low with this zone's share of the configured
		 * minimum-free size (CONFIG_SD_MINFREEPAGES_SIZE, in kB).
		 */
		zone->pages_high = (CONFIG_SD_MINFREEPAGES_SIZE >> (PAGE_SHIFT - 10)) * size / totalpages;
		zone->pages_low = zone->pages_high / 2;
		if (size > 0)
			printk("zone(%lu): Set minimum memory threshold to %ldKB\n", j, (zone->pages_high * PAGE_SIZE) >> 10);
#ifdef CONFIG_SD_LARGEMEM
		}
#endif
#endif

#ifdef CONFIG_SD_LARGEMEM
		if (j == ZONE_LARGEMEM) {
			zone->pages_min = 0;
			zone->pages_low = 0;
			zone->pages_high = realsize;
		}

		// allocates memory for bit_map.
		// The byte count is rounded UP: one bit per page, so a page
		// count that is not a multiple of 8 still needs its last,
		// partially used byte.
		// NOTE(review): this runs for every populated zone, not only
		// ZONE_LARGEMEM -- confirm the other zones need a bitmap.
		if (zones_size[ZONE_LARGEMEM] > 0) {
			unsigned long bit_map_bytes;

			zone->linear_free_area.bit_map_size = zones_size[ZONE_LARGEMEM];
			bit_map_bytes = LONG_ALIGN((zones_size[ZONE_LARGEMEM] + 7) / 8);
			zone->linear_free_area.bit_map = (unsigned char *)
				alloc_bootmem_node(pgdat, bit_map_bytes);
			memset(zone->linear_free_area.bit_map, 0, bit_map_bytes);
			zone->linear_free_area.first_usable_page = zone->linear_free_area.bit_map_size;
			zone->linear_free_area.nr_free_pages = 0;
		}
#endif
		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
			printk("Warning: wrong zone alignment (0x%08lx, 0x%08x, 0x%08lx)\n", zone_start_paddr, PAGE_SHIFT, zone_required_alignment);

		/*
		 * Initially all pages are reserved - free ones are freed
		 * up by free_all_bootmem() once the early boot process is
		 * done. Non-atomic initialization, single-pass.
		 */
		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			set_page_zone(page, nid * MAX_NR_ZONES + j);
			set_page_count(page, 0);
			SetPageReserved(page);
#ifdef CONFIG_SD_LARGEMEM
			/* LargeMem pages start out "allocated" in the bitmap */
			if (j == ZONE_LARGEMEM)
				set_bit(i, zone->linear_free_area.bit_map);
#endif
			INIT_LIST_HEAD(&page->list);
			if (j != ZONE_HIGHMEM)
				set_page_address(page, __va(zone_start_paddr));
#ifdef CONFIG_SD_LARGEMEM
			if (j == ZONE_LARGEMEM)
				set_page_address(page, __va(zone_start_paddr));
#endif
			zone_start_paddr += PAGE_SIZE;
		}

		offset += size;

#ifdef CONFIG_SD_LARGEMEM
		/* no buddy free lists for the linear allocator's zone */
		if (j == ZONE_LARGEMEM)
			continue;
#endif
		for (i = 0; ; i++) {
			unsigned long bitmap_size;

			INIT_LIST_HEAD(&zone->free_area[i].free_list);
			if (i == MAX_ORDER-1) {
				zone->free_area[i].map = NULL;
				break;
			}

			/*
			 * Page buddy system uses "index >> (i+1)",
			 * where "index" is at most "size-1".
			 *
			 * The extra "+3" is to round down to byte
			 * size (8 bits per byte assumption). Thus
			 * we get "(size-1) >> (i+4)" as the last byte
			 * we can access.
			 *
			 * The "+1" is because we want to round the
			 * byte allocation up rather than down. So
			 * we should have had a "+7" before we shifted
			 * down by three. Also, we have to add one as
			 * we actually _use_ the last bit (it's [0,n]
			 * inclusive, not [0,n[).
			 *
			 * So we actually had +7+1 before we shift
			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
			 * (modulo overflows, which we do not have).
			 *
			 * Finally, we LONG_ALIGN because all bitmap
			 * operations are on longs.
			 */
			bitmap_size = (size-1) >> (i+4);
			bitmap_size = LONG_ALIGN(bitmap_size+1);
			zone->free_area[i].map = 
			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
		}
	}
	build_zonelists(pgdat);
}

/*
 * Initialise the single contiguous node (node 0, contig_page_data) from
 * the per-zone page counts in zones_size: memory starts at physical
 * address 0, there is no hole table, and mem_map is allocated by
 * free_area_init_core().
 */
void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}

/*
 * "memfrac=" boot option: a comma-separated list of integers that replace
 * the entries of zone_balance_ratio[], in zone order.  The resulting table
 * is printed.  Returns 1.
 *
 * At most MAX_NR_ZONES values are consumed; anything after that is ignored
 * instead of being stored past the end of the table.
 */
static int __init setup_mem_frac(char *str)
{
	int j = 0;

	/* get_option() returns 2 while a comma (i.e. another value) follows */
	while (j < MAX_NR_ZONES &&
	       get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}

__setup("memfrac=", setup_mem_frac);

#ifdef CONFIG_SD_CHECKERROR
/*
 * Append formatted text at 'pos' without ever writing at or beyond 'end'.
 * 'pos' is advanced past what was stored; once the buffer is full further
 * output is dropped.  The text is always NUL terminated.
 */
#define DUMP_APPEND(pos, end, fmt...)					\
	do {								\
		int __room = (end) - (pos);				\
		if (__room > 1) {					\
			int __n = snprintf((pos), __room, fmt);		\
			if (__n < 0 || __n >= __room)			\
				__n = __room - 1;			\
			(pos) += __n;					\
		}							\
	} while (0)

/*
 * Format the free-area report of node 'pgdat' into 'dest' and return the
 * number of characters stored (not counting the terminating NUL).
 *
 * The report holds the global counters, one summary line per zone and,
 * for buddy-managed zones, one line per free block (page index range and
 * address range) plus a per-order and a per-zone total.
 *
 * 'dest' is assumed to be the one-page buffer handed to a /proc read
 * handler (the only caller here, mem_info_read_proc(), passes its 'page'
 * argument), so output is limited to PAGE_SIZE bytes; with many free
 * blocks the report is truncated rather than overrunning the buffer.
 * NOTE(review): confirm no caller passes a smaller buffer.
 */
static int dump_free_areas_core(char *dest, pg_data_t *pgdat)
{
	unsigned int order;
	unsigned type;
	char *buf = dest;
	char *end = dest + PAGE_SIZE;

	DUMP_APPEND(buf, end, "Free pages:      %6dkB (%6dkB HighMem)\n",
		K(nr_free_pages()),
		K(nr_free_highpages()));

	DUMP_APPEND(buf, end, "( Active: %d, inactive: %d, free: %d )\n",
	       nr_active_pages,
	       nr_inactive_pages,
	       nr_free_pages());

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;
		struct page *page;
		unsigned int index;
		unsigned long pg_addr;

#ifdef CONFIG_SD_LARGEMEM
		if (type == ZONE_LARGEMEM) { 	/* Skip the LARGEMEM zone */
			linear_free_area_t *pfreearea = &zone->linear_free_area;

			/* the counter is in pages; convert it to the kB the label promises */
			DUMP_APPEND(buf, end, "Zone:%s freepages:%6lukB\n", zone->name,
				K((unsigned long) pfreearea->nr_free_pages));
			continue;
		}
#endif
		DUMP_APPEND(buf, end, "Zone:%s freepages:%6lukB min:%6lukB low:%6lukB "
			       "high:%6lukB\n",
				zone->name,
				K(zone->free_pages),
				K(zone->pages_min),
				K(zone->pages_low),
				K(zone->pages_high));
		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				for (;;) {
					if ((curr = curr->next) == head)
						break;

					page = list_entry(curr, struct page, list);
					pg_addr = (unsigned long)page_address(page);
					index = page - zone->zone_mem_map;
					/* first-last page index and first-last byte address */
					DUMP_APPEND(buf, end, "  %u-%u: 0x%08lx-0x%08lx\n",
						index, index - 1 + (1 << order),
						pg_addr,
						pg_addr - 1 + ((1 << order) << PAGE_SHIFT));
					nr++;
				}
				total += nr * (1 << order);
				if (nr != 0)
					DUMP_APPEND(buf, end, "  === %lu*%lukB\n", nr, K(1UL) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}
		DUMP_APPEND(buf, end, "=== total free %lukB\n", K(total));
	}

	return(buf - dest);
}

/*
 * mem_info_read_proc - read handler registered for the "freemem" entry.
 * @page:  buffer the report is generated into
 * @start: set to the first byte of @page that belongs to this read
 * @off:   offset into the report requested by the reader
 * @count: maximum number of bytes the reader wants
 * @eof:   set to 1 when this read reaches the end of the report
 * @data:  unused
 *
 * The complete report for pgdat_list is regenerated on every call with
 * interrupts disabled, and the window [@off, @off + @count) of it is
 * handed back.  Returns the number of bytes available at *@start.
 */
static int mem_info_read_proc(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
{
	unsigned long irq_state;
	int avail;

	save_flags_cli(irq_state);
	avail = dump_free_areas_core(page, pgdat_list);
	restore_flags(irq_state);

	/* The requested window covers the tail of the report. */
	if (avail <= off+count)
		*eof = 1;

	*start = page + off;

	/* Clamp what is left past the offset to the range [0, count]. */
	avail -= off;
	if (avail > count)
		avail = count;
	if (avail < 0)
		avail = 0;

	return avail;
}

#ifdef CONFIG_SD_LARGEMEM
/*
 * Output helpers for lgmem_info_read_proc().  Both macros expand to code
 * that uses the enclosing function's locals named page, len, off and
 * count.
 *
 * PRINTK(fmt, ...): when page is non-NULL, sprintf() the text at
 * page + len and advance len by the number of characters written;
 * otherwise send the text to printk().
 *
 * FIXUP(label): bookkeeping between chunks of output when page is
 * non-NULL.  If everything produced so far lies at or before the
 * requested offset (len <= off), that output is dropped: off is reduced
 * by len and len restarts at 0, so later text overwrites it.  Otherwise,
 * once more than count - 80 bytes have accumulated past the offset,
 * control jumps to 'label'.  The expansion ends in a dangling "else", so
 * the statement that follows the macro (normally just the ';') becomes
 * the else-branch taken when page is NULL.
 */
#define PRINTK(a...) (page ? (len += sprintf(page + len, a)) : printk(a))
#define FIXUP(t)				\
	if (page) {					\
		if (len <= off) {		\
			off -= len;		\
			len = 0;		\
		} else {			\
			if (len-off > count - 80)	\
				goto t;		\
		}				\
	} else

/*
 * lgmem_info_read_proc - read handler registered for the "lgmem_map" entry.
 * @page:  output buffer, or NULL to send the map to printk() instead
 * @start: set to the first byte of @page that belongs to this read
 * @off:   offset into the map requested by the reader
 * @count: maximum number of bytes the reader wants
 * @eof:   set to 1 when the map ended inside this read
 * @data:  unused
 *
 * Walks bit_map_size pages of the LARGEMEM zone, starting at
 * zone_mem_map, with interrupts disabled and prints one character per
 * page, 64 pages per row, each row prefixed with the address of its
 * first page.  Legend, in the order the tests are made:
 *
 *   '-'  bit clear in the zone bitmap (counted in the "Free mem" total)
 *   'R'  PageReserved
 *   'X'  has a mapping with a_ops ('B', '*', '#', 'M', '%' for specific
 *        readpage implementations when MEGA_HACK is defined)
 *   'S'  PageSwapCache      'L'  PageLocked
 *   'A'  PageActive         'U'  PageLRU
 *   's'  PageSlab           'r'  PG_referenced set
 *   'C'  non-zero use count '?'  none of the above
 *
 * Output generation stops early (see FIXUP) once the requested window is
 * nearly full.  Returns the number of bytes available at *@start, or the
 * accumulated length when @page is NULL.
 */
static int lgmem_info_read_proc(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
{
	int len = 0;
	struct page *p, *ep;
	int cols;
	unsigned long flags;	/* save_flags()/restore_flags() take a long */
	int pcnt = 0;
	zone_t *zone = pgdat_list->node_zones + ZONE_LARGEMEM;
	linear_free_area_t *pfreearea = &zone->linear_free_area;

	save_flags(flags);
	cli();

	FIXUP(got_data);
	cols = 0;

	for (p = zone->zone_mem_map, ep = p + pfreearea->bit_map_size; p < ep; p++) {

#ifdef MEGA_HACK
		extern int blkdev_readpage(struct page *page);
# ifdef CONFIG_BLK_DEV_RAM
		extern int ramdisk_readpage(struct page *page);
# endif
# ifdef CONFIG_ROMFS_FS
		extern int romfs_readpage(struct page *page);
# endif
# ifdef CONFIG_EXT2_FS
		extern int ext2_readpage(struct page *page);
# endif
# ifdef CONFIG_MINIX_FS
		extern int minix_readpage(struct page *page);
# endif
#endif
		/* Start of a row: print the address of its first page. */
		if (cols == 0)
			PRINTK("0x%08lx: ", (unsigned long)page_address(p));

		if (test_bit(p - zone->zone_mem_map, pfreearea->bit_map)) {
			if (PageReserved(p))
				PRINTK("R");
			else if (p->mapping && p->mapping->a_ops) {
#ifdef MEGA_HACK
				if (p->mapping->a_ops->readpage == blkdev_readpage)
					PRINTK("B");
				else
# ifdef CONFIG_BLK_DEV_RAM
				if (p->mapping->a_ops->readpage == ramdisk_readpage)
					PRINTK("*");
				else
# endif
# ifdef CONFIG_ROMFS_FS
				if (p->mapping->a_ops->readpage == romfs_readpage)
					PRINTK("#");
				else
# endif
# ifdef CONFIG_MINIX_FS
				if (p->mapping->a_ops->readpage == minix_readpage)
					PRINTK("M");
				else
# endif
# ifdef CONFIG_EXT2_FS
				if (p->mapping->a_ops->readpage == ext2_readpage)
					PRINTK("%%");	/* literal '%' */
				else
# endif
#endif
					PRINTK("X");
			} else if (PageSwapCache(p))
				PRINTK("S");
			else if (PageLocked(p))
				PRINTK("L");
			else if (PageActive(p))
				PRINTK("A");
			else if (PageLRU(p))
				PRINTK("U");
			else if (PageSlab(p))
				PRINTK("s");
			else if (p->flags & (1<<PG_referenced))
				PRINTK("r");
			else if (atomic_read(&p->count)) {
				PRINTK("C");
			} else
				PRINTK("?");
		} else {
			PRINTK("-");
			pcnt++;
		}
		cols++;
		if (cols >= 64) {
			PRINTK("\n");
			cols = 0;
			FIXUP(got_data);
		}
	}
	if (cols)
		PRINTK("\n");
	/* Shift by the difference so the page count cannot overflow an int. */
	PRINTK("Free mem: %d pages (%dKB).\n", pcnt, pcnt << (PAGE_SHIFT - 10));
	FIXUP(got_data);
	PRINTK("\n");
#if 0
	FIXUP(got_data);

{
	unsigned long total_bytes = 0, total_sbytes = 0, total_slack = 0;
	struct task_struct *p;

	for_each_task(p) {
		struct mm_struct *mm = p->mm;
		unsigned long bytes = 0, sbytes = 0, slack = 0;
		struct mm_tblock_struct * tblock;

		if (!mm)
			continue;
        
		for (tblock = &mm->tblock; tblock; tblock = tblock->next) {
			if (tblock->rblock) {
				bytes += ksize(tblock);
				if (atomic_read(&mm->mm_count) > 1 ||
						tblock->rblock->refcount > 1) {
					sbytes += ksize(tblock->rblock->kblock);
					sbytes += ksize(tblock->rblock) ;
				} else {
					bytes += ksize(tblock->rblock->kblock);
					bytes += ksize(tblock->rblock) ;
					slack += ksize(tblock->rblock->kblock) - tblock->rblock->size;
				}
			}
		}
		
		((atomic_read(&mm->mm_count) > 1) ? sbytes : bytes)
				+= ksize(mm);
		(current->fs && atomic_read(&current->fs->count) > 1 ? sbytes : bytes)
				+= ksize(current->fs);
		(current->files && atomic_read(&current->files->count) > 1 ? sbytes : bytes)
				+= ksize(current->files);
		(current->sig && atomic_read(&current->sig->count) > 1 ? sbytes : bytes)
				+= ksize(current->sig);
		bytes += ksize(current); /* includes kernel stack */

		PRINTK("%-16s Mem:%8lu Slack:%8lu Shared:%8lu\n", p->comm, bytes,
				slack, sbytes);
		FIXUP(got_data);
		total_slack += slack;
		total_sbytes += sbytes;
		total_bytes += bytes;
	}
	PRINTK("%-16s Mem:%8lu Slack:%8lu Shared:%8lu\n\n", "Total", total_bytes,
				total_slack, total_sbytes);
	FIXUP(got_data);
}
#endif

//	len += print_free_areas(page + len, count - len);
//	FIXUP(got_data);

got_data:
	restore_flags(flags);

	if (page) {
		*start = page+off;

		len -= (*start-page);
		/* Stopped with room to spare, so the whole map was produced. */
		if (len <= count - 80)
			*eof = 1;
		if (len>count) len = count;
		if (len<0) len = 0;
	}
	return(len);
}
#endif

static __init int page_alloc_init(void)
{
	create_proc_read_entry("freemem", S_IWUSR | S_IRUGO, NULL,
			mem_info_read_proc, NULL);
#ifdef CONFIG_SD_LARGEMEM
	create_proc_read_entry("lgmem_map", S_IWUSR | S_IRUGO, NULL,
			lgmem_info_read_proc, NULL);
#endif
	return(0);
}

module_init(page_alloc_init);
#endif /* CONFIG_SD_CHECKERROR */

