Bug 20609

Summary: panic: vm_fault: fault on nofault entry, addr: cc4b3000
Product: Base System Reporter: tegge <tegge>
Component: kern    Assignee: Andre Oppermann <andre>
Status: Closed FIXED    
Severity: Affects Only Me    
Priority: Normal    
Version: 5.0-CURRENT   
Hardware: Any   
OS: Any   

Description tegge 2000-08-15 01:20:00 UTC
bfreekva() is supposed to be protected by splbio(), serializing calls to
vm_map_delete().  But vm_map_delete() might block, causing the spl based
serialization to fail.

#0  boot (howto=260) at ../../kern/kern_shutdown.c:303
#1  0xc0169ee5 in panic (fmt=0xc02acaf4 "from debugger")
    at ../../kern/kern_shutdown.c:553
#2  0xc0138d79 in db_panic (addr=-1071163028, have_addr=0, count=-1,
    modif=0xdce1e9c0 "") at ../../ddb/db_command.c:433
#3  0xc0138d19 in db_command (last_cmdp=0xc02e6dd4, cmd_table=0xc02e6c34,
    aux_cmd_tablep=0xc0307110) at ../../ddb/db_command.c:333
#4  0xc0138dde in db_command_loop () at ../../ddb/db_command.c:455
#5  0xc013afaf in db_trap (type=3, code=0) at ../../ddb/db_trap.c:71
#6  0xc02756b1 in kdb_trap (type=3, code=0, regs=0xdce1ead4)
    at ../../i386/i386/db_interface.c:158
#7  0xc028a31c in trap (frame={tf_fs = -1070530536, tf_es = -867500016,
      tf_ds = 16, tf_edi = -867487744, tf_esi = 256, tf_ebp = -589173988,
      tf_isp = -589174016, tf_ebx = -1070796192, tf_edx = -1,
      tf_ecx = 16777217, tf_eax = 18, tf_trapno = 3, tf_err = 0,
      tf_eip = -1071163028, tf_cs = 8, tf_eflags = 582, tf_esp = -1070769885,
      tf_ss = -1070911022}) at ../../i386/i386/trap.c:583
#8  0xc027596c in Debugger (msg=0xc02b31d2 "panic") at machine/cpufunc.h:64
#9  0xc0169edc in panic (
    fmt=0xc02cf260 "vm_fault: fault on nofault entry, addr: %lx")
    at ../../kern/kern_shutdown.c:551
#10 0xc02577e0 in vm_fault (map=0xc031844c, vaddr=3427479552,
    fault_type=1 '\001', fault_flags=0) at ../../vm/vm_fault.c:240
#11 0xc028a686 in trap_pfault (frame=0xdce1ec60, usermode=0, eva=3427479780)
    at ../../i386/i386/trap.c:857
#12 0xc028a1ef in trap (frame={tf_fs = 24, tf_es = -882180080,
      tf_ds = -1072103408, tf_edi = -883516928, tf_esi = 62533,
      tf_ebp = -589173500, tf_isp = -589173620, tf_ebx = -57356,
      tf_edx = -867508224, tf_ecx = 0, tf_eax = 5177, tf_trapno = 12,
      tf_err = 0, tf_eip = -1071322559, tf_cs = 8, tf_eflags = 66050,
      tf_esp = -882193568, tf_ss = 2049081344}) at ../../i386/i386/trap.c:457
#13 0xc024ea41 in ufs_bmaparray (vp=0xdcb018c0, bn=62533, bnp=0xcb6acb68,
    ap=0x0, nump=0x0, runp=0x0, runb=0x0) at ../../ufs/ufs/ufs_bmap.c:224
#14 0xc024e778 in ufs_bmap (ap=0xdce1ed4c) at ../../ufs/ufs/ufs_bmap.c:83
#15 0xc025552d in ufs_vnoperate (ap=0xdce1ed4c)
    at ../../ufs/ufs/ufs_vnops.c:2301
#16 0xc0254f39 in ufs_strategy (ap=0xdce1edb0) at vnode_if.h:902
#17 0xc025552d in ufs_vnoperate (ap=0xdce1edb0)
    at ../../ufs/ufs/ufs_vnops.c:2301
#18 0xc0197790 in cluster_read (vp=0xdcb018c0, filesize=5242880000,
    lblkno=62534, size=32768, cred=0x0, totread=28160, seqcount=0,
    bpp=0xdce1ee44) at vnode_if.h:923
#19 0xc024cea6 in ffs_read (ap=0xdce1ee68) at ../../ufs/ufs/ufs_readwrite.c:266
#20 0xc01a3244 in vn_read (fp=0xc363b140, uio=0xdce1eed8, cred=0xc3699880,
    flags=1, p=0xdcda2ee0) at vnode_if.h:334
#21 0xc017b574 in dofileread (p=0xdcda2ee0, fp=0xc363b140, fd=3,
    buf=0x8163c00, nbyte=512, offset=2049108992, flags=1)
    at ../../sys/file.h:141
#22 0xc017b4b4 in pread (p=0xdcda2ee0, uap=0xdce1ef80)
    at ../../kern/sys_generic.c:136


(kgdb) proc 530
(kgdb) where
#0  mi_switch () at machine/globals.h:119
#1  0xc016cc89 in tsleep (ident=0xc033c298, priority=4,
    wmesg=0xc02d022b "vmwait", timo=0) at ../../kern/kern_synch.c:470
#2  0xc025f9ef in vm_wait () at ../../vm/vm_page.c:896
#3  0xc02601a9 in vm_page_grab (object=0xc03184e0, pindex=118847,
    allocflags=131) at ../../vm/vm_page.c:1479
#4  0xc0258e51 in kmem_alloc (map=0xc031844c, size=4096)
    at ../../vm/vm_kern.c:200
#5  0xc0262f5e in _zget (z=0xc0314ea0) at ../../vm/vm_zone.c:344
#6  0xc0262dd1 in zalloci (z=0xc0314ea0) at ../../vm/vm_zone.h:85
#7  0xc0259723 in vm_map_entry_create (map=0xc0318308)
    at ../../vm/vm_zone.h:117
#8  0xc0259e05 in _vm_map_clip_end (map=0xc0318308, entry=0xdcf30270,
    end=3468730368) at ../../vm/vm_map.c:853
#9  0xc025af0f in vm_map_delete (map=0xc0318308, start=3468713984,
    end=3468730368) at ../../vm/vm_map.c:1794
#10 0xc0192f9b in bfreekva (bp=0xcb690960) at ../../kern/vfs_bio.c:414
#11 0xc0194666 in getnewbuf (slpflag=0, slptimeo=0, size=32768, maxsize=32768)
    at ../../kern/vfs_bio.c:1630
#12 0xc01953f1 in getblk (vp=0xdcb018c0, blkno=139706, size=32768, slpflag=0,
    slptimeo=0) at ../../kern/vfs_bio.c:2220
#13 0xc0197416 in cluster_read (vp=0xdcb018c0, filesize=5242880000,
    lblkno=139706, size=32768, cred=0x0, totread=17408, seqcount=0,
    bpp=0xdcb8ee44) at ../../kern/vfs_cluster.c:120
#14 0xc024cea6 in ffs_read (ap=0xdcb8ee68) at ../../ufs/ufs/ufs_readwrite.c:266
#15 0xc01a3244 in vn_read (fp=0xc363b140, uio=0xdcb8eed8, cred=0xc3699880,
    flags=1, p=0xdcb1a260) at vnode_if.h:334
#16 0xc017b574 in dofileread (p=0xdcb1a260, fp=0xc363b140, fd=3,
    buf=0x814fe00, nbyte=512, offset=4577903104, flags=1)
    at ../../sys/file.h:141
#17 0xc017b4b4 in pread (p=0xdcb1a260, uap=0xdcb8ef80)
    at ../../kern/sys_generic.c:136

(kgdb) proc 529
(kgdb) where
#0  mi_switch () at machine/globals.h:119
#1  0xc016cc89 in tsleep (ident=0xcb5f0dc0, priority=16,
    wmesg=0xc02b65c9 "biord", timo=0) at ../../kern/kern_synch.c:470
#2  0xc0195b9b in bufwait (bp=0xcb5f0dc0) at ../../kern/vfs_bio.c:2620
#3  0xc01978c1 in cluster_read (vp=0xdcb018c0, filesize=5242880000,
    lblkno=131174, size=32768, cred=0x0, totread=13312, seqcount=0,
    bpp=0xdcb8ae44) at ../../kern/vfs_cluster.c:302
#4  0xc024cea6 in ffs_read (ap=0xdcb8ae68) at ../../ufs/ufs/ufs_readwrite.c:266
#5  0xc01a3244 in vn_read (fp=0xc363b140, uio=0xdcb8aed8, cred=0xc3699880,
    flags=1, p=0xdcb1a400) at vnode_if.h:334
#6  0xc017b574 in dofileread (p=0xdcb1a400, fp=0xc363b140, fd=3,
    buf=0x814fc00, nbyte=512, offset=4298289664, flags=1)
    at ../../sys/file.h:141
#7  0xc017b4b4 in pread (p=0xdcb1a400, uap=0xdcb8af80)
    at ../../kern/sys_generic.c:136
#8  0xc028ad95 in syscall2 (frame={tf_fs = 47, tf_es = 47, tf_ds = 47,
      tf_edi = 512, tf_esi = 1, tf_ebp = -1115685468, tf_isp = -591876140,
      tf_ebx = 1498383852, tf_edx = 1, tf_ecx = 134520321, tf_eax = 198,
      tf_trapno = 7, tf_err = 2, tf_eip = 1498088260, tf_cs = 31,
      tf_eflags = 514, tf_esp = -1115685528, tf_ss = 47})
    at ../../i386/i386/trap.c:1174
#9  0xc027608b in Xint0x80_syscall ()

(kgdb) proc 528
(kgdb) where
#0  mi_switch () at machine/globals.h:119
#1  0xc016cc89 in tsleep (ident=0xc033c298, priority=4,
    wmesg=0xc02d022b "vmwait", timo=0) at ../../kern/kern_synch.c:470
#2  0xc025f9ef in vm_wait () at ../../vm/vm_page.c:896
#3  0xc02601a9 in vm_page_grab (object=0xc03184e0, pindex=118885,
    allocflags=131) at ../../vm/vm_page.c:1479
#4  0xc0258e51 in kmem_alloc (map=0xc031844c, size=4096)
    at ../../vm/vm_kern.c:200
#5  0xc0262f5e in _zget (z=0xc0314ea0) at ../../vm/vm_zone.c:344
#6  0xc0262dd1 in zalloci (z=0xc0314ea0) at ../../vm/vm_zone.h:85
#7  0xc0259723 in vm_map_entry_create (map=0xc0318308)
    at ../../vm/vm_zone.h:117
#8  0xc0259d69 in _vm_map_clip_start (map=0xc0318308, entry=0xdcb41c60,
    start=3425099776) at ../../vm/vm_map.c:793
#9  0xc025aec7 in vm_map_delete (map=0xc0318308, start=3425099776,
    end=3425116160) at ../../vm/vm_map.c:1767
#10 0xc0192f9b in bfreekva (bp=0xcb558a20) at ../../kern/vfs_bio.c:414
#11 0xc0194666 in getnewbuf (slpflag=0, slptimeo=0, size=32768, maxsize=32768)
    at ../../kern/vfs_bio.c:1630
#12 0xc01953f1 in getblk (vp=0xdcb018c0, blkno=50567, size=32768, slpflag=0,
    slptimeo=0) at ../../kern/vfs_bio.c:2220
#13 0xc0197416 in cluster_read (vp=0xdcb018c0, filesize=5242880000,
    lblkno=50567, size=32768, cred=0x0, totread=6144, seqcount=0,
    bpp=0xdcb86e44) at ../../kern/vfs_cluster.c:120
#14 0xc024cea6 in ffs_read (ap=0xdcb86e68) at ../../ufs/ufs/ufs_readwrite.c:266
#15 0xc01a3244 in vn_read (fp=0xc363b140, uio=0xdcb86ed8, cred=0xc3699880,
    flags=1, p=0xdcb1a5a0) at vnode_if.h:334
#16 0xc017b574 in dofileread (p=0xdcb1a5a0, fp=0xc363b140, fd=3,
    buf=0x814fa00, nbyte=512, offset=1656985088, flags=1)
    at ../../sys/file.h:141
#17 0xc017b4b4 in pread (p=0xdcb1a5a0, uap=0xdcb86f80)
    at ../../kern/sys_generic.c:136
#18 0xc028ad95 in syscall2 (frame={tf_fs = 47, tf_es = 47, tf_ds = 47,
      tf_edi = 512, tf_esi = 0, tf_ebp = -1113588316, tf_isp = -591892524,
      tf_ebx = 1498383852, tf_edx = 0, tf_ecx = 134520321, tf_eax = 198,
      tf_trapno = 7, tf_err = 2, tf_eip = 1498088260, tf_cs = 31,
      tf_eflags = 514, tf_esp = -1113588376, tf_ss = 47})
    at ../../i386/i386/trap.c:1174
#19 0xc027608b in Xint0x80_syscall ()

Fix: 

Alternative 1:

  Obtain an exclusive lock for buffer_map in bfreekva before calling
  vm_map_delete().  Release it afterwards.

  Obtain an exclusive lock on buffer_map before calling vm_map_findspace()
  from getnewbuf().  Release it after call to vm_map_insert().

  never call bfreekva() from interrupts.

Alternative 2:

  define buffer_map as a system map.  This causes the nonblocking
  kmapent zone to be used for allocation of vm map entries for
  buffer_map.
How-To-Repeat: 
Start many parallel read operations for the first time on a system while
having little free memory.  Use a different file system block size on the
partition used for the testing.
Comment 1 Tor Egge 2000-08-15 05:12:03 UTC
defining buffer_map as a system map is not sufficient.

It's still possible to get a Fatal trap 12: page fault while in kernel mode.


dump of kernel_map's vm map entries:

[...]
map c031844c entry d6050990 start dcf25000 end dcf26000
map c031844c entry d6053a20 start dcf26000 end dcf27000
map c031844c entry d604e300 start ce7e6000 end dcf27000
				  ^^^^^^^^ bogus
map c031844c entry d6054b40 start dcf27000 end dcf28000
map c031844c entry d6056d80 start dcf28000 end dcf29000
map c031844c entry d60550c0 start dcf29000 end dcf2a000
map c031844c entry d604a750 start dcf2a000 end dcf2b000
map c031844c entry d6054150 start dcf2b000 end dcf2c000
map c031844c entry d6055150 start dcf2c000 end dcf2d000
map c031844c entry d6054870 start dcf2d000 end dcf2e000
map c031844c entry d604fa50 start dcf2e000 end dcf2f000
map c031844c entry d6051180 start dcf2f000 end dcf30000
map c031844c entry d6050930 start dcf30000 end dcf31000
map c031844c entry d60561b0 start dcf31000 end dcf32000
map c031844c entry d604f540 start ce96e000 end dcf33000
				  ^^^^^^^^ bogus
map c031844c entry d6053210 start dcf33000 end dcf34000

The bogus start addresses are locations inside buffer_map.

This indicates yet another race condition.

vm_map_entry_create should probably use zalloci when allocating elements
from kmapentzone.  Otherwise, a malloc() call inside an interrupt could
cause inconsistent allocation of vm_map_entries from kmapentzone.

Since these problems occur on an SMP machine, another race condition
is also present.  By forcing a panic when multiple processes were
manipulating buffer_map at the same time, I found the following
stack trace:

#0  mi_switch () at machine/globals.h:119
#1  0xc016cca9 in tsleep (ident=0xc033c398, priority=4, 
    wmesg=0xc02d032b "vmwait", timo=0) at ../../kern/kern_synch.c:470
#2  0xc025fa8f in vm_wait () at ../../vm/vm_page.c:896
#3  0xc0260249 in vm_page_grab (object=0xc03185e0, pindex=118761, 
    allocflags=131) at ../../vm/vm_page.c:1479
#4  0xc0258ee5 in kmem_alloc (map=0xc031854c, size=4096)
    at ../../vm/vm_kern.c:200
#5  0xc0262ffe in _zget (z=0xc03186c0) at ../../vm/vm_zone.c:344
#6  0xc0262e71 in zalloci (z=0xc03186c0) at ../../vm/vm_zone.h:85
#7  0xc025d84b in vm_object_allocate (type=0, size=32)
    at ../../vm/vm_object.c:224
#8  0xc0259e88 in _vm_map_clip_end (map=0xc0318408, entry=0xd604ede0, 
    end=3460128768) at ../../vm/vm_map.c:848
#9  0xc025afaf in vm_map_delete (map=0xc0318408, start=3460096000, 
    end=3460128768) at ../../vm/vm_map.c:1799
#10 0xc0192fea in bfreekva (bp=0xcb5c2380) at ../../kern/vfs_bio.c:422
#11 0xc01946ba in getnewbuf (slpflag=0, slptimeo=0, size=32768, maxsize=32768)
    at ../../kern/vfs_bio.c:1639
#12 0xc0195485 in getblk (vp=0xdca96180, blkno=151606, size=32768, slpflag=0, 
    slptimeo=0) at ../../kern/vfs_bio.c:2236
#13 0xc01974aa in cluster_read (vp=0xdca96180, filesize=5242880000, 
    lblkno=151606, size=32768, cred=0x0, totread=29696, seqcount=0, 
    bpp=0xdcb05e44) at ../../kern/vfs_cluster.c:120
#14 0xc024cf3a in ffs_read (ap=0xdcb05e68) at ../../ufs/ufs/ufs_readwrite.c:266
#15 0xc01a32d8 in vn_read (fp=0xc3699c00, uio=0xdcb05ed8, cred=0xc3656700, 
    flags=1, p=0xd7b3d1e0) at vnode_if.h:334
#16 0xc017b594 in dofileread (p=0xd7b3d1e0, fp=0xc3699c00, fd=3, 
    buf=0x814e400, nbyte=512, offset=4967854592, flags=1)
    at ../../sys/file.h:141
#17 0xc017b4d4 in pread (p=0xd7b3d1e0, uap=0xdcb05f80)
    at ../../kern/sys_generic.c:136
#18 0xc028ae35 in syscall2 (frame={tf_fs = 47, tf_es = 47, tf_ds = 47, 
      tf_edi = 512, tf_esi = 1, tf_ebp = -1090519644, tf_isp = -592420908, 
      tf_ebx = 1498383852, tf_edx = 1, tf_ecx = 134520321, tf_eax = 198, 
      tf_trapno = 7, tf_err = 2, tf_eip = 1498088260, tf_cs = 31, 
      tf_eflags = 514, tf_esp = -1090519704, tf_ss = 47})
    at ../../i386/i386/trap.c:1174
#19 0xc027612b in Xint0x80_syscall ()

Blocking, waiting for an object to be available, will cause an inconsistent
buffer map.

using lockmgr calls (i.e. Alternative 1 in previous message) might
cause deadlock problems if the thread holding the buffer_map lock
blocks waiting for memory to be freed while the pager blocks waiting
for a buffer to be available.

Perhaps various vm_map operations need to check a flag in the vm map
and avoid allocating vm objects for some maps, e.g. buffer_map.

- Tor Egge
Comment 2 Sheldon Hearn 2000-08-15 09:07:50 UTC
On Mon, 14 Aug 2000 21:20:04 MST, Tor.Egge@fast.no wrote:

>  Perhaps various vm_map operations need to check a flag in the vm map
>  and avoid allocating vm objects for some maps, e.g. buffer_map.

Hi Tor,

Have you dropped Matt Dillon <dillon@FreeBSD.org> a message pointing him
to this PR, or are you going to tackle this yourself?  If the latter,
will you assign the PR to yourself?

Ciao,
Sheldon.
Comment 3 Tor Egge 2000-08-15 20:14:36 UTC
> Hi Tor,
> 
> Have you dropped Matt Dillon <dillon@FreeBSD.org> a message pointing him
> to this PR, or are you going to tackle this yourself?  If the latter,
> will you assign the PR to yourself?

He gets this message.

I'm currently looking into the class of problem by adding some 
helper functions that can be used for invariant checks.

e.g. 

 -  Denote that a function should not be allowed to block:

	- increase mi_switch disallow counter

	- decrease mi_switch disallow counter


 - Check that process is allowed to block when trying to do so
   (inside mi_switch and tsleep)

 - Check that process is allowed to block at various other
   points (malloc with M_WAIT, zalloc on zone without ZONE_INTERRUPT flag
   set, vm_page_grab with VM_ALLOC_RETRY)

 - Check that we've got proper spl protection
   (zalloc one zone with ZONE_INTERRUPT flag set).

 - Disallow blocking in vm_map_delete, vm_map_findspace and
   vm_map_insert if the map is a system map.

The first relevant changes should probably be:

	- buffer_map should be a system map to avoid blocking.

	- vm_map* routines should not create extra objects on system maps
	  since the object allocation might block.  Since system maps
	  might be manipulated by interrupts, blocking is not allowed.

	- vm_map_entry_create and vm_map_entry_dispose should use
	  zalloci/zfreei when allocating/freeing elements from/to 
	  kmapentzone.  This to avoid race conditions when interrupts
	  manipulate maps.
	

Further changes might be needed to ensure consistent behavior as 
blocking routines called from routines not allowed to block are found.

I do not plan on committing any fix for this before it has been reviewed.

- Tor Egge
Comment 4 Tor Egge 2000-08-15 21:46:15 UTC
> Further changes might be needed to ensure consistent behavior as 
> blocking routines called from routines not allowed to block is found.

While running a kernel instrumented with further sanity checks I 
found the additional problem:

panic: zfree interrupt zone withput proper spl

mp_lock = 01000001; cpuid = 1; lapic.id = 00000000
Debugger("panic")
Stopped at      Debugger+0x34:  movb    $0,in_Debugger.609
db> trace
Debugger(c02b5072) at Debugger+0x34
panic(c02d25a0,8,0,594ae000,80000000) at panic+0xa4
swp_pager_meta_ctl(dcec9300,0,2,dceedf28,c02596ad) at swp_pager_meta_ctl+0x137
swap_pager_unswapped(c0bb28dc) at swap_pager_unswapped+0x13
vm_fault(d7b42340,594ae000,2,8,c) at vm_fault+0x9e1
trap_pfault(dceedfa8,1,594ae018,0,0) at trap_pfault+0xa2
trap(2f,2f,2f,0,0) at trap+0x1ff
calltrap() at calltrap+0x17
--- trap 0xc, eip = 0x80921dc, esp = 0xbfbffc14, ebp = 0xbfbffc3c ---
db> panic


Calls to zfree on swap_zone without splvm() protection seem like an error.

- Tor Egge
Comment 5 Tor Egge 2000-08-15 23:43:05 UTC
Here is a suggested fix.  Please review.

Index: sys/i386/i386/machdep.c
===================================================================
RCS file: /home/ncvs/src/sys/i386/i386/machdep.c,v
retrieving revision 1.401
diff -u -r1.401 machdep.c
--- sys/i386/i386/machdep.c	2000/08/13 05:17:46	1.401
+++ sys/i386/i386/machdep.c	2000/08/15 22:39:49
@@ -379,6 +379,7 @@
 			(nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size);
 	buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva,
 				(nbuf*BKVASIZE));
+	buffer_map->system_map = 1;
 	pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva,
 				(nswbuf*MAXPHYS) + pager_map_size);
 	pager_map->system_map = 1;
Index: sys/vm/vm_map.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_map.c,v
retrieving revision 1.190
diff -u -r1.190 vm_map.c
--- sys/vm/vm_map.c	2000/05/14 18:46:40	1.190
+++ sys/vm/vm_map.c	2000/08/15 22:39:49
@@ -271,7 +271,10 @@
 	vm_map_t map;
 	vm_map_entry_t entry;
 {
-	zfree((map->system_map || !mapentzone) ? kmapentzone : mapentzone, entry);
+	if (map->system_map || !mapentzone)
+		zfreei(kmapentzone, entry);
+	else
+		zfree(mapentzone, entry);
 }
 
 /*
@@ -286,8 +289,10 @@
 {
 	vm_map_entry_t new_entry;
 
-	new_entry = zalloc((map->system_map || !mapentzone) ? 
-		kmapentzone : mapentzone);
+	if (map->system_map || !mapentzone)
+		new_entry = zalloci(kmapentzone);
+	else
+		new_entry = zalloc(mapentzone);
 	if (new_entry == NULL)
 	    panic("vm_map_entry_create: kernel resources exhausted");
 	return(new_entry);
@@ -776,7 +781,7 @@
 	 * put this improvement.
 	 */
 
-	if (entry->object.vm_object == NULL) {
+	if (entry->object.vm_object == NULL && !map->system_map) {
 		vm_object_t object;
 		object = vm_object_allocate(OBJT_DEFAULT,
 				atop(entry->end - entry->start));
@@ -832,7 +837,7 @@
 	 * put this improvement.
 	 */
 
-	if (entry->object.vm_object == NULL) {
+	if (entry->object.vm_object == NULL && !map->system_map) {
 		vm_object_t object;
 		object = vm_object_allocate(OBJT_DEFAULT,
 				atop(entry->end - entry->start));
@@ -1287,7 +1292,8 @@
 					    atop(entry->end - entry->start));
 					entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 
-				} else if (entry->object.vm_object == NULL) {
+				} else if (entry->object.vm_object == NULL &&
+					   !map->system_map) {
 
 					entry->object.vm_object =
 					    vm_object_allocate(OBJT_DEFAULT,
@@ -1477,7 +1483,8 @@
 						    &entry->offset,
 						    atop(entry->end - entry->start));
 						entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
-					} else if (entry->object.vm_object == NULL) {
+					} else if (entry->object.vm_object == NULL &&
+						   !map->system_map) {
 						entry->object.vm_object =
 						    vm_object_allocate(OBJT_DEFAULT,
 							atop(entry->end - entry->start));
@@ -2604,7 +2611,8 @@
 	/*
 	 * Create an object if necessary.
 	 */
-	if (entry->object.vm_object == NULL) {
+	if (entry->object.vm_object == NULL &&
+	    !map->system_map) {
 		if (vm_map_lock_upgrade(map)) 
 			goto RetryLookup;
 
Index: sys/vm/vm_fault.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_fault.c,v
retrieving revision 1.112
diff -u -r1.112 vm_fault.c
--- sys/vm/vm_fault.c	2000/05/29 22:40:54	1.112
+++ sys/vm/vm_fault.c	2000/08/15 22:39:49
@@ -796,6 +796,7 @@
 		 * any swap backing since the page is now dirty.
 		 */
 		if (fault_flags & VM_FAULT_DIRTY) {
+			int s;
 			if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
 				if (fs.m->dirty == 0)
 					vm_page_flag_set(fs.m, PG_NOSYNC);
@@ -803,7 +804,9 @@
 				vm_page_flag_clear(fs.m, PG_NOSYNC);
 			}
 			vm_page_dirty(fs.m);
+			s = splvm();
 			vm_pager_page_unswapped(fs.m);
+			splx(s);
 		}
 	}
Comment 6 Sheldon Hearn freebsd_committer freebsd_triage 2000-08-16 08:22:57 UTC
Responsible Changed
From-To: freebsd-bugs->dillon

Matt asked for this one.
Comment 7 dillon 2001-01-27 18:59:53 UTC
   I finally have time to play with your buffer cache fixes.  I am making
   one addition (and testing it today), and that is to have the buffer
   cache code call vm_map_simplify_entry() in order to collapse vm_map_entry
   elements together.  At the moment each buffer cache buffer allocates
   its own vm_map_entry, so a system with say 50MB worth of buffers will
   eat over 3000 vm_map_entry structures unnecessarily.  The vm_map is
   mostly contiguous in the buffer cache due to the way the buffer
   cache allocates and maintains its KVA space.

   With your patch in place, it is possible to call vm_map_simplify_entry(),
   which should reduce the number of vm_map_entry structures to just a
   handful (maybe a dozen).

   I'll email you the diffs after I've done some basic testing.

						-Matt

:Here is a suggested fix.  Please review.
:
:Index: sys/i386/i386/machdep.c
:===================================================================
:RCS file: /home/ncvs/src/sys/i386/i386/machdep.c,v
:...
Comment 8 dillon 2001-01-27 20:23:14 UTC
:    elements together.  At the moment each buffer cache buffer allocates
:    its own vm_map_entry, so a system with say 50MB worth of buffers will

    Correction:  The initial allocation of buffers is linear and hits a
    vm_map_insert() optimization case, so only one vm_map_entry is created
    to cover the whole buffer map.  However, as time progresses and the
    buffer map gets fragmented, the act of freeing & reallocating creates
    serious fragmentation due to not calling vm_map_simplify_entry().

    The buffer cache code rarely calls bfreekva() - it reuses an existing
    mapping when possible.  Generally speaking the KVA is only freed when
    the buffer cache is forced to defragment the buffer_map due to 
    handling filesystems with mixed block sizes.  Since buffers are 16K
    aligned in KVM, only filesystem mixes with block sizes greater than
    16K will typically cause this problem to occur.

    However, I did find an edge case in the buffer cache code whereby it
    might decide to defragment due to a 'bufspace overcommit'.  The edge
    case can occur under any load condition.  The resulting fragmentation
    creates a proliferation of vm_map_entry elements associated with the
    buffer_map due to missing vm_map_simplify_entry() calls (the 
    vm_map_insert() optimization is not an all-encompassing optimization).
    With Tor's patch fixing the bogus vm_object allocations, and changing
    the buffer_map to a system map, we can now safely call 
    vm_map_simplify_entry().

						-Matt
Comment 9 Giorgos Keramidas freebsd_committer freebsd_triage 2003-02-23 02:15:29 UTC
Responsible Changed
From-To: dillon->freebsd-bugs

Back to the free pool.
Comment 10 Andre Oppermann freebsd_committer freebsd_triage 2003-12-27 15:25:49 UTC
State Changed
From-To: open->closed

This has been fixed ages ago by dillon in rev 1.194 of 
sys/vm/vm_map.c. 


Comment 11 Andre Oppermann freebsd_committer freebsd_triage 2003-12-27 15:25:49 UTC
Responsible Changed
From-To: freebsd-bugs->andre

This has been fixed ages ago by dillon in rev 1.194 of 
sys/vm/vm_map.c.