\# qemu -kernel loads the kernel at 0x80000000 # and causes each CPU to jump there. # kernel.ld causes the following code to # be placed at 0x80000000. .section .text .global _entry _entry: # set up a stack for C. # stack0 is declared in start.c, # with a 4096-byte stack per CPU. # sp = stack0 + (hartid \* 4096) la sp, stack0 li a0, 1024\*4 csrr a1, mhartid addi a1, a1, 1 mul a0, a0, a1 add sp, sp, a0 # jump to start() in start.c call start
// entry.S jumps here in machine mode on stack0. void start() { // set M Previous Privilege mode to Supervisor, for mret. unsigned long x = r_mstatus(); x &= ~MSTATUS_MPP_MASK; x = MSTATUS_MPP_S; w_mstatus(x);
// set M Exception Program Counter to main, for mret. // requires gcc -mcmodel=medany w_mepc((uint64)main);
// disable paging for now. w_satp(0);
// delegate all interrupts and exceptions to supervisor mode. w_medeleg(0xffff); w_mideleg(0xffff); w_sie(r_sie() SIE_SEIE SIE_STIE SIE_SSIE);
// configure Physical Memory Protection to give supervisor mode // access to all of physical memory. w_pmpaddr0(0x3fffffffffffffull); w_pmpcfg0(0xf);
// ask for clock interrupts. timerinit();
// keep each CPU's hartid in its tp register, for cpuid(). int id = r_mhartid(); w_tp(id);
// switch to supervisor mode and jump to main(). asm volatile("mret"); }
xv6提供 Unix 操作系统中的基本接口(由 Ken Thompson 和 Dennis Ritchie 引入),同时模仿 Unix 的内部设计。Unix 里机制结合良好的窄接口提供了令人吃惊的通用性。这样的接口设计非常成功,使得包括 BSD,Linux,Mac OS X,Solaris (甚至 Microsoft Windows 在某种程度上)都有类似 Unix 的接口。理解 xv6 是理解这些操作系统的一个良好起点。
// p->lock must be held when using these: enum procstate state; // Process state void \*chan; // If non-zero, sleeping on chan int killed; // If non-zero, have been killed int xstate; // Exit status to be returned to parent's wait int pid; // Process ID
// wait_lock must be held when using this: struct proc \*parent; // Parent process
// these are private to the process, so p->lock need not be held. uint64 kstack; // Virtual address of kernel stack uint64 sz; // Size of process memory (bytes) pagetable_t pagetable; // User page table struct trapframe \*trapframe; // data page for trampoline.S struct context context; // swtch() here to run process struct file \*ofile[NOFILE]; // Open files struct inode \*cwd; // Current directory char name[16]; // Process name (debugging) };
uservec: # # trap.c sets stvec to point here, so # traps from user space start here, # in supervisor mode, but with a # user page table. # # sscratch points to where the process's p->trapframe is # mapped into user space, at TRAPFRAME. # # swap a0 and sscratch # so that a0 is TRAPFRAME csrrw a0, sscratch, a0
// we're about to switch the destination of traps from // kerneltrap() to usertrap(), so turn off interrupts until // we're back in user space, where usertrap() is correct. intr_off();
// send syscalls, interrupts, and exceptions to trampoline.S w_stvec(TRAMPOLINE + (uservec - trampoline));
// set up trapframe values that uservec will need when // the process next re-enters the kernel. p->trapframe->kernel_satp = r_satp(); // kernel page table p->trapframe->kernel_sp = p->kstack + PGSIZE; // process's kernel stack p->trapframe->kernel_trap = (uint64)usertrap; p->trapframe->kernel_hartid = r_tp(); // hartid for cpuid()
// set up the registers that trampoline.S's sret will use // to get to user space. // set S Previous Privilege mode to User. unsigned long x = r_sstatus(); x &= ~SSTATUS_SPP; // clear SPP to 0 for user mode x = SSTATUS_SPIE; // enable interrupts in user mode w_sstatus(x);
// set S Exception Program Counter to the saved user pc. w_sepc(p->trapframe->epc);
// tell trampoline.S the user page table to switch to. uint64 satp = MAKE_SATP(p->pagetable);
// jump to trampoline.S at the top of memory, which // switches to the user page table, restores user registers, // and switches to user mode with sret. uint64 fn = TRAMPOLINE + (userret - trampoline); ((void (\*)(uint64,uint64))fn)(TRAPFRAME, satp); }
userret: # userret(TRAPFRAME, pagetable) # switch from kernel to user. # usertrapret() calls here. # a0: TRAPFRAME, in user page table. # a1: user page table, for satp.
# switch to the user page table. csrw satp, a1 sfence.vma zero, zero
# put the saved user a0 in sscratch, so we # can swap it with our a0 (TRAPFRAME) in the last step. ld t0, 112(a0) csrw sscratch, t0
# restore user a0, and save TRAPFRAME in sscratch csrrw a0, sscratch, a0 # return to user mode and user pc. # usertrapret() set up sstatus and sepc. sret
// give up the CPU if this is a timer interrupt. if(which_dev == 2 && myproc() != 0 && myproc()->state == RUNNING) yield();
// the yield() may have caused some traps to occur, // so restore trap registers for use by kernelvec.S's sepc instruction. w_sepc(sepc); w_sstatus(sstatus); }
// the PLIC allows each device to raise at most one // interrupt at a time; tell the PLIC the device is // now allowed to interrupt again. if(irq) plic_complete(irq);
return 1; } else if(scause == 0x8000000000000001L){ // software interrupt from a machine-mode timer interrupt, // forwarded by timervec in kernelvec.S.
if(cpuid() == 0){ clockintr(); } // acknowledge the software interrupt by clearing // the SSIP bit in sip. w_sip(r_sip() & ~2);
return 2; } else { return 0; } } //中断处理分成两个部分,前面部分把存储在UART寄存器的键盘输入发送. void uartintr(void) { //keyborad->RHR // read and process incoming characters.(处理控制台输入) while(1){ int c = uartgetc(); if(c == -1) break; consoleintr(c); }
void uartstart() { while(1){ if(uart_tx_w == uart_tx_r){ // transmit buffer is empty. return; } if((ReadReg(LSR) & LSR_TX_IDLE) == 0){ // the UART transmit holding register is full, // so we cannot give it another byte. // it will interrupt when it's ready for a new byte. return; } int c = uart_tx_buf[uart_tx_r % UART_TX_BUF_SIZE]; uart_tx_r += 1; // maybe uartputc() is waiting for space in the buffer. wakeup(&uart_tx_r); WriteReg(THR, c); } }
Another way in which concurrency requires care in drivers is that one process may be waiting for input from a device, but the interrupt signaling arrival of the input may arrive when a different process (or no process at all) is running. Thus interrupt handlers are not allowed to think about the process or code that they have interrupted. For example, an interrupt handler cannot safely call copyout with the current process’s page table. Interrupt handlers typically do relatively little work (e.g., just copy the input data to a buffer), and wake up top-half code to do the rest.
void scheduler(void) { struct proc \*p; struct cpu \*c = mycpu(); c->proc = 0; for(;;){ // Avoid deadlock by ensuring that devices can interrupt. intr_on();
for(p = proc; p < &proc[NPROC]; p++) { acquire(&p->lock); if(p->state == RUNNABLE) { // Switch to chosen process. It is the process's job // to release its lock and then reacquire it // before jumping back to us. p->state = RUNNING; c->proc = p; swtch(&c->context, &p->context);
// Process is done running for now. // It should have changed its p->state before coming back. c->proc = 0; } release(&p->lock); } } }
// Still holding p->lock from scheduler. release(&myproc()->lock);
if (first) { // File system initialization must be run in the context of a // regular process (e.g., because it calls sleep), and thus cannot // be run from main(). first = 0; fsinit(ROOTDEV); }
// Per-CPU state. struct cpu { struct proc \*proc; // The process running on this cpu, or null. struct context context; // swtch() here to enter scheduler(). int noff; // Depth of push_off() nesting. int intena; // Were interrupts enabled before push_off()? };
这个结构体存放着scheduler的上下文,以及当前运行的进程.
1 2 3 4 5 6 7 8 9 10 11 12 13
struct cpu\* mycpu(void) { int id = cpuid(); struct cpu \*c = &cpus[id]; return c; }
// Return the current struct proc \*, or zero if none. struct proc\* myproc(void) { push_off(); struct cpu \*c = mycpu(); struct proc \*p = c->proc; pop_off(); return p; }
// Atomically release lock and sleep on chan. // Reacquires lock when awakened. void sleep(void \*chan, struct spinlock \*lk) { struct proc \*p = myproc(); // Must acquire p->lock in order to // change p->state and then call sched. // Once we hold p->lock, we can be // guaranteed that we won't miss any wakeup // (wakeup locks p->lock), // so it's okay to release lk.
acquire(&p->lock); //DOC: sleeplock1 release(lk);
// Go to sleep. p->chan = chan; p->state = SLEEPING;
sched();
// Tidy up. p->chan = 0;
// Reacquire original lock. release(&p->lock); acquire(lk); }
struct pipe { struct spinlock lock; char data[PIPESIZE]; uint nread; // number of bytes read uint nwrite; // number of bytes written int readopen; // read fd is still open int writeopen; // write fd is still open };
int wait(uint64 addr) { struct proc \*np; int havekids, pid; struct proc \*p = myproc();
acquire(&wait_lock);
for(;;){ // Scan through table looking for exited children. havekids = 0; for(np = proc; np < &proc[NPROC]; np++){ if(np->parent == p){ // make sure the child isn't still in exit() or swtch(). acquire(&np->lock);
// No point waiting if we don't have any children. if(!havekids p->killed){ release(&wait_lock); return -1; } // Wait for a child to exit. sleep(p, &wait_lock); //DOC: wait-sleep } }
struct buf { int valid; // has data been read from disk? int disk; // does disk "own" buf? uint dev; uint blockno; struct sleeplock lock; uint refcnt; struct buf \*prev; // LRU cache list struct buf \*next; uchar data[BSIZE]; };
// Linked list of all buffers, through prev/next. // Sorted by how recently the buffer was used. // head.next is most recent, head.prev is least. struct buf head; } bcache;
// Look through buffer cache for block on device dev. // If not found, allocate a buffer. // In either case, return locked buffer. static struct buf\* bget(uint dev, uint blockno) { struct buf \*b;
acquire(&bcache.lock);
// Is the block already cached? for(b = bcache.head.next; b != &bcache.head; b = b->next){ if(b->dev == dev && b->blockno == blockno){ b->refcnt++; release(&bcache.lock); acquiresleep(&b->lock); return b; } }
// Not cached. // Recycle the least recently used (LRU) unused buffer. for(b = bcache.head.prev; b != &bcache.head; b = b->prev){ if(b->refcnt == 0) { b->dev = dev; b->blockno = blockno; b->valid = 0; b->refcnt = 1; release(&bcache.lock); acquiresleep(&b->lock); return b; } } panic("bget: no buffers"); }
// Release a locked buffer. // Move to the head of the most-recently-used list. void brelse(struct buf \*b) { if(!holdingsleep(&b->lock)) panic("brelse");
releasesleep(&b->lock);
acquire(&bcache.lock); b->refcnt--; if (b->refcnt == 0) { // no one is waiting for it. b->next->prev = b->prev; b->prev->next = b->next; b->next = bcache.head.next; b->prev = &bcache.head; bcache.head.next->prev = b; bcache.head.next = b; } release(&bcache.lock); }
// Disk layout: // [ boot block super block log inode blocks // free bit map data blocks] // // mkfs computes the super block and builds an initial file system. The // super block describes the disk layout: struct superblock { uint magic; // Must be FSMAGIC uint size; // Size of file system image (blocks) uint nblocks; // Number of data blocks uint ninodes; // Number of inodes. uint nlog; // Number of log blocks uint logstart; // Block number of first log block uint inodestart; // Block number of first inode block uint bmapstart; // Block number of first free map block };
// Contents of the header block, used for both the on-disk header block // and to keep track in memory of logged block# before commit. struct logheader { int n; int block[LOGSIZE]; };
struct log { struct spinlock lock; int start; int size; int outstanding; // how many FS sys calls are executing. int committing; // in commit(), please wait. int dev; struct logheader lh; };
Log层函数的定义
initlog函数:使用超级块中的字段初始化log。
1 2 3 4 5 6 7 8 9 10 11 12
void initlog(int dev, struct superblock \*sb) { if (sizeof(struct logheader) >= BSIZE) panic("initlog: too big logheader");
static void recover_from_log(void) { read_head(); install_trans(1); // if committed, copy from log to disk log.lh.n = 0; write_head(); // clear the log }
从磁盘的读取log header到内存中,位于log所在磁盘块的第一个磁盘块
1 2 3 4 5 6 7 8 9 10 11 12 13
// Read the log header from disk into the in-memory log header static void read_head(void) { struct buf \*buf = bread(log.dev, log.start); struct logheader \*lh = (struct logheader \*) (buf->data); int i; log.lh.n = lh->n; for (i = 0; i < log.lh.n; i++) { log.lh.block[i] = lh->block[i]; } brelse(buf); }
recovering的作用是啥?
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
// Copy committed blocks from log to their home location static void install_trans(int recovering) { int tail;
acquire(&log.lock); if (log.lh.n >= LOGSIZE log.lh.n >= log.size - 1) panic("too big a transaction"); if (log.outstanding < 1) panic("log_write outside of trans");
for (i = 0; i < log.lh.n; i++) { if (log.lh.block[i] == b->blockno) // log absorption break; } log.lh.block[i] = b->blockno; if (i == log.lh.n) { // Add new block to log? bpin(b); log.lh.n++; } release(&log.lock); }
static void commit() { if (log.lh.n > 0) { write_log(); // Write modified blocks from cache to log write_head(); // Write header to disk -- the real commit install_trans(0); // Now install writes to home locations log.lh.n = 0; write_head(); // Erase the transaction from the log } }
// Copy modified blocks from cache to log. static void write_log(void) { int tail;
// Allocate a zeroed disk block. static uint balloc(uint dev) { int b, bi, m; struct buf \*bp;
bp = 0; for(b = 0; b < sb.size; b += BPB){ bp = bread(dev, BBLOCK(b, sb)); for(bi = 0; bi < BPB && b + bi < sb.size; bi++){ m = 1 << (bi % 8); if((bp->data[bi/8] & m) == 0){ // Is block free? bp->data[bi/8] = m; // Mark block in use. log_write(bp); brelse(bp); bzero(dev, b + bi); return b + bi; } } brelse(bp); } panic("balloc: out of blocks"); }
// Free a disk block. static void bfree(int dev, uint b) { struct buf \*bp; int bi, m;
bp = bread(dev, BBLOCK(b, sb)); bi = b % BPB; m = 1 << (bi % 8); if((bp->data[bi/8] & m) == 0) panic("freeing free block"); bp->data[bi/8] &= ~m; log_write(bp); brelse(bp); }
// 下面的宏定义用于type字段,type = 0表示该dinode尚未分配 #define T_DIR 1 #define T_FILE 2 #define T_DEVICE 3 // On-disk inode structure struct dinode { short type; // File type short major; // Major device number (T_DEVICE only) short minor; // Minor device number (T_DEVICE only) short nlink; // Number of links to inode in file system uint size; // Size of file (bytes) uint addrs[NDIRECT+1]; // Data block addresses };
// in-memory copy of an inode struct inode { uint dev; // Device number uint inum; // Inode number int ref; // Reference count struct sleeplock lock; // protects everything below here int valid; // inode has been read from disk?
short type; // copy of disk inode short major; short minor; short nlink; uint size; uint addrs[NDIRECT+1]; };
4.3 inode层函数定义
iget()函数
遍历itable寻找 i 结点是否已经被加载至内存中,如是则将该 i 结点的ref引用计数加1;在遍历的过程中使用empty对空的 i 结点进行保存;如果 i 结点还未被加载至内存则使用空 i 结点对需要获取的 i 结点进行保存。最后返回对应的 i 结点。
struct { struct spinlock lock; struct inode inode[NINODE]; } itable; // Find the inode with number inum on device dev // and return the in-memory copy. Does not lock // the inode and does not read it from disk. static struct inode\* iget(uint dev, uint inum) { struct inode \*ip, \*empty;
acquire(&itable.lock);
// Is the inode already in the table? empty = 0; for(ip = &itable.inode[0]; ip < &itable.inode[NINODE]; ip++){ if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){ ip->ref++; release(&itable.lock); return ip; } if(empty == 0 && ip->ref == 0) // Remember empty slot. empty = ip; }
// Recycle an inode entry. if(empty == 0) panic("iget: no inodes");
// Increment reference count for ip. // Returns ip to enable ip = idup(ip1) idiom. struct inode\* idup(struct inode \*ip) { acquire(&itable.lock); ip->ref++; release(&itable.lock); return ip; } // Drop a reference to an in-memory inode. // If that was the last reference, the inode table entry can // be recycled. // If that was the last reference and the inode has no links // to it, free the inode (and its content) on disk. // All calls to iput() must be inside a transaction in // case it has to free the inode. void iput(struct inode \*ip) { acquire(&itable.lock);
if(ip->ref == 1 && ip->valid && ip->nlink == 0){ // inode has no links and no other references: truncate and free.
// ip->ref == 1 means no other process can have ip locked, // so this acquiresleep() won't block (or deadlock). acquiresleep(&ip->lock);
// Write data to inode. // Caller must hold ip->lock. // If user_src==1, then src is a user virtual address; // otherwise, src is a kernel address. // Returns the number of bytes successfully written. // If the return value is less than the requested n, // there was an error of some kind. int writei(struct inode \*ip, int user_src, uint64 src, uint off, uint n) { uint tot, m; struct buf \*bp;
if(off > ip->size off + n < off) return -1; if(off + n > MAXFILE\*BSIZE) return -1;
// write the i-node back to disk even if the size didn't change // because the loop above might have called bmap() and added a new // block to ip->addrs[]. iupdate(ip);
return tot; }
int readi(struct inode \*ip, int user_dst, uint64 dst, uint off, uint n) { uint tot, m; struct buf \*bp;
if(off > ip->size off + n < off) return 0; if(off + n > ip->size) n = ip->size - off;
for(tot=0; tot<n; tot+=m, off+=m, dst+=m){ bp = bread(ip->dev, bmap(ip, off/BSIZE)); m = min(n - tot, BSIZE - off%BSIZE); if(either_copyout(user_dst, dst, bp->data + (off % BSIZE), m) == -1) { brelse(bp); tot = -1; break; } brelse(bp); } return tot; }
// Look for a directory entry in a directory. // If found, set \*poff to byte offset of entry. struct inode\* dirlookup(struct inode \*dp, char \*name, uint \*poff) { uint off, inum; struct dirent de;
// Look up and return the inode for a path name. // If parent != 0, return the inode for the parent and copy the final // path element into name, which must have room for DIRSIZ bytes. // Must be called inside a transaction since it calls iput(). static struct inode\* namex(char \*path, int nameiparent, char \*name) { struct inode \*ip, \*next;
if(\*path == '/') ip = iget(ROOTDEV, ROOTINO); else ip = idup(myproc()->cwd);
// Copy the next path element from path into name. // Return a pointer to the element following the copied one. // The returned path has no leading slashes, // so the caller can check \*path=='\\0' to see if the name is the last one. // If no name to remove, return 0. // // Examples: // skipelem("a/bb/c", name) = "bb/c", setting name = "a" // skipelem("///a//bb", name) = "bb", setting name = "a" // skipelem("a", name) = "", setting name = "a" // skipelem("", name) = skipelem("////", name) = 0 // static char\* skipelem(char \*path, char \*name) { char \*s; int len; // 过滤路径字符串开头所有的'/' while(\*path == '/') path++; if(\*path == 0) return 0; s = path; while(\*path != '/' && \*path != 0) path++; len = path - s; if(len >= DIRSIZ) memmove(name, s, DIRSIZ); else { memmove(name, s, len); name[len] = 0; } while(\*path == '/') path++; return path; }
> `dirlink()`函数 > > 在目录 i 结点dp中添加一个新的名字为name的目录项
// Write a new directory entry (name, inum) into the directory dp. int dirlink(struct inode \*dp, char \*name, uint inum) { int off; struct dirent de; struct inode \*ip;
// Check that name is not present. if((ip = dirlookup(dp, name, 0)) != 0){ iput(ip); return -1; }
// Look for an empty dirent. for(off = 0; off < dp->size; off += sizeof(de)){ if(readi(dp, 0, (uint64)&de, off, sizeof(de)) != sizeof(de)) panic("dirlink read"); if(de.inum == 0) break; }
// Write to file f. // addr is a user virtual address. int filewrite(struct file \*f, uint64 addr, int n) { int r, ret = 0;
if(f->writable == 0) return -1;
if(f->type == FD_PIPE){ ret = pipewrite(f->pipe, addr, n); } else if(f->type == FD_DEVICE){ if(f->major < 0 f->major >= NDEV !devsw[f->major].write) return -1; ret = devsw[f->major].write(1, addr, n); } else if(f->type == FD_INODE){ // write a few blocks at a time to avoid exceeding // the maximum log transaction size, including // i-node, indirect block, allocation blocks, // and 2 blocks of slop for non-aligned writes. // this really belongs lower down, since writei() // might be writing a device like the console. int max = ((MAXOPBLOCKS-1-1-2) / 2) \* BSIZE; int i = 0; while(i < n){ int n1 = n - i; if(n1 > max) n1 = max;
begin_op(); ilock(f->ip); if ((r = writei(f->ip, 1, addr + i, f->off, n1)) > 0) f->off += r; iunlock(f->ip); end_op();
if(r != n1){ // error from writei break; } i += r; } ret = (i == n ? n : -1); } else { panic("filewrite"); }
// Create the path new as a link to the same inode as old. uint64 sys_link(void) { char name[DIRSIZ], new[MAXPATH], old[MAXPATH]; struct inode \*dp, \*ip;
// On RISC-V, sync_lock_test_and_set turns into an atomic swap: // a5 = 1 // s1 = &lk->locked // amoswap.w.aq a5, a5, (s1) while(__sync_lock_test_and_set(&lk->locked, 1) != 0) ;
// Tell the C compiler and the processor to not move loads or stores // past this point, to ensure that the critical section's memory // references happen strictly after the lock is acquired. // On RISC-V, this emits a fence instruction. __sync_synchronize();
// Record info about lock acquisition for holding() and debugging. lk->cpu = mycpu(); }
// Tell the C compiler and the CPU to not move loads or stores // past this point, to ensure that all the stores in the critical // section are visible to other CPUs before the lock is released, // and that loads in the critical section occur strictly before // the lock is released. // On RISC-V, this emits a fence instruction. __sync_synchronize();
// Release the lock, equivalent to lk->locked = 0. // This code doesn't use a C assignment, since the C standard // implies that an assignment might be implemented with // multiple store instructions. // On RISC-V, sync_lock_release turns into an atomic swap: // s1 = &lk->locked // amoswap.w zero, zero, (s1) __sync_lock_release(&lk->locked);
// push_off/pop_off are like intr_off()/intr_on() except that they are matched: // it takes two pop_off()s to undo two push_off()s. Also, if interrupts // are initially off, then push_off, pop_off leaves them off.