Sometimes you have to touch the deep and dark
and perhaps dirty ends of something.
            ^ NOT *fantasy*. I'm not going to say that twice.

0x00 QEMU Problems

git clone http://web.mit.edu/ccutler/www/qemu.git -b 6.828-2.3.0
cd qemu
./configure --disable-kvm --disable-werror
vim scripts/texi2pod.pl # on line 320, change `{` to `\{`
make
# To keep your /usr clean, DO NOT run `make install` here.
# Instead, edit the makefile of your lab and add
# QEMU := /path/to/your/build/i386-softmmu/qemu-system-i386
# before the point where it tries to infer the qemu path.

这里出现的问题应该是官方源里的 qemu 内置的 gdb-server 存在问题,导致 attach 的 gdb 无法正确推断指令的长度。使用 MIT 提供的 patch 过后的 qemu 后顺利解决问题。

如果你不介意 /usr 混乱的问题,可以直接 sudo make install 。不过对于我来说,是不可能这么操作的w

直接修改 JOS 的 makefile,在第 50 行推断 QEMU 路径的代码之前强制指定 QEMU 路径即可。

0x01 BIOS Operations

Real Mode

ljmp $0xf000,$0xe05b # located at f000:fff0; far jump to f000:e05b

跳到一个较早的位置。因为从这个地址开始是 hard wired 的,所以只能通过这种方式执行真正的 BIOS

cmpl $0x0,%cs:(0x6ac8)  # at f000:e05b: compare the dword at cs:0x6ac8 with 0
# registers:
#  cs: 0xf000
# gdb command output (cs*16 + 0x6ac8 = 0xf6ac8):
# (gdb) x/1xw 0xf6ac8
# 0x00000000
jne 0xfd2e1 # not taken in this run: the dword read above is 0
xor %dx,%dx # dx = 0
mov %dx,%ss # ss = 0
mov $0x7000,%esp # esp = 0x7000
mov $0xf34c2,%edx # edx = 0xf34c2
jmp 0xfd15c

初始化 BIOS 栈

# f000:d15c
# registers
# eax 0x0
# ecx 0x0
mov %eax,%ecx
cli # cl(ear) i(nterrupt) flag: maskable hardware interrupts are now ignored
cld # cl(ear) d(irection) flag: string instructions now increment si & di
mov $0x8f,%eax # 0x8f = 1000 1111 (bit 7 set, low bits = 0x0f)
out %al,$0x70
# Disables NMI (non-maskable interrupts) and selects CMOS register 0x0f
# Reference: http://wiki.osdev.org/CMOS
in $0x71,%al # read the selected register from CMOS
# register 0x0F is the Shutdown Status Byte (Ref: http://www.walshcomptech.com/ohlandl/config/cmos_ram.html#Hex_00F )
# registers:
# eax 0x0
in $0x92,%al # read System Control Port A
# Ref: http://wiki.osdev.org/Non_Maskable_Interrupt
# registers:
# eax 0x0
or $0x2,%al
out %al,$0x92 # set bit 1 of port 0x92: try the Fast A20 Gate to enable the A20 line
# Ref: http://wiki.osdev.org/A20_Line#Fast_A20_Gate
lidtw %cs:0x6ab8 # load IDTR from the memory operand at %cs*16+0x6ab8
# The operand is 6 bytes: a 2-byte LIMIT followed by the BASE. With the
# 16-bit operand size (w) only the low 24 bits of the base are used.
# Ref:http://faydoc.tripod.com/cpu/lidt.htm
# GDB command:
# (gdb) x/xw $cs*16+0x6ab8
# 0x6abe0000
# => limit 0x0000, low 16 bits of base 0x6abe
#    (x/xw shows only the first 4 of the 6 bytes, so the rest of the base
#    is not visible in this dump)
lgdtw %cs:0x6a74 # same as lidtw but loads GDTR
# (gdb) x/xw $cs*16+0x6a74
# 0x6a800037 => limit 0x0037, low 16 bits of base 0x6a80
mov %cr0,%eax # read control register 0
# Ref:http://wiki.osdev.org/CPU_Registers_x86#Control_Registers
# registers
# eax 0x60000010 => 0110 0000 0000 0000 0000 0000 0001 0000
# i.e. bits 30, 29 and 4: [CACHE DISABLE][NOT WRITE THROUGH][EXTENSION TYPE]
or $0x1,%eax
mov %eax,%cr0 # set bit 0 of CR0: enable protected mode
ljmpl $0x8,$0xfd18f
# The far jump reloads cs; from here on we run 32-bit protected-mode code.
# registers
#  cs 0x8
# eip 0xfd18f

大步进入保护模式

Protected Mode

# Load selector 0x10 into every data-type segment register, then jump to
# the address held in edx.
mov $0x10,%eax
mov %eax,%ds # set data segment
mov %eax,%es # set extra segment
mov %eax,%ss # set stack segment
mov %eax,%fs # set f segment
mov %eax,%gs # set g segment
# registers
# ecx 0x0
# edx 0xf34c2
mov %ecx,%eax
jmp *%edx # edx was loaded with 0xf34c2 before the switch to protected mode

初始化 C 程序环境(各种 segment)

# Reserve stack space, store two arguments at (%esp) and 0x4(%esp), and
# call into the next routine.
# registers
# ebx 0x0
push %ebx
sub $0x2c,%esp # immediate: reserve 0x2c bytes of stack
movl $0xf5b5c,0x4(%esp)
movl $0xf447b,(%esp)
call 0xf099e
....

接下来显然进入了使用 C 语言写的部分。懒得往下 si 了,因为维护栈之类的指令会特别多……

总结一下目前为止这个 BIOS 干了什么吧:

打开 A20 地址总线、初始化中断描述符表和全局描述符表、进入 x86 保护模式……

而且目前为止它还没有初始化 display ……

0x02 The Bootloader

The ELF Header

我们的kernel是由gcc编译为ELF(Executable and Linkable Format)格式的可执行文件。在makefile中,我们将这个可执行文件写入虚拟硬盘的一个固定区域。

注意在这里,我们的虚拟硬盘上是没有文件系统的,所以bootloader无法从文件系统当中读取相对应的内核文件。这是与现代的实现的区别之一。

那么为了正确地将内核载入内存空间,我们的 bootloader 需要能够处理 ELF Header.

// ELF Header
// Ref: http://www.skyfree.org/linux/references/ELF_Format.pdf

#define EI_NIDENT 16

// ELF file header: the fixed-layout record found at the start of an ELF file.
// Names in the comments are the corresponding fields of the ELF specification.
typedef struct {
  uchar identity[EI_NIDENT]; // e_ident: magic, class, data encoding, version, OS ABI
  half type; // e_type: object file type
  half machine; // e_machine: target architecture
  word version; // e_version: object file version
  addr entry; // e_entry: address of the program entry point
  offset prog_header_offset; // e_phoff: file offset of the program header table
  offset sect_header_offset; // e_shoff: file offset of the section header table
  word flags; // e_flags: processor-specific flags
  half size; // e_ehsize: size of this header
  half prog_header_size; // e_phentsize: size of one program header entry
  half prog_header_number; // e_phnum: number of program header entries
  half sect_header_size; // e_shentsize: size of one section header entry
  half sect_header_number; // e_shnum: number of section header entries
  half sect_header_string_index; // e_shstrndx: index of the section-name string table section
} ELFHeader;
Data types:
uchar 1
half 2
word 4
addr=offset 4(32);8(64)
NOTE: Alignment exists.
identity: (starts from 0, nbyte)
0-3 File Identification: First 0x7F, then ASCII 'ELF'
4 File Class: 0=>Invalid 1=>32bit 2=>64bit
5 Data Encoding: 0=>Invalid 1=>Little Endian 2=>Big Endian
6 ELF Version
7 OS ABI: 0 - System V : not used in this lab
8-15 RESERVED (byte 8 is the ABI version in newer specs; the rest is padding)

Program Loading

作为bootloader的主要任务之一,它必须正确地将内核装载到内存当中。

这个过程要求bootloader将内核可执行文件的各个segment拷贝到对应的内存地址当中,并且 jmp 到对应的入口地址处。

入口地址已经在 ELF Header 中指定了,那么我们只需要解决segment的问题就可以了。

ELF Program Header

ELF Header 中的 prog_header_offset, prog_header_size, prog_header_number 部分就是用于指定 Program Header 的位置的。Program Header 是用于告诉 Program Loader 应该如何将程序载入内存的数据结构。

它的位置在 ELF Header + offset 开始,持续 size * number 个字节的区域中。

// ELF Program Header

// One entry of the ELF program header table: describes a segment to load.
typedef struct {
  word type; // segment type
  offset offset; // offset of the segment from the beginning of the file
  addr vaddr; // virtual address at which the segment should be loaded
  addr paddr; // physical address; ignored in System V
  word filesz; // size of the segment in the file
  word memsz; // size of the segment once loaded into memory
  word flags; // segment flags
  word align; // alignment of the segment when loaded into memory
} ELFProgramHeader;

Load the Kernel

我 6.828bootloader 不看type/flags/size/offset 闭眼复制

是bootloader中的豪杰

boot/main.c

#define SECTSIZE 512 // hard-disk sector size in bytes
#define ELFHDR ((struct Elf*) 0x10000) // address the ELF header is loaded to
readseg((uint32_t)ELFHDR,SECTSIZE*8,0); // read 8 sectors' worth from offset 0 of the disk to ELFHDR
if(ELFHDR->e_magic != ELF_MAGIC) goto bad; // boom! not an ELF executable
struct Proghdr *ph,*eph;
ph = (struct Proghdr*)((uint8_t*)ELFHDR + ELFHDR->e_phoff); // first program header: ELFHDR + e_phoff
eph = ph + ELFHDR->e_phnum; // one past the last program header. eph ==> e(nd)ph?
for(;ph<eph;ph++) readseg(ph->p_pa,ph->p_memsz,ph->p_offset); // load each segment: p_memsz bytes from p_offset to address p_pa
((void(*)(void))(ELFHDR->e_entry))(); // call the entry point. KERNEL KICK STARTED.

What’s Everything

首先,BIOS将硬盘的首个sector载入到内存中的0x7c00~0x7dff范围内。然后跳转到0x7c00的位置开始执行。

这里就是我们的bootloader开始的地方了。

首先我们要进入x86保护模式。关闭中断,并且清除direction flag(cld)。设置好段寄存器后,打开A20地址总线,然后设置全局描述符表,打开保护模式开关,然后通过一个ljmp指令进入保护模式。

进入保护模式后,设置保护模式的段寄存器、设置栈寄存器,然后通过call指令进入C代码部分。

之后就是上面说的加载内核的过程了。

The Kernel

Show All Registers

(gdb) info registers
# sample output
eax            0x112800 1124352
ecx            0x0      0
edx            0x9d     157
ebx            0x10094  65684
esp            0x7bec   0x7bec
ebp            0x7bf8   0x7bf8
esi            0x10094  65684
edi            0x0      0
eip            0x10000c 0x10000c
eflags         0x46     [ PF ZF ]
cs             0x8      8
ss             0x10     16
ds             0x10     16
es             0x10     16
fs             0x10     16
gs             0x10     16

好像这并不是全部的寄存器……

怎么办呢?

切换到qemu, 按 Ctrl-a c

(qemu) info registers
# sample output
EAX=00112800 EBX=00010094 ECX=00000000 EDX=0000009d
ESI=00010094 EDI=00000000 EBP=00007bf8 ESP=00007bec
EIP=0010000c EFL=00000046 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
CS =0008 00000000 ffffffff 00cf9a00 DPL=0 CS32 [-R-]
SS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
DS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
FS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
GS =0010 00000000 ffffffff 00cf9300 DPL=0 DS   [-WA]
LDT=0000 00000000 0000ffff 00008200 DPL=0 LDT
TR =0000 00000000 0000ffff 00008b00 DPL=0 TSS32-busy
GDT=     00007c54 00000017
IDT=     00000000 000003ff
CR0=00000011 CR2=00000000 CR3=00000000 CR4=00000000
DR0=00000000 DR1=00000000 DR2=00000000 DR3=00000000 
DR6=ffff0ff0 DR7=00000400
EFER=0000000000000000
FCW=037f FSW=0000 [ST=0] FTW=00 MXCSR=00001f80
FPR0=0000000000000000 0000 FPR1=0000000000000000 0000
FPR2=0000000000000000 0000 FPR3=0000000000000000 0000
FPR4=0000000000000000 0000 FPR5=0000000000000000 0000
FPR6=0000000000000000 0000 FPR7=0000000000000000 0000
XMM00=00000000000000000000000000000000 XMM01=00000000000000000000000000000000
XMM02=00000000000000000000000000000000 XMM03=00000000000000000000000000000000
XMM04=00000000000000000000000000000000 XMM05=00000000000000000000000000000000
XMM06=00000000000000000000000000000000 XMM07=00000000000000000000000000000000

KBJO(笑

Turn On Paging

kern/entry.S

movl $(RELOC(entry_pgdir)), %eax
# Note: the addresses of all symbols are linked into the high address space
# (aka. 0xf*******),
# so the RELOC macro is needed to turn a symbol into the physical address it was loaded at.
# The physical load address is specified in kern/kernel.ld:
# .text : AT(0x100000)
movl %eax,%cr3
# cr3 holds the physical base address of the page directory
movl %cr0,%eax
# read the cr0 control register
# (qemu) info registers: cr0 0x00000011
# means [PE][ET]
orl $(CR0_PE|CR0_PG|CR0_WP), %eax
# set the [PE][PG][WP] flags
movl %eax,%cr0
# Note: eip is still a low address at this point.
# The trick is that the kernel is actually loaded at a low address and
# paging merely maps the high addresses onto the low ones.
# After this instruction, high addresses can be accessed.
# If you run something like
# movl entry_pgdir, %ebx
# and print ebx in gdb, you will find the content is loaded correctly.
mov $relocated, %eax
jmp *%eax
# move eip to the high address. Everything is now settled for paging.

Go C

kern/entry.S

# Prepare the environment for C code.
movl $0x0,%ebp
# Zero the frame pointer register;
# used later by the backtrace part of the lab.
movl $(bootstacktop),%esp
# Point the stack pointer at bootstacktop.
call i386_init
# Enter C code.

lib/printfmt.c, line 207

		// 'o' conversion: print an unsigned value in base 8.
		case 'o':
			putch('0', putdat); // emit a leading '0' to mark the output as octal
			num = getuint(&ap, lflag);
			base = 8;
			goto number;

What does the print interface really look like?

cprintf() at kern/printf.c:26 calls:
vcprintf() at kern/printf.c:17 calls:
vprintfmt() at lib/printfmt.c:83 calls:
putch() at kern/printf.c:10 calls:
cputchar() at kern/console.c:456 calls:
cons_putc() at kern/console.c:433 calls:
serial_putc() at kern/console.c:66,
lpt_putc() at kern/console.c:112,
cga_putc() at kern/console.c:163.

经过一番测试,发现:

serial_putc() ,即输出到serial device的输出,会出现在make qemu的host的stdout和make qemu-nox中。

cga_putc(),即输出到CGA device的输出,会出现在make qemu的emulation screen中。

lpt_putc() 好像哪里都不存在的样子……可能是没有init的原因?

Face the Buffer!

// The cursor has run past the end of the buffer: scroll by one row.
if (crt_pos >= CRT_SIZE) {
	int i;
	// shift every row up by one (CRT_COLS cells), dropping the first row
	memmove(crt_buf, crt_buf + CRT_COLS, (CRT_SIZE - CRT_COLS) * sizeof(uint16_t));
	for (i = CRT_SIZE - CRT_COLS; i < CRT_SIZE; i++)
		crt_buf[i] = 0x0700 | ' '; // fill the last row with blanks (0x0700 | ' ')
	crt_pos -= CRT_COLS; // move the write position back by one row
}

随着输出的不断增长,你迟早会碰到一个问题:

buffer用完了……怎么办?

很显然,这就是用来解决这个问题的。

No I’m not Going to Work on Challenges

找不太到文档。很气。不做了。

The C Calling Stack

当你试图调用一个C函数的时候,以下事情会按顺序发生:

  • 将函数的参数,从右往左依次压栈(那么从顶往底的方向遍历,就是从左往右得到参数了!)
  • assembly call 的行为:将返回地址压栈
  • 编译器维护:将 ebp 压栈,然后将 ebp 置为 esp

当一个C函数返回的时候,以下事情会按顺序发生:

  • 将函数内所有压栈行为弹栈:包括栈上分配的变量和压栈的 ebp
  • 恢复 ebp 的值。
  • assembly ret 的行为:将返回地址弹栈并设置 eip

那么,我们就可以得到 C Call stack frame 的结构了。

STABs

并没有仔细阅读 STABs 的文档和它的结构,不过……

含有行号的 STABs type 是 N_SLINE 。

那么直接暴力调用搜索就好了,搜完后注意检查 lline 和 rline 的关系。

注意行号存放在 n_desc 字段而不是 n_value 字段。

The Full-Score Version Leaves Me Unsatisfied

You call that a BACKTRACE?

No I’m not going to admit it!!

所以最终实现的结果长这样。

不要拿去试 make grade,没分的(笑

kern/kdebug.c : get the line number

	stab_binsearch(stabs,&lline,&rline,N_SLINE,addr);
	if(lline <= rline){
		// found
		info->eip_line = stabs[lline].n_desc;
	}else{
		return -1;
	}

kern/monitor.c : print the backtrace

// get the current EIP
// omit attribute noinline will cause gcc to optimize this function to be inlined.
// then it will lose its function.
/*
 * Return the word stored one slot above the address given by read_ebp(),
 * i.e. the return address of the call to read_eip, which lies inside
 * the caller.
 *
 * noinline is required: if gcc inlined this function, there would be no
 * separate frame and the value read would no longer be the caller's address.
 */
uint32_t __attribute__((noinline)) read_eip(void){
	const uint32_t *frame = (const uint32_t *)read_ebp();
	return frame[1];
}

/*
 * Print one debug-info record in the form
 *   "<prefix> <file>:<line>:<function><sign><offset>"
 * where offset is eip relative to the start of the function.
 *
 * prefix - label printed first (the callers pass "Called" or "From")
 * info   - debug information looked up for eip
 * eip    - the instruction address that info describes
 *
 * eip is unsigned, so the raw difference eip - info.eip_fn_addr can never
 * compare below zero. It is converted to a signed value first, so that an
 * address before the function start prints as "-N" instead of "+-N".
 */
inline void fmtprintDbginfo(const char* prefix,struct Eipdebuginfo info,uint32_t eip){
	int32_t offset = (int32_t)(eip - info.eip_fn_addr);
	cprintf("%s %s:%d:%.*s%s%d", prefix, info.eip_file, info.eip_line,
		info.eip_fn_namelen, info.eip_fn_name,
		offset >= 0 ? "+" : "", offset);
}

int
mon_backtrace(int argc, char **argv, struct Trapframe *tf)
{
	cprintf("Stack backtrace:\n");
	int isinitial=1;
	for(uint32_t *curr_ebp=(uint32_t*)read_ebp(), prev_eip = read_eip();curr_ebp!=0;(prev_eip = *(curr_ebp+1)),(curr_ebp=(uint32_t*)*curr_ebp)){
		// get debug information
		struct Eipdebuginfo dbginfo,previnfo;
		debuginfo_eip(*(curr_ebp+1),&dbginfo);
		debuginfo_eip(prev_eip,&previnfo);
		if(isinitial){
			fmtprintDbginfo("Called",previnfo,prev_eip);
			cprintf("  <- backtracer here\n");
			isinitial = 0;
		}
		// print basics
		cprintf("  with  ebp %p",(uint32_t)curr_ebp);
		cprintf("  eip %p",*(curr_ebp+1));
		if(previnfo.eip_fn_narg != 0){
			cprintf("  args(");
			for(int i=0;i<previnfo.eip_fn_narg;i++) cprintf("0x%x%s",*(curr_ebp+2+i),i==previnfo.eip_fn_narg-1?")":", ");
		}else{
			cprintf("  void");
		}
		// print advanced
		cprintf("\n");
		fmtprintDbginfo("From",dbginfo,*(curr_ebp+1));
		cprintf("\n");
	}
	return 0;
}

效果图:

backtrace