diff --git a/docs/catfish-proposal.md b/docs/catfish-proposal.md
new file mode 100644
index 0000000..d738e4a
--- /dev/null
+++ b/docs/catfish-proposal.md
@@ -0,0 +1,72 @@
+# RustOS Multicore Port and Thread-Level Label Management on the PARD Framework: Design Proposal
+
+2015011251 王纪霆
+
+## Goals
+
++ Bring up multicore (SMP) support for RustOS on riscv32
++ Get RustOS running on the PARD RISC-V hardware platform of ICT, Chinese Academy of Sciences
++ Enable SMP for RustOS on PARD
++ Add control functionality so that RustOS can read and write PARD's registers
+
+If all of the above had to be implemented, it is foreseeable that the work could not be finished within eight weeks.
+
+## Background
+
+[LvNA/PARD](https://github.com/LvNA-system/labeled-RISC-V) is a hardware system for memory isolation. It is built on [rocket-chip](https://github.com/freechipsproject/rocket-chip): beyond an ordinary multicore design, it adds registers and extra modules between the cores and the storage devices, and attaches labels to bus accesses. Combined, these allow bus accesses to be controlled, e.g. limiting the cache capacity and disk bandwidth available to each core.
+
+So far, however, this work still has some problems. First, the control plane, the key to controlling traffic, is not exposed to the cores. It has to be reached through the hardware JTAG mechanism from the on-board control module prm (which contains a Linux system), and the control scripts have to be implemented on the prm. The prm, in turn, cannot talk to the cores directly. As a result, an OS running on a core can neither modify the registers nor even find out how many resources it has been allocated; it is completely isolated from the PARD system.
+
+Consequently the system can only isolate cores from one another, not processes: to distinguish processes and give two processes different labels, the OS must be able to actively modify and configure the control plane.
+
+Solving this problem and implementing process-level label management is the main goal of this project.
+
+## Design
+
+In fact the two directions of this project, the multicore port and the PARD port, can be completed independently, since as a fallback we could settle for single-core thread-level label management on PARD. The ultimate goal, however, is for RustOS to take over all of PARD's cores and run as a multicore OS that manages the whole system.
+
+The ideal approach would be for the hardware to memory-map the control plane as a device visible to each core. But the hardware is very complex, and the ICT side has not implemented this feature. So the hardware is left unmodified: the control plane is still managed through the prm, and the prm is connected to the cores via a serial port (or similar), acting as a device of the cores and modifying the low-level configuration under their control.
+
+Concretely, this requires:
+
++ making RustOS run on the cores;
++ writing scripts on the prm (a full Linux environment with ready-made examples) to control the control plane;
++ writing a driver in RustOS so that it can interact with the prm to carry out that control;
++ adding system calls and the like to RustOS so that the operating system can manage the driver.
+
+On the other side, the multicore port requires:
+
++ bringing up multiple cores, implementing atomic operations, and adding locks for potential races in the kernel (Rust already does a lot of this for us);
++ inter-core process scheduling and load balancing.
+
+The multicore side is intended to stay simple and basically follow uCore+ on rv64, since the main design and optimization work should be left to wrj.
+
+## Current Progress
+
+PARD side:
+
++ finished investigating how the mechanism is implemented
++ reproducing the build on a server (the project is too large to build with Vivado locally)
+
+Multicore side:
+
++ finished an initial study of uCore+ SMP
++ studied bbl and brought up multicore RustOS on rv32
+
+## Tentative Plan
+
+If the work must end after the first eight weeks, it probably cannot all be finished.
+
+Depending on whether the work can continue after the first eight weeks, the short-term and long-term plans are:
+
+### Short term
+
++ finish the SMP port of RustOS on rv32 (weeks 5-6)
++ get multicore RustOS running on PARD (weeks 7-8)
+
+### Long term
+
++ build the interaction protocol between RustOS and PARD (weeks 9-11)
++ implement process-level label management on top of RustOS (after week 12)
+
+All of the above assumes that I am working alone; if others assist or join, the plan will be adjusted accordingly.
\ No newline at end of file
diff --git a/docs/catfish-proposal.pptx b/docs/catfish-proposal.pptx
new file mode 100644
index 0000000..e79c504
Binary files /dev/null and b/docs/catfish-proposal.pptx differ
diff --git a/kernel/Makefile b/kernel/Makefile
index 919ab75..a32456b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,11 +12,13 @@
 # d = int | in_asm | ...
QEMU debug info
 # mode = debug | release
 # LOG = off | error | warn | info | debug | trace
+# smp SMP core number
 # board Only available on riscv32, build without bbl, run on board

 arch ?= riscv32
 mode ?= debug
 LOG ?= debug
+smp ?= 4

 target := $(arch)-blog_os
 kernel := target/$(target)/$(mode)/ucore
@@ -31,12 +33,12 @@ ifeq ($(arch), x86_64)
 qemu_opts := \
 	-drive format=raw,file=$(bootimage) \
 	-drive format=raw,file=$(SFSIMG),media=disk,cache=writeback \
-	-smp 4 \
+	-smp $(smp) \
 	-serial mon:stdio \
 	-device isa-debug-exit
 endif
 ifeq ($(arch), riscv32)
-qemu_opts := -machine virt -kernel $(bin) -nographic -smp 4
+qemu_opts := -machine virt -kernel $(bin) -nographic -smp cpus=$(smp)
 endif

 ifdef board
@@ -104,9 +106,12 @@ endif
 asm:
 	@$(objdump) -dS $(kernel) | less

-elf-h:
+header:
 	@$(objdump) -h $(kernel)

+sym:
+	@$(objdump) -t $(kernel) | less
+
 $(bin): kernel
 ifdef board
 	@cp $(kernel) $@
@@ -128,7 +133,7 @@ kernel:
 ifeq ($(arch), x86_64)
 	@bootimage build $(build_args)
 else
-	@cargo xbuild $(build_args)
+	@CC=$(cc) cargo xbuild $(build_args)
 endif

 # make user.o from binary files
diff --git a/kernel/build.rs b/kernel/build.rs
index af15f66..ec94d4e 100644
--- a/kernel/build.rs
+++ b/kernel/build.rs
@@ -12,6 +12,13 @@ fn main() {
         //     .compile("cobj");
         gen_vector_asm().unwrap();
     }
+    if std::env::var("TARGET").unwrap().find("riscv32").is_some() {
+        cc::Build::new()
+            .file("src/arch/riscv32/compiler_rt.c")
+            .flag("-march=rv32ia")
+            .flag("-mabi=ilp32")
+            .compile("atomic_rt");
+    }
 }

 fn gen_vector_asm() -> Result<()> {
diff --git a/kernel/src/arch/riscv32/boot/entry.asm b/kernel/src/arch/riscv32/boot/entry.asm
index 8bb9268..0485f0e 100644
--- a/kernel/src/arch/riscv32/boot/entry.asm
+++ b/kernel/src/arch/riscv32/boot/entry.asm
@@ -1,14 +1,19 @@
 .section .text.entry
 .globl _start
 _start:
-    lui sp, %hi(bootstacktop)
-    addi sp, sp, %lo(bootstacktop)
+    add t0, a0, 1
+    slli t0, t0, 16
+
+    lui sp, %hi(bootstack)
+    addi sp, sp, %lo(bootstack)
+    add sp, sp, t0
+
     call rust_main

 .section .bss
     .align 12  #PGSHIFT
     .global bootstack
 bootstack:
-    .space 4096 * 16 #KSTACKSIZE
+    .space 4096 * 16 * 8
     .global bootstacktop
 bootstacktop:
diff --git a/kernel/src/arch/riscv32/compiler_rt.c b/kernel/src/arch/riscv32/compiler_rt.c
new file mode 100644
index 0000000..c2f2fb8
--- /dev/null
+++ b/kernel/src/arch/riscv32/compiler_rt.c
@@ -0,0 +1,49 @@
+// http://llvm.org/docs/Atomics.html#libcalls-atomic
+
+char __atomic_load_1(char *src) {
+    char res = 0;
+    __asm__ __volatile__("amoadd.w.rl %0, zero, (%1)" : "=r"(res) : "r"(src) : "memory");
+    return res;
+}
+
+short __atomic_load_2(short *src) {
+    short res = 0;
+    __asm__ __volatile__("amoadd.w.rl %0, zero, (%1)" : "=r"(res) : "r"(src) : "memory");
+    return res;
+}
+
+int __atomic_load_4(int *src) {
+    int res = 0;
+    __asm__ __volatile__("amoadd.w.rl %0, zero, (%1)" : "=r"(res) : "r"(src) : "memory");
+    return res;
+}
+
+void __atomic_store_1(char *dst, char val) {
+    __asm__ __volatile__("amoswap.w.aq zero, %0, (%1)" :: "r"(val), "r"(dst) : "memory");
+}
+
+void __atomic_store_4(int *dst, int val) {
+    __asm__ __volatile__("amoswap.w.aq zero, %0, (%1)" :: "r"(val), "r"(dst) : "memory");
+}
+
+char __atomic_compare_exchange_1(char* dst, char* expected, char desired) {
+    char val = 0;
+    __asm__ __volatile__("lr.w %0, (%1)" : "=r"(val) : "r"(dst) : "memory");
+    if (val == *expected) {
+        int sc_ret = 0;
+        __asm__ __volatile__("sc.w %0, %1, (%2)" : "=r"(sc_ret) : "r"(desired), "r"(dst) : "memory");
+        return sc_ret == 0;
+    }
+    return 0;
+}
+
+char
__atomic_compare_exchange_4(int* dst, int* expected, int desired) { + int val = 0; + __asm__ __volatile__("lr.w %0, (%1)" : "=r"(val) : "r"(dst) : "memory"); + if (val == *expected) { + int sc_ret = 0; + __asm__ __volatile__("sc.w %0, %1, (%2)" : "=r"(sc_ret) : "r"(desired), "r"(dst) : "memory"); + return sc_ret == 0; + } + return 0; +} \ No newline at end of file diff --git a/kernel/src/arch/riscv32/compiler_rt.rs b/kernel/src/arch/riscv32/compiler_rt.rs index 80dbdf8..e8fff84 100644 --- a/kernel/src/arch/riscv32/compiler_rt.rs +++ b/kernel/src/arch/riscv32/compiler_rt.rs @@ -23,50 +23,3 @@ pub extern fn __mulsi3(mut a: u32, mut b: u32) -> u32 { pub extern fn abort() { loop {} } - -use core::ptr::{read, write}; - -#[no_mangle] -pub unsafe extern fn __atomic_load_1(src: *const u8) -> u8 { - read(src) -} - -#[no_mangle] -pub unsafe extern fn __atomic_load_2(src: *const u16) -> u16 { - read(src) -} - -#[no_mangle] -pub unsafe extern fn __atomic_load_4(src: *const u32) -> u32 { - read(src) -} - -#[no_mangle] -pub unsafe extern fn __atomic_store_1(dst: *mut u8, val: u8) { - write(dst, val) -} - -#[no_mangle] -pub unsafe extern fn __atomic_store_4(dst: *mut u32, val: u32) { - write(dst, val) -} - -unsafe fn __atomic_compare_exchange(dst: *mut T, expected: *mut T, desired: T) -> bool { - use super::interrupt; - let flags = interrupt::disable_and_store(); - let val = read(dst); - let success = val == read(expected); - write(dst, if success {desired} else {val}); - interrupt::restore(flags); - success -} - -#[no_mangle] -pub unsafe extern fn __atomic_compare_exchange_1(dst: *mut u8, expected: *mut u8, desired: u8) -> bool { - __atomic_compare_exchange(dst, expected, desired) -} - -#[no_mangle] -pub unsafe extern fn __atomic_compare_exchange_4(dst: *mut u32, expected: *mut u32, desired: u32) -> bool { - __atomic_compare_exchange(dst, expected, desired) -} \ No newline at end of file diff --git a/kernel/src/arch/riscv32/consts.rs b/kernel/src/arch/riscv32/consts.rs new file mode 100644 index 0000000..f2adbcb --- /dev/null +++ b/kernel/src/arch/riscv32/consts.rs @@ -0,0 +1,14 @@ +// Physical address available on THINPAD: +// [0x80000000, 0x80800000] +const P2_SIZE: usize = 1 << 22; +const P2_MASK: usize = 0x3ff << 22; +pub const RECURSIVE_PAGE_PML4: usize = 0x3fe; +pub const KERNEL_OFFSET: usize = 0; +pub const KERNEL_PML4: usize = 0x8000_0000 >> 22; +pub const KERNEL_HEAP_OFFSET: usize = 0x8020_0000; +pub const KERNEL_HEAP_SIZE: usize = 0x0020_0000; +pub const MEMORY_OFFSET: usize = 0x8000_0000; +pub const MEMORY_END: usize = 0x8080_0000; +pub const USER_STACK_OFFSET: usize = 0x70000000; +pub const USER_STACK_SIZE: usize = 0x10000; +pub const USER32_STACK_OFFSET: usize = USER_STACK_OFFSET; \ No newline at end of file diff --git a/kernel/src/arch/riscv32/memory.rs b/kernel/src/arch/riscv32/memory.rs index 0f9cfb6..f9b5597 100644 --- a/kernel/src/arch/riscv32/memory.rs +++ b/kernel/src/arch/riscv32/memory.rs @@ -3,6 +3,8 @@ use memory::{active_table, FRAME_ALLOCATOR, init_heap, MemoryArea, MemoryAttr, M use super::riscv::{addr::*, register::sstatus}; use ucore_memory::PAGE_SIZE; +// static mut KERNEL_MS: Option = None; + pub fn init() { #[repr(align(4096))] struct PageData([u8; PAGE_SIZE]); diff --git a/kernel/src/arch/riscv32/mod.rs b/kernel/src/arch/riscv32/mod.rs index 13d9ad3..aadbdaf 100644 --- a/kernel/src/arch/riscv32/mod.rs +++ b/kernel/src/arch/riscv32/mod.rs @@ -7,14 +7,34 @@ pub mod timer; pub mod paging; pub mod memory; pub mod compiler_rt; +pub mod consts; +pub mod smp; + +use 
self::smp::*;
+
+fn others_main(hartid: usize) -> ! {
+    println!("hart {} is booting", hartid);
+    loop { }
+}

 #[no_mangle]
-pub extern fn rust_main() -> ! {
-    println!("Hello RISCV! {}", 123);
+pub extern fn rust_main(hartid: usize, dtb: usize, hart_mask: usize) -> ! {
+    unsafe { set_cpu_id(hartid); }
+
+    if hartid != 0 {
+        while unsafe { !has_started(hartid) } { }
+        others_main(hartid);
+        // others_main should not return
+    }
+
+    println!("Hello RISCV! in hart {}, {}, {}", hartid, dtb, hart_mask);
+
     ::logging::init();
     interrupt::init();
     memory::init();
     timer::init();
+
+    unsafe { start_others(hart_mask); }
     ::kmain();
 }
diff --git a/kernel/src/arch/riscv32/smp.rs b/kernel/src/arch/riscv32/smp.rs
new file mode 100644
index 0000000..aab6e4e
--- /dev/null
+++ b/kernel/src/arch/riscv32/smp.rs
@@ -0,0 +1,31 @@
+use consts::MAX_CPU_NUM;
+use core::ptr::{read_volatile, write_volatile};
+use memory::*;
+
+static mut STARTED: [bool; MAX_CPU_NUM] = [false; MAX_CPU_NUM];
+
+pub unsafe fn set_cpu_id(cpu_id: usize) {
+    unsafe {
+        asm!("mv tp, $0" : : "r"(cpu_id));
+    }
+}
+
+pub unsafe fn get_cpu_id() -> usize {
+    let mut cpu_id = 0;
+    unsafe {
+        asm!("mv $0, tp" : "=r"(cpu_id));
+    }
+    cpu_id
+}
+
+pub unsafe fn has_started(cpu_id: usize) -> bool {
+    read_volatile(&STARTED[cpu_id])
+}
+
+pub unsafe fn start_others(hart_mask: usize) {
+    for cpu_id in 0..MAX_CPU_NUM {
+        if (hart_mask >> cpu_id) & 1 != 0 {
+            write_volatile(&mut STARTED[cpu_id], true);
+        }
+    }
+}
\ No newline at end of file
diff --git a/kernel/src/arch/x86_64/consts.rs b/kernel/src/arch/x86_64/consts.rs
new file mode 100644
index 0000000..c2426c5
--- /dev/null
+++ b/kernel/src/arch/x86_64/consts.rs
@@ -0,0 +1,97 @@
+// Copy from Redox consts.rs:
+
+// Because the memory map is so important to not be aliased, it is defined here, in one place
+// The lower 256 PML4 entries are reserved for userspace
+// Each PML4 entry references up to 512 GB of memory
+// The top (511) PML4 is reserved for recursive mapping
+// The second from the top (510) PML4 is reserved for the kernel
+/// The size of a single PML4
+pub const PML4_SIZE: usize = 0x0000_0080_0000_0000;
+pub const PML4_MASK: usize = 0x0000_ff80_0000_0000;
+
+/// Offset of recursive paging
+pub const RECURSIVE_PAGE_OFFSET: usize = (-(PML4_SIZE as isize)) as usize;
+pub const RECURSIVE_PAGE_PML4: usize = (RECURSIVE_PAGE_OFFSET & PML4_MASK) / PML4_SIZE;
+
+/// Offset of kernel
+pub const KERNEL_OFFSET: usize = RECURSIVE_PAGE_OFFSET - PML4_SIZE;
+pub const KERNEL_PML4: usize = (KERNEL_OFFSET & PML4_MASK) / PML4_SIZE;
+
+pub const KERNEL_SIZE: usize = PML4_SIZE;
+
+/// Offset to kernel heap
+pub const KERNEL_HEAP_OFFSET: usize = KERNEL_OFFSET - PML4_SIZE;
+pub const KERNEL_HEAP_PML4: usize = (KERNEL_HEAP_OFFSET & PML4_MASK) / PML4_SIZE;
+/// Size of kernel heap
+pub const KERNEL_HEAP_SIZE: usize = 8 * 1024 * 1024; // 8 MB
+
+pub const MEMORY_OFFSET: usize = 0;
+
+/// Offset to kernel percpu variables
+//TODO: Use 64-bit fs offset to enable this pub const KERNEL_PERCPU_OFFSET: usize = KERNEL_HEAP_OFFSET - PML4_SIZE;
+pub const KERNEL_PERCPU_OFFSET: usize = 0xC000_0000;
+/// Size of kernel percpu variables
+pub const KERNEL_PERCPU_SIZE: usize = 64 * 1024; // 64 KB
+
+/// Offset to user image
+pub const USER_OFFSET: usize = 0;
+pub const USER_PML4: usize = (USER_OFFSET & PML4_MASK) / PML4_SIZE;
+
+/// Offset to user TCB
+pub const USER_TCB_OFFSET: usize = 0xB000_0000;
+
+/// Offset to user arguments
+pub const USER_ARG_OFFSET: usize = USER_OFFSET + PML4_SIZE / 2;
+
+/// Offset to user heap
+pub
const USER_HEAP_OFFSET: usize = USER_OFFSET + PML4_SIZE; +pub const USER_HEAP_PML4: usize = (USER_HEAP_OFFSET & PML4_MASK) / PML4_SIZE; + +/// Offset to user grants +pub const USER_GRANT_OFFSET: usize = USER_HEAP_OFFSET + PML4_SIZE; +pub const USER_GRANT_PML4: usize = (USER_GRANT_OFFSET & PML4_MASK) / PML4_SIZE; + +/// Offset to user stack +pub const USER_STACK_OFFSET: usize = USER_GRANT_OFFSET + PML4_SIZE; +pub const USER32_STACK_OFFSET: usize = 0xB000_0000; +pub const USER_STACK_PML4: usize = (USER_STACK_OFFSET & PML4_MASK) / PML4_SIZE; +/// Size of user stack +pub const USER_STACK_SIZE: usize = 1024 * 1024; // 1 MB + +/// Offset to user sigstack +pub const USER_SIGSTACK_OFFSET: usize = USER_STACK_OFFSET + PML4_SIZE; +pub const USER_SIGSTACK_PML4: usize = (USER_SIGSTACK_OFFSET & PML4_MASK) / PML4_SIZE; +/// Size of user sigstack +pub const USER_SIGSTACK_SIZE: usize = 256 * 1024; // 256 KB + +/// Offset to user TLS +pub const USER_TLS_OFFSET: usize = USER_SIGSTACK_OFFSET + PML4_SIZE; +pub const USER_TLS_PML4: usize = (USER_TLS_OFFSET & PML4_MASK) / PML4_SIZE; + +/// Offset to user temporary image (used when cloning) +pub const USER_TMP_OFFSET: usize = USER_TLS_OFFSET + PML4_SIZE; +pub const USER_TMP_PML4: usize = (USER_TMP_OFFSET & PML4_MASK) / PML4_SIZE; + +/// Offset to user temporary heap (used when cloning) +pub const USER_TMP_HEAP_OFFSET: usize = USER_TMP_OFFSET + PML4_SIZE; +pub const USER_TMP_HEAP_PML4: usize = (USER_TMP_HEAP_OFFSET & PML4_MASK) / PML4_SIZE; + +/// Offset to user temporary page for grants +pub const USER_TMP_GRANT_OFFSET: usize = USER_TMP_HEAP_OFFSET + PML4_SIZE; +pub const USER_TMP_GRANT_PML4: usize = (USER_TMP_GRANT_OFFSET & PML4_MASK) / PML4_SIZE; + +/// Offset to user temporary stack (used when cloning) +pub const USER_TMP_STACK_OFFSET: usize = USER_TMP_GRANT_OFFSET + PML4_SIZE; +pub const USER_TMP_STACK_PML4: usize = (USER_TMP_STACK_OFFSET & PML4_MASK) / PML4_SIZE; + +/// Offset to user temporary sigstack (used when cloning) +pub const USER_TMP_SIGSTACK_OFFSET: usize = USER_TMP_STACK_OFFSET + PML4_SIZE; +pub const USER_TMP_SIGSTACK_PML4: usize = (USER_TMP_SIGSTACK_OFFSET & PML4_MASK) / PML4_SIZE; + +/// Offset to user temporary tls (used when cloning) +pub const USER_TMP_TLS_OFFSET: usize = USER_TMP_SIGSTACK_OFFSET + PML4_SIZE; +pub const USER_TMP_TLS_PML4: usize = (USER_TMP_TLS_OFFSET & PML4_MASK) / PML4_SIZE; + +/// Offset for usage in other temporary pages +pub const USER_TMP_MISC_OFFSET: usize = USER_TMP_TLS_OFFSET + PML4_SIZE; +pub const USER_TMP_MISC_PML4: usize = (USER_TMP_MISC_OFFSET & PML4_MASK) / PML4_SIZE; \ No newline at end of file diff --git a/kernel/src/arch/x86_64/driver/vga.rs b/kernel/src/arch/x86_64/driver/vga.rs index 7ef8720..a5b2feb 100644 --- a/kernel/src/arch/x86_64/driver/vga.rs +++ b/kernel/src/arch/x86_64/driver/vga.rs @@ -1,7 +1,7 @@ use consts::KERNEL_OFFSET; use core::ptr::Unique; use core::fmt; -use spin::Mutex; +use sync::SpinLock as Mutex; use volatile::Volatile; use x86_64::instructions::port::Port; use logging::Color; diff --git a/kernel/src/arch/x86_64/mod.rs b/kernel/src/arch/x86_64/mod.rs index 64b66af..db30cf9 100644 --- a/kernel/src/arch/x86_64/mod.rs +++ b/kernel/src/arch/x86_64/mod.rs @@ -14,6 +14,7 @@ pub mod gdt; pub mod idt; pub mod memory; pub mod io; +pub mod consts; static AP_CAN_INIT: AtomicBool = ATOMIC_BOOL_INIT; diff --git a/kernel/src/consts.rs b/kernel/src/consts.rs index 4dddae0..ea22abd 100644 --- a/kernel/src/consts.rs +++ b/kernel/src/consts.rs @@ -1,128 +1,6 @@ #![allow(dead_code)] -#[cfg(target_arch = 
"riscv32")] -pub use self::riscv::*; -#[cfg(target_arch = "x86_64")] -pub use self::x86_64::*; +pub use arch::consts::*; pub const MAX_CPU_NUM: usize = 8; pub const MAX_PROCESS_NUM: usize = 48; - -#[cfg(target_arch = "riscv32")] -mod riscv { - // Physical address available on THINPAD: - // [0x80000000, 0x80800000] - const P2_SIZE: usize = 1 << 22; - const P2_MASK: usize = 0x3ff << 22; - pub const RECURSIVE_PAGE_PML4: usize = 0x3fe; - pub const KERNEL_OFFSET: usize = 0; - pub const KERNEL_PML4: usize = 0x8000_0000 >> 22; - pub const KERNEL_HEAP_OFFSET: usize = 0x8020_0000; - pub const KERNEL_HEAP_SIZE: usize = 0x0020_0000; - pub const MEMORY_OFFSET: usize = 0x8000_0000; - pub const MEMORY_END: usize = 0x8080_0000; - pub const USER_STACK_OFFSET: usize = 0x70000000; - pub const USER_STACK_SIZE: usize = 0x10000; - pub const USER32_STACK_OFFSET: usize = USER_STACK_OFFSET; -} - -#[cfg(target_arch = "x86_64")] -mod x86_64 { - // Copy from Redox consts.rs: - - // Because the memory map is so important to not be aliased, it is defined here, in one place - // The lower 256 PML4 entries are reserved for userspace - // Each PML4 entry references up to 512 GB of memory - // The top (511) PML4 is reserved for recursive mapping - // The second from the top (510) PML4 is reserved for the kernel - /// The size of a single PML4 - pub const PML4_SIZE: usize = 0x0000_0080_0000_0000; - pub const PML4_MASK: usize = 0x0000_ff80_0000_0000; - - /// Offset of recursive paging - pub const RECURSIVE_PAGE_OFFSET: usize = (-(PML4_SIZE as isize)) as usize; - pub const RECURSIVE_PAGE_PML4: usize = (RECURSIVE_PAGE_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset of kernel - pub const KERNEL_OFFSET: usize = RECURSIVE_PAGE_OFFSET - PML4_SIZE; - pub const KERNEL_PML4: usize = (KERNEL_OFFSET & PML4_MASK) / PML4_SIZE; - - pub const KERNEL_SIZE: usize = PML4_SIZE; - - /// Offset to kernel heap - pub const KERNEL_HEAP_OFFSET: usize = KERNEL_OFFSET - PML4_SIZE; - pub const KERNEL_HEAP_PML4: usize = (KERNEL_HEAP_OFFSET & PML4_MASK) / PML4_SIZE; - /// Size of kernel heap - pub const KERNEL_HEAP_SIZE: usize = 8 * 1024 * 1024; // 8 MB - - pub const MEMORY_OFFSET: usize = 0; - - /// Offset to kernel percpu variables - //TODO: Use 64-bit fs offset to enable this pub const KERNEL_PERCPU_OFFSET: usize = KERNEL_HEAP_OFFSET - PML4_SIZE; - pub const KERNEL_PERCPU_OFFSET: usize = 0xC000_0000; - /// Size of kernel percpu variables - pub const KERNEL_PERCPU_SIZE: usize = 64 * 1024; // 64 KB - - /// Offset to user image - pub const USER_OFFSET: usize = 0; - pub const USER_PML4: usize = (USER_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset to user TCB - pub const USER_TCB_OFFSET: usize = 0xB000_0000; - - /// Offset to user arguments - pub const USER_ARG_OFFSET: usize = USER_OFFSET + PML4_SIZE / 2; - - /// Offset to user heap - pub const USER_HEAP_OFFSET: usize = USER_OFFSET + PML4_SIZE; - pub const USER_HEAP_PML4: usize = (USER_HEAP_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset to user grants - pub const USER_GRANT_OFFSET: usize = USER_HEAP_OFFSET + PML4_SIZE; - pub const USER_GRANT_PML4: usize = (USER_GRANT_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset to user stack - pub const USER_STACK_OFFSET: usize = USER_GRANT_OFFSET + PML4_SIZE; - pub const USER32_STACK_OFFSET: usize = 0xB000_0000; - pub const USER_STACK_PML4: usize = (USER_STACK_OFFSET & PML4_MASK) / PML4_SIZE; - /// Size of user stack - pub const USER_STACK_SIZE: usize = 1024 * 1024; // 1 MB - - /// Offset to user sigstack - pub const USER_SIGSTACK_OFFSET: usize = USER_STACK_OFFSET + 
PML4_SIZE; - pub const USER_SIGSTACK_PML4: usize = (USER_SIGSTACK_OFFSET & PML4_MASK) / PML4_SIZE; - /// Size of user sigstack - pub const USER_SIGSTACK_SIZE: usize = 256 * 1024; // 256 KB - - /// Offset to user TLS - pub const USER_TLS_OFFSET: usize = USER_SIGSTACK_OFFSET + PML4_SIZE; - pub const USER_TLS_PML4: usize = (USER_TLS_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset to user temporary image (used when cloning) - pub const USER_TMP_OFFSET: usize = USER_TLS_OFFSET + PML4_SIZE; - pub const USER_TMP_PML4: usize = (USER_TMP_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset to user temporary heap (used when cloning) - pub const USER_TMP_HEAP_OFFSET: usize = USER_TMP_OFFSET + PML4_SIZE; - pub const USER_TMP_HEAP_PML4: usize = (USER_TMP_HEAP_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset to user temporary page for grants - pub const USER_TMP_GRANT_OFFSET: usize = USER_TMP_HEAP_OFFSET + PML4_SIZE; - pub const USER_TMP_GRANT_PML4: usize = (USER_TMP_GRANT_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset to user temporary stack (used when cloning) - pub const USER_TMP_STACK_OFFSET: usize = USER_TMP_GRANT_OFFSET + PML4_SIZE; - pub const USER_TMP_STACK_PML4: usize = (USER_TMP_STACK_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset to user temporary sigstack (used when cloning) - pub const USER_TMP_SIGSTACK_OFFSET: usize = USER_TMP_STACK_OFFSET + PML4_SIZE; - pub const USER_TMP_SIGSTACK_PML4: usize = (USER_TMP_SIGSTACK_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset to user temporary tls (used when cloning) - pub const USER_TMP_TLS_OFFSET: usize = USER_TMP_SIGSTACK_OFFSET + PML4_SIZE; - pub const USER_TMP_TLS_PML4: usize = (USER_TMP_TLS_OFFSET & PML4_MASK) / PML4_SIZE; - - /// Offset for usage in other temporary pages - pub const USER_TMP_MISC_OFFSET: usize = USER_TMP_TLS_OFFSET + PML4_SIZE; - pub const USER_TMP_MISC_PML4: usize = (USER_TMP_MISC_OFFSET & PML4_MASK) / PML4_SIZE; -} \ No newline at end of file diff --git a/kernel/src/fs.rs b/kernel/src/fs.rs index fe20c6c..6c9c958 100644 --- a/kernel/src/fs.rs +++ b/kernel/src/fs.rs @@ -2,7 +2,7 @@ use simple_filesystem::*; use alloc::boxed::Box; #[cfg(target_arch = "x86_64")] use arch::driver::ide; -use spin::Mutex; +use sync::SpinLock as Mutex; // Hard link user program #[cfg(target_arch = "riscv32")] diff --git a/kernel/src/logging.rs b/kernel/src/logging.rs index a007cb4..c1f2241 100644 --- a/kernel/src/logging.rs +++ b/kernel/src/logging.rs @@ -1,5 +1,10 @@ use core::fmt; use log::{self, Level, LevelFilter, Log, Metadata, Record}; +use sync::SpinLock as Mutex; + +lazy_static! { + static ref log_mutex: Mutex<()> = Mutex::new(()); +} pub fn init() { static LOGGER: SimpleLogger = SimpleLogger; @@ -38,11 +43,13 @@ macro_rules! with_color { fn print_in_color(args: fmt::Arguments, color: Color) { use arch::io; + let mutex = log_mutex.lock(); io::putfmt(with_color!(args, color)); } pub fn print(args: fmt::Arguments) { use arch::io; + let mutex = log_mutex.lock(); io::putfmt(args); } diff --git a/kernel/src/memory.rs b/kernel/src/memory.rs index 0f203f3..dec17fb 100644 --- a/kernel/src/memory.rs +++ b/kernel/src/memory.rs @@ -1,7 +1,8 @@ pub use arch::paging::*; use bit_allocator::{BitAlloc, BitAlloc4K, BitAlloc64K}; use consts::MEMORY_OFFSET; -use spin::{Mutex, MutexGuard}; +use sync::{MutexGuard, Spin}; +use sync::SpinLock as Mutex; use super::HEAP_ALLOCATOR; use ucore_memory::{*, paging::PageTable}; use ucore_memory::cow::CowExt; @@ -48,7 +49,7 @@ lazy_static! 
{
 }

 /// The only way to get active page table
-pub fn active_table() -> MutexGuard<'static, CowExt> {
+pub fn active_table() -> MutexGuard<'static, CowExt, Spin> {
     ACTIVE_TABLE.lock()
 }

diff --git a/kernel/src/smp.rs b/kernel/src/smp.rs
new file mode 100644
index 0000000..3439d17
--- /dev/null
+++ b/kernel/src/smp.rs
@@ -0,0 +1,4 @@
+pub struct cpu {
+    pub id: usize
+}
+
diff --git a/kernel/src/sync/arch/riscv32/atomic_lock.rs b/kernel/src/sync/arch/riscv32/atomic_lock.rs
new file mode 100644
index 0000000..ee19bfa
--- /dev/null
+++ b/kernel/src/sync/arch/riscv32/atomic_lock.rs
@@ -0,0 +1,44 @@
+//! RISCV atomic is not currently supported by Rust.
+//! This is an ugly workaround.
+
+use core::cell::UnsafeCell;
+
+extern {
+    fn __atomic_load_4(src: *const u32) -> u32;
+    fn __atomic_store_4(dst: *mut u32, val: u32);
+    fn __atomic_compare_exchange_4(dst: *mut u32, expected: *mut u32, desired: u32) -> bool;
+}
+
+pub struct AtomicLock
+{
+    lock: UnsafeCell<u32>
+}
+
+impl AtomicLock
+{
+    pub fn new() -> Self {
+        AtomicLock {
+            lock: UnsafeCell::new(0)
+        }
+    }
+
+    /// Returns 1 if lock is acquired
+    pub fn try_lock(&self) -> bool {
+        let mut expected: u32 = 0;
+        unsafe {
+            __atomic_compare_exchange_4(self.lock.get(), &mut expected as *mut u32, 1)
+        }
+    }
+
+    pub fn load(&self) -> bool {
+        unsafe {
+            __atomic_load_4(self.lock.get()) == 1
+        }
+    }
+
+    pub fn store(&self) {
+        unsafe {
+            __atomic_store_4(self.lock.get(), 0);
+        }
+    }
+}
diff --git a/kernel/src/sync/arch/x86_64/atomic_lock.rs b/kernel/src/sync/arch/x86_64/atomic_lock.rs
new file mode 100644
index 0000000..06550c8
--- /dev/null
+++ b/kernel/src/sync/arch/x86_64/atomic_lock.rs
@@ -0,0 +1,31 @@
+use core::sync::atomic::{AtomicBool, Ordering};
+
+pub struct AtomicLock
+{
+    lock: AtomicBool
+}
+
+impl AtomicLock
+{
+    pub fn new() -> AtomicLock {
+        AtomicLock {
+            lock: AtomicBool::new(false)
+        }
+    }
+
+    pub fn try_lock(&self) -> bool {
+        self.lock.compare_and_swap(false, true, Ordering::Acquire) == false
+    }
+
+    pub fn load(&self) -> bool {
+        self.lock.load(Ordering::Relaxed)
+    }
+
+    pub fn store(&self) {
+        self.lock.store(false, Ordering::Release);
+    }
+}
+
+pub const ATOMIC_LOCK_INIT: AtomicLock = AtomicLock {
+    lock: AtomicBool::new(false)
+};
\ No newline at end of file
diff --git a/kernel/src/sync/mod.rs b/kernel/src/sync/mod.rs
index cfa5071..c2bae78 100644
--- a/kernel/src/sync/mod.rs
+++ b/kernel/src/sync/mod.rs
@@ -53,6 +53,15 @@ pub use self::condvar::*;
 pub use self::mutex::*;
 pub use self::semaphore::*;

+#[allow(dead_code)]
+#[cfg(target_arch = "x86_64")]
+#[path = "arch/x86_64/atomic_lock.rs"]
+pub mod atomic_lock;
+
+#[cfg(target_arch = "riscv32")]
+#[path = "arch/riscv32/atomic_lock.rs"]
+pub mod atomic_lock;
+
 mod mutex;
 mod condvar;
 mod semaphore;
diff --git a/kernel/src/sync/mutex.rs b/kernel/src/sync/mutex.rs
index c8e62ea..d542e76 100644
--- a/kernel/src/sync/mutex.rs
+++ b/kernel/src/sync/mutex.rs
@@ -30,8 +30,8 @@ use arch::interrupt;
 use core::cell::UnsafeCell;
 use core::fmt;
 use core::ops::{Deref, DerefMut};
-use core::sync::atomic::{ATOMIC_BOOL_INIT, AtomicBool, Ordering};
 use super::Condvar;
+use super::atomic_lock::AtomicLock;

 pub type SpinLock<T> = Mutex<T, Spin>;
 pub type SpinNoIrqLock<T> = Mutex<T, SpinNoIrq>;
@@ -39,7 +39,7 @@ pub type ThreadLock<T> = Mutex<T, Condvar>;

 pub struct Mutex<T: ?Sized, S: MutexSupport>
 {
-    lock: AtomicBool,
+    lock: AtomicLock,
     support: S,
     data: UnsafeCell<T>,
 }
@@ -78,7 +78,7 @@ impl<T, S: MutexSupport> Mutex<T, S>
     /// ```
     pub fn new(user_data: T) -> Mutex<T, S> {
         Mutex {
-            lock: ATOMIC_BOOL_INIT,
+            lock: AtomicLock::new(),
             data: UnsafeCell::new(user_data),
             support: S::new(),
         }
@@ -96,9 +96,9 @@ impl<T, S: MutexSupport> Mutex<T, S>
 impl<T: ?Sized, S: MutexSupport> Mutex<T, S>
 {
     fn obtain_lock(&self) {
-        while self.lock.compare_and_swap(false, true, Ordering::Acquire) != false {
+        while !self.lock.try_lock() {
             // Wait until the lock looks unlocked before retrying
-            while self.lock.load(Ordering::Relaxed) {
+            while self.lock.load() {
                 self.support.cpu_relax();
             }
         }
@@ -137,14 +137,14 @@ impl<T: ?Sized, S: MutexSupport> Mutex<T, S>
     ///
     /// If the lock isn't held, this is a no-op.
     pub unsafe fn force_unlock(&self) {
-        self.lock.store(false, Ordering::Release);
+        self.lock.store();
     }

     /// Tries to lock the mutex. If it is already locked, it will return None. Otherwise it returns
     /// a guard within Some.
     pub fn try_lock(&self) -> Option<MutexGuard<T, S>> {
         let support_guard = S::before_lock();
-        if self.lock.compare_and_swap(false, true, Ordering::Acquire) == false {
+        if self.lock.try_lock() {
             Some(MutexGuard {
                 mutex: self,
                 support_guard,
@@ -186,7 +186,7 @@ impl<'a, T: ?Sized, S: MutexSupport> Drop for MutexGuard<'a, T, S>
 {
     /// The dropping of the MutexGuard will release the lock it was created from.
     fn drop(&mut self) {
-        self.mutex.lock.store(false, Ordering::Release);
+        self.mutex.lock.store();
         self.mutex.support.after_unlock();
     }
 }
diff --git a/riscv-pk/bbl/bbl.c b/riscv-pk/bbl/bbl.c
index 1b96a9d..dbb9277 100644
--- a/riscv-pk/bbl/bbl.c
+++ b/riscv-pk/bbl/bbl.c
@@ -48,7 +48,7 @@ void boot_other_hart(uintptr_t unused __attribute__((unused)))
     }
   }

-  enter_supervisor_mode(entry, hartid, dtb_output());
+  enter_supervisor_mode(entry, hartid, dtb_output(), ~disabled_hart_mask & hart_mask);
 }

 void boot_loader(uintptr_t dtb)
diff --git a/riscv-pk/configure b/riscv-pk/configure
index 49dddf3..a41f908 100755
--- a/riscv-pk/configure
+++ b/riscv-pk/configure
@@ -4084,8 +4084,8 @@ fi
 case "${BUILD_32BIT}" in
   yes|default)
     echo "Building 32-bit pk"
-    CFLAGS="$default_CFLAGS -march=rv32i -mabi=ilp32"
-    LDFLAGS="-march=rv32i -mabi=ilp32"
+    CFLAGS="$default_CFLAGS -march=rv32iac -mabi=ilp32"
+    LDFLAGS="-march=rv32iac -mabi=ilp32"
     install_subdir="riscv32-unknown-elf"
     ;;
   *)
diff --git a/riscv-pk/configure.ac b/riscv-pk/configure.ac
index 107a3f2..20cd6d1 100644
--- a/riscv-pk/configure.ac
+++ b/riscv-pk/configure.ac
@@ -88,8 +88,8 @@ AC_ARG_ENABLE([32bit],
 case "${BUILD_32BIT}" in
   yes|default)
     echo "Building 32-bit pk"
-    CFLAGS="$default_CFLAGS -march=rv32i -mabi=ilp32"
-    LDFLAGS="-march=rv32i -mabi=ilp32"
+    CFLAGS="$default_CFLAGS -march=rv32iac -mabi=ilp32"
+    LDFLAGS="-march=rv32iac -mabi=ilp32"
     install_subdir="riscv32-unknown-elf"
     ;;
   *)
diff --git a/riscv-pk/machine/minit.c b/riscv-pk/machine/minit.c
index c3fce3d..ca6f43b 100644
--- a/riscv-pk/machine/minit.c
+++ b/riscv-pk/machine/minit.c
@@ -172,7 +172,7 @@ void init_other_hart(uintptr_t hartid, uintptr_t dtb)
   boot_other_hart(dtb);
 }

-void enter_supervisor_mode(void (*fn)(uintptr_t), uintptr_t arg0, uintptr_t arg1)
+void enter_supervisor_mode(void (*fn)(uintptr_t), uintptr_t arg0, uintptr_t arg1, uintptr_t arg2)
 {
   // Set up a PMP to permit access to all of memory.
   // Ignore the illegal-instruction trap if PMPs aren't supported.
@@ -194,6 +194,7 @@ void enter_supervisor_mode(void (*fn)(uintptr_t), uintptr_t arg0, uintptr_t arg1 register uintptr_t a0 asm ("a0") = arg0; register uintptr_t a1 asm ("a1") = arg1; - asm volatile ("mret" : : "r" (a0), "r" (a1)); + register uintptr_t a2 asm ("a2") = arg2; + asm volatile ("mret" : : "r" (a0), "r" (a1), "r" (a2)); __builtin_unreachable(); } diff --git a/riscv-pk/machine/mtrap.h b/riscv-pk/machine/mtrap.h index eafdb14..b439088 100644 --- a/riscv-pk/machine/mtrap.h +++ b/riscv-pk/machine/mtrap.h @@ -63,7 +63,7 @@ void putstring(const char* s); #define assert(x) ({ if (!(x)) die("assertion failed: %s", #x); }) #define die(str, ...) ({ printm("%s:%d: " str "\n", __FILE__, __LINE__, ##__VA_ARGS__); poweroff(-1); }) -void enter_supervisor_mode(void (*fn)(uintptr_t), uintptr_t arg0, uintptr_t arg1) +void enter_supervisor_mode(void (*fn)(uintptr_t), uintptr_t arg0, uintptr_t arg1, uintptr_t arg2) __attribute__((noreturn)); void boot_loader(uintptr_t dtb); void boot_other_hart(uintptr_t dtb);
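
For illustration only, here is a minimal sketch (not part of the patch) of how the `AtomicLock`-backed `SpinLock` introduced above could be exercised once several harts are running. It assumes the kernel crate's `sync::SpinLock` alias and the `lazy_static!`/`println!` macros that appear elsewhere in this diff; the `BOOT_REPORT` static and `report_hart` helper are hypothetical names used only for demonstration.

```rust
// Illustrative sketch, not part of this patch.
// Assumes the kernel's sync::SpinLock (Mutex<T, Spin>) plus lazy_static!
// and println!; BOOT_REPORT and report_hart are hypothetical.
use alloc::vec::Vec;
use sync::SpinLock as Mutex;

lazy_static! {
    /// Shared state that every hart touches while booting.
    static ref BOOT_REPORT: Mutex<Vec<usize>> = Mutex::new(Vec::new());
}

/// Call once per hart after it leaves the has_started() spin loop.
pub fn report_hart(hartid: usize) {
    // lock() spins via AtomicLock::try_lock(); on riscv32 that bottoms out
    // in the lr.w/sc.w sequence supplied by compiler_rt.c.
    let mut report = BOOT_REPORT.lock();
    report.push(hartid);
    println!("hart {} online, {} harts up so far", hartid, report.len());
}
```

The point of the sketch is the division of labor: the generic `Mutex` only ever calls `try_lock`/`load`/`store`, and the per-arch `AtomicLock` decides whether those map to `core::sync::atomic` (x86_64) or to the hand-written libcalls (riscv32).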