diff --git a/os/src/syscall/mod.rs b/os/src/syscall/mod.rs
index 4683d055..84ff73fe 100644
--- a/os/src/syscall/mod.rs
+++ b/os/src/syscall/mod.rs
@@ -11,12 +11,17 @@ const SYSCALL_GETPID: usize = 172;
 const SYSCALL_FORK: usize = 220;
 const SYSCALL_EXEC: usize = 221;
 const SYSCALL_WAITPID: usize = 260;
+const SYSCALL_THREAD_CREATE: usize = 1000;
+const SYSCALL_GETTID: usize = 1001;
+const SYSCALL_WAITTID: usize = 1002;
 
 mod fs;
 mod process;
+mod thread;
 
 use fs::*;
 use process::*;
+use thread::*;
 
 pub fn syscall(syscall_id: usize, args: [usize; 3]) -> isize {
     match syscall_id {
@@ -33,6 +38,9 @@ pub fn syscall(syscall_id: usize, args: [usize; 3]) -> isize {
         SYSCALL_FORK => sys_fork(),
         SYSCALL_EXEC => sys_exec(args[0] as *const u8, args[1] as *const usize),
         SYSCALL_WAITPID => sys_waitpid(args[0] as isize, args[1] as *mut i32),
+        SYSCALL_THREAD_CREATE => sys_thread_create(args[0]),
+        SYSCALL_GETTID => sys_gettid(),
+        SYSCALL_WAITTID => sys_waittid(args[0]) as isize,
         _ => panic!("Unsupported syscall_id: {}", syscall_id),
     }
 }
diff --git a/os/src/syscall/thread.rs b/os/src/syscall/thread.rs
new file mode 100644
index 00000000..d9e42ba1
--- /dev/null
+++ b/os/src/syscall/thread.rs
@@ -0,0 +1,70 @@
+use alloc::sync::Arc;
+use crate::{mm::kernel_token, task::{TaskControlBlock, add_task, current_task}, trap::{TrapContext, trap_handler}};
+
+/// Create a new thread in the calling thread's process and return its tid.
+/// `entry` is used as the entry address of the thread's initial trap context.
+pub fn sys_thread_create(entry: usize) -> isize {
+    let creator = current_task().unwrap();
+    let owner = creator.process.upgrade().unwrap();
+    // the new thread is built with the creator's user stack base
+    let ustack_base = creator.inner_exclusive_access().res.as_ref().unwrap().ustack_base;
+    let thread = Arc::new(TaskControlBlock::new(Arc::clone(&owner), ustack_base, true));
+    // hand the thread to the scheduler
+    add_task(Arc::clone(&thread));
+    let thread_inner = thread.inner_exclusive_access();
+    let res = thread_inner.res.as_ref().unwrap();
+    let tid = res.tid;
+    // register the thread in slot `tid` of the process's thread table,
+    // padding the table with empty slots if it is too short
+    let mut owner_inner = owner.inner_exclusive_access();
+    while owner_inner.tasks.len() <= tid {
+        owner_inner.tasks.push(None);
+    }
+    owner_inner.tasks[tid] = Some(Arc::clone(&thread));
+    // fill in the trap context the thread starts from
+    let trap_cx = thread_inner.get_trap_cx();
+    *trap_cx = TrapContext::app_init_context(
+        entry,
+        res.ustack_top(),
+        kernel_token(),
+        thread.kstack.get_top(),
+        trap_handler as usize,
+    );
+    tid as isize
+}
+
+pub fn sys_gettid() -> isize {
+    current_task().unwrap().inner_exclusive_access().res.as_ref().unwrap().tid as isize
+}
+
+/// Collect the exit code of thread `tid` of the current process (non-blocking).
+///
+/// Returns -1 if `tid` is the caller itself, is outside the thread table, or
+/// names an empty slot; -2 if the thread has not exited yet; otherwise the
+/// thread's exit code, after clearing its slot in the thread table.
+pub fn sys_waittid(tid: usize) -> i32 {
+    let task = current_task().unwrap();
+    let process = task.process.upgrade().unwrap();
+    let task_inner = task.inner_exclusive_access();
+    let mut process_inner = process.inner_exclusive_access();
+    // a thread cannot wait for itself
+    if task_inner.res.as_ref().unwrap().tid == tid {
+        return -1;
+    }
+    // `tid` comes straight from user space: use checked access so that an
+    // out-of-range value yields -1 instead of panicking the kernel
+    let waited_task = match process_inner.tasks.get(tid) {
+        Some(Some(waited_task)) => waited_task,
+        _ => return -1, // no such slot, or the slot is empty
+    };
+    let exit_code = waited_task.inner_exclusive_access().exit_code;
+    match exit_code {
+        Some(exit_code) => {
+            // clear the slot so the exited thread can be deallocated
+            process_inner.tasks[tid] = None;
+            exit_code
+        }
+        // waited thread has not exited
+        None => -2,
+    }
+}
diff --git a/os/src/task/id.rs b/os/src/task/id.rs
index 3795a871..f94488c8 100644
--- a/os/src/task/id.rs
+++ b/os/src/task/id.rs
@@ -1,4 +1,4 @@
-use alloc::{vec::Vec, sync::Arc};
+use alloc::{vec::Vec, sync::{Arc, Weak}};
 use lazy_static::*;
 use crate::sync::UPSafeCell;
 use crate::mm::{KERNEL_SPACE, MapPermission, PhysPageNum, VirtAddr};
@@ -107,8 +107,7 @@ impl KernelStack {
 pub struct TaskUserRes {
     pub tid: usize,
     pub ustack_base: usize,
-    pub kstack: KernelStack,
-    pub process: Arc<ProcessControlBlock>,
+    pub process: Weak<ProcessControlBlock>,
 }
 
 fn trap_cx_bottom_from_tid(tid: usize) -> usize {
@@ -126,12 +125,10 @@ impl TaskUserRes {
         alloc_user_res: bool,
     ) -> Self {
         let tid = process.inner_exclusive_access().alloc_tid();
-        let kstack = kstack_alloc();
         let task_user_res = Self {
             tid,
             ustack_base,
-            kstack,
-            process: Arc::clone(&process),
+            process: Arc::downgrade(&process),
         };
         if alloc_user_res {
             task_user_res.alloc_user_res();
@@ -140,11 +137,12 @@ impl TaskUserRes {
     }
 
     pub fn alloc_user_res(&self) {
-        let mut process = self.process.inner_exclusive_access();
+        let process = self.process.upgrade().unwrap();
+        let mut process_inner = process.inner_exclusive_access();
         // alloc user stack
         let ustack_bottom = ustack_bottom_from_tid(self.ustack_base, self.tid);
         let ustack_top = ustack_bottom + USER_STACK_SIZE;
-        process
+        process_inner
             .memory_set
             .insert_framed_area(
                 ustack_bottom.into(),
@@ -154,7 +152,7 @@ impl TaskUserRes {
         // alloc trap_cx
         let trap_cx_bottom = trap_cx_bottom_from_tid(self.tid);
         let trap_cx_top = trap_cx_bottom + PAGE_SIZE;
-        process
+        process_inner
             .memory_set
             .insert_framed_area(
                 trap_cx_bottom.into(),
@@ -165,23 +163,30 @@ impl TaskUserRes {
 
     fn dealloc_user_res(&self) {
         // dealloc tid
-        let mut process = self.process.inner_exclusive_access();
+        let process = self.process.upgrade().unwrap();
+        let mut process_inner = process.inner_exclusive_access();
         // dealloc ustack manually
         let ustack_bottom_va: VirtAddr = ustack_bottom_from_tid(self.ustack_base, self.tid).into();
-        process.memory_set.remove_area_with_start_vpn(ustack_bottom_va.into());
+        process_inner.memory_set.remove_area_with_start_vpn(ustack_bottom_va.into());
         // dealloc trap_cx manually
         let trap_cx_bottom_va: VirtAddr = trap_cx_bottom_from_tid(self.tid).into();
-        process.memory_set.remove_area_with_start_vpn(trap_cx_bottom_va.into());
+        process_inner.memory_set.remove_area_with_start_vpn(trap_cx_bottom_va.into());
     }
 
     #[allow(unused)]
     pub fn alloc_tid(&mut self) {
-        self.tid = self.process.inner_exclusive_access().alloc_tid();
+        self.tid = self
+            .process
+            .upgrade()
+            .unwrap()
+            .inner_exclusive_access()
+            .alloc_tid();
     }
 
     pub fn dealloc_tid(&self) {
-        let mut process = self.process.inner_exclusive_access();
-        process.dealloc_tid(self.tid);
+        let process = self.process.upgrade().unwrap();
+        let mut process_inner = process.inner_exclusive_access();
+        process_inner.dealloc_tid(self.tid);
     }
 
     pub fn trap_cx_user_va(&self) -> usize {
@@ -189,26 +194,22 @@ impl TaskUserRes {
     }
 
     pub fn trap_cx_ppn(&self) -> PhysPageNum {
-        let process = self.process.inner_exclusive_access();
+        let process = self.process.upgrade().unwrap();
+        let process_inner = process.inner_exclusive_access();
         let trap_cx_bottom_va: VirtAddr = trap_cx_bottom_from_tid(self.tid).into();
-        process.memory_set.translate(trap_cx_bottom_va.into()).unwrap().ppn()
+        process_inner.memory_set.translate(trap_cx_bottom_va.into()).unwrap().ppn()
     }
 
     pub fn ustack_base(&self) -> usize { self.ustack_base }
     pub fn ustack_top(&self) -> usize {
         ustack_bottom_from_tid(self.ustack_base, self.tid) + USER_STACK_SIZE 
     }
-
-    pub fn kstack_top(&self) -> usize {
-        self.kstack.get_top()
-    }
 }
 
 impl Drop for TaskUserRes {
     fn drop(&mut self) {
         self.dealloc_tid();
         self.dealloc_user_res();
-        // kstack can also be deallocated automatically 
     }
 }
 
diff --git a/os/src/task/mod.rs b/os/src/task/mod.rs
index c83af4c0..75754012 100644
--- a/os/src/task/mod.rs
+++ b/os/src/task/mod.rs
@@ -8,7 +8,6 @@ mod process;
 
 use crate::fs::{open_file, OpenFlags};
 use switch::__switch;
-use task::{TaskControlBlock, TaskStatus};
 use alloc::sync::Arc;
 use manager::fetch_task;
 use lazy_static::*;
@@ -26,6 +25,7 @@ pub use processor::{
     take_current_task,
     schedule,
 };
+pub use task::{TaskControlBlock, TaskStatus};
 pub use manager::add_task;
 pub use id::{
     PidHandle,
@@ -53,16 +53,21 @@ pub fn suspend_current_and_run_next() {
 }
 
 pub fn exit_current_and_run_next(exit_code: i32) {
-    // take from Processor
     let task = take_current_task().unwrap();
-    task.inner_exclusive_access().exit_code = exit_code;
-    let tid = task.inner_exclusive_access().res.tid;
-    // remove thread 
+    let mut task_inner = task.inner_exclusive_access();
     let process = task.process.upgrade().unwrap();
-    let mut process_inner = process.inner_exclusive_access();
-    process_inner.tasks.drain(tid..tid + 1);
-    // if this is the main thread of the process, then we need terminate this process
+    let tid = task_inner.res.as_ref().unwrap().tid;
+    // record exit code
+    task_inner.exit_code = Some(exit_code);
+    task_inner.res = None;
+    // here we do not remove the thread since we are still using the kstack
+    // it will be deallocated when sys_waittid is called
+    drop(task_inner);
+    drop(task);
+    // however, if this is the main thread of current process
+    // the process should terminate at once
     if tid == 0 {
+        let mut process_inner = process.inner_exclusive_access();
         // mark this process as a zombie process
         process_inner.is_zombie = true;
         // record exit code of main process
@@ -77,14 +82,20 @@ pub fn exit_current_and_run_next(exit_code: i32) {
             }
         }
 
+        // deallocate user res (including tid/trap_cx/ustack) of all threads
+        // it has to be done before we dealloc the whole memory_set
+        // otherwise they will be deallocated twice
+        for task in process_inner.tasks.iter().filter(|t| t.is_some()) {
+            let task = task.as_ref().unwrap();
+            let mut task_inner = task.inner_exclusive_access();
+            task_inner.res = None;
+        }
+
         process_inner.children.clear();
-        // deallocate user space as soon as possible
+        // deallocate other data in user space i.e. program code/data section
         process_inner.memory_set.recycle_data_pages();
     }
-    // maintain rc of process manually since we will break this context soon
-    drop(process_inner);
     drop(process);
-    drop(task);
     // we do not have to save task context
     let mut _unused = TaskContext::zero_init();
     schedule(&mut _unused as *mut _);
diff --git a/os/src/task/process.rs b/os/src/task/process.rs
index 370d8037..9c8d53bd 100644
--- a/os/src/task/process.rs
+++ b/os/src/task/process.rs
@@ -106,8 +106,8 @@ impl ProcessControlBlock {
         // prepare trap_cx of main thread
         let task_inner = task.inner_exclusive_access();
         let trap_cx = task_inner.get_trap_cx();
-        let ustack_top = task_inner.res.ustack_top();
-        let kstack_top = task_inner.res.kstack_top();
+        let ustack_top = task_inner.res.as_ref().unwrap().ustack_top();
+        let kstack_top = task.kstack.get_top();
         drop(task_inner);
         *trap_cx = TrapContext::app_init_context(
             entry_point,
@@ -137,11 +137,11 @@ impl ProcessControlBlock {
         // since memory_set has been changed
         let task = self.inner_exclusive_access().get_task(0);
         let mut task_inner = task.inner_exclusive_access();
-        task_inner.res.ustack_base = ustack_base;
-        task_inner.res.alloc_user_res();
-        task_inner.trap_cx_ppn = task_inner.res.trap_cx_ppn();
+        task_inner.res.as_mut().unwrap().ustack_base = ustack_base;
+        task_inner.res.as_mut().unwrap().alloc_user_res();
+        task_inner.trap_cx_ppn = task_inner.res.as_mut().unwrap().trap_cx_ppn();
         // push arguments on user stack
-        let mut user_sp = task_inner.res.ustack_top();
+        let mut user_sp = task_inner.res.as_mut().unwrap().ustack_top();
         user_sp -= (args.len() + 1) * core::mem::size_of::<usize>();
         let argv_base = user_sp;
         let mut argv: Vec<_> = (0..=args.len())
@@ -170,7 +170,7 @@ impl ProcessControlBlock {
             entry_point,
             user_sp,
             KERNEL_SPACE.exclusive_access().token(),
-            task_inner.res.kstack_top(),
+            task.kstack.get_top(),
             trap_handler as usize,
         );
         trap_cx.x[10] = args.len();
@@ -214,7 +214,7 @@ impl ProcessControlBlock {
         // create main thread of child process
         let task = Arc::new(TaskControlBlock::new(
             Arc::clone(&child),
-            parent.get_task(0).inner_exclusive_access().res.ustack_base(),
+            parent.get_task(0).inner_exclusive_access().res.as_ref().unwrap().ustack_base(),
             // here we do not allocate trap_cx or ustack again
             // but mention that we allocate a new kstack here
             false,
@@ -226,7 +226,7 @@ impl ProcessControlBlock {
         // modify kstack_top in trap_cx of this thread
         let task_inner = task.inner_exclusive_access();
         let trap_cx = task_inner.get_trap_cx();
-        trap_cx.kernel_sp = task_inner.res.kstack_top();
+        trap_cx.kernel_sp = task.kstack.get_top();
         drop(task_inner);
         // add this thread to scheduler
         add_task(task);
diff --git a/os/src/task/processor.rs b/os/src/task/processor.rs
index 7ebfb08d..cb83319a 100644
--- a/os/src/task/processor.rs
+++ b/os/src/task/processor.rs
@@ -84,11 +84,20 @@ pub fn current_trap_cx() -> &'static mut TrapContext {
 }
 
 pub fn current_trap_cx_user_va() -> usize {
-    current_task().unwrap().inner_exclusive_access().res.trap_cx_user_va()
+    current_task()
+        .unwrap()
+        .inner_exclusive_access()
+        .res
+        .as_ref()
+        .unwrap()
+        .trap_cx_user_va()
 }
 
 pub fn current_kstack_top() -> usize {
-    current_task().unwrap().inner_exclusive_access().res.kstack_top()
+    current_task()
+        .unwrap()
+        .kstack
+        .get_top()
 }
 
 pub fn schedule(switched_task_cx_ptr: *mut TaskContext) {
diff --git a/os/src/task/task.rs b/os/src/task/task.rs
index 996e4115..c5465066 100644
--- a/os/src/task/task.rs
+++ b/os/src/task/task.rs
@@ -2,15 +2,13 @@ use alloc::sync::{Arc, Weak};
 use crate::{mm::PhysPageNum, sync::UPSafeCell};
 use crate::trap::TrapContext;
 use super::id::TaskUserRes;
-use super::{
-    ProcessControlBlock,
-    TaskContext
-};
+use super::{KernelStack, ProcessControlBlock, TaskContext, kstack_alloc};
 use core::cell::RefMut;
 
 pub struct TaskControlBlock {
     // immutable
     pub process: Weak<ProcessControlBlock>,
+    pub kstack: KernelStack,
     // mutable
     inner: UPSafeCell<TaskControlBlockInner>,
 }
@@ -25,15 +23,14 @@ impl TaskControlBlock {
         let inner = process.inner_exclusive_access();
         inner.memory_set.token()
     }
-
 }
 
 pub struct TaskControlBlockInner {
-    pub res: TaskUserRes,
+    pub res: Option<TaskUserRes>,
     pub trap_cx_ppn: PhysPageNum,
     pub task_cx: TaskContext,
     pub task_status: TaskStatus,
-    pub exit_code: i32,
+    pub exit_code: Option<i32>,
 }
 
 impl TaskControlBlockInner {
@@ -55,16 +52,18 @@ impl TaskControlBlock {
     ) -> Self {
         let res = TaskUserRes::new(Arc::clone(&process), ustack_base, alloc_user_res);
         let trap_cx_ppn = res.trap_cx_ppn();
-        let kstack_top = res.kstack_top();
+        let kstack = kstack_alloc();
+        let kstack_top = kstack.get_top();
         Self {
             process: Arc::downgrade(&process),
+            kstack,
             inner: unsafe { UPSafeCell::new(
                 TaskControlBlockInner {
-                    res,
+                    res: Some(res),
                     trap_cx_ppn,
                     task_cx: TaskContext::goto_trap_return(kstack_top),
                     task_status: TaskStatus::Ready,
-                    exit_code: 0,
+                    exit_code: None,
                 }
             )},
         }
diff --git a/user/src/bin/initproc.rs b/user/src/bin/initproc.rs
index 0889a2f0..cf8840f7 100644
--- a/user/src/bin/initproc.rs
+++ b/user/src/bin/initproc.rs
@@ -13,7 +13,6 @@ use user_lib::{
 
 #[no_mangle]
 fn main() -> i32 {
-    println!("start initproc!");
     if fork() == 0 {
         exec("user_shell\0", &[0 as *const u8]);
     } else {
diff --git a/user/src/bin/race_adder.rs b/user/src/bin/race_adder.rs
new file mode 100644
index 00000000..44d653fd
--- /dev/null
+++ b/user/src/bin/race_adder.rs
@@ -0,0 +1,36 @@
+#![no_std]
+#![no_main]
+
+#[macro_use]
+extern crate user_lib;
+extern crate alloc;
+
+use user_lib::{exit, thread_create, waittid};
+use alloc::vec::Vec;
+
+static mut A: usize = 0;
+const PER_THREAD: usize = 10000000;
+const THREAD_COUNT: usize = 50;
+
+/// Thread body: `PER_THREAD` unsynchronized read-modify-write bumps of `A`, then `exit(0)`.
+unsafe fn f() -> ! {
+    let a = core::ptr::addr_of_mut!(A); // raw pointer: never forms an aliased `&mut A`
+    for _ in 0..PER_THREAD {
+        a.write_volatile(a.read_volatile() + 1);
+    }
+    exit(0)
+}
+
+/// Start `THREAD_COUNT` threads running `f`, wait for each, then check and print the total.
+#[no_mangle]
+pub fn main() -> i32 {
+    // create every thread before waiting on any of them
+    let tids: Vec<usize> = (0..THREAD_COUNT).map(|_| thread_create(f as usize) as usize).collect();
+    for &tid in &tids {
+        waittid(tid);
+    }
+    let total = unsafe { A };
+    assert_eq!(total, PER_THREAD * THREAD_COUNT);
+    println!("total = {}", total);
+    0
+}
diff --git a/user/src/bin/threads.rs b/user/src/bin/threads.rs
new file mode 100644
index 00000000..7641dde5
--- /dev/null
+++ b/user/src/bin/threads.rs
@@ -0,0 +1,38 @@
+#![no_std]
+#![no_main]
+
+#[macro_use]
+extern crate user_lib;
+extern crate alloc;
+
+use user_lib::{thread_create, waittid, exit};
+use alloc::vec::Vec;
+
+pub fn thread_a() -> ! {
+    for _ in 0..1000 { print!("a"); }
+    exit(1)
+}
+
+pub fn thread_b() -> ! {
+    for _ in 0..1000 { print!("b"); }
+    exit(2) 
+}
+
+pub fn thread_c() -> ! {
+    for _ in 0..1000 { print!("c"); }
+    exit(3)
+}
+
+/// Run `thread_a`, `thread_b` and `thread_c`, then report each exit code in creation order.
+#[no_mangle]
+pub fn main() -> i32 {
+    let entries = [thread_a as usize, thread_b as usize, thread_c as usize];
+    // create all threads before waiting on any of them
+    let tids: Vec<isize> = entries.iter().map(|&entry| thread_create(entry)).collect();
+    for &tid in &tids {
+        let exit_code = waittid(tid as usize);
+        println!("thread#{} exited with code {}", tid, exit_code);
+    }
+    println!("main thread exited.");
+    0
+}
diff --git a/user/src/lib.rs b/user/src/lib.rs
index f13a3673..b6b3e5ac 100644
--- a/user/src/lib.rs
+++ b/user/src/lib.rs
@@ -106,3 +106,14 @@ pub fn sleep(period_ms: usize) {
         sys_yield();
     }
 }
+
+pub fn thread_create(entry: usize) -> isize { sys_thread_create(entry) }
+pub fn gettid() -> isize { sys_gettid() }
+/// Poll `sys_waittid(tid)`, yielding the CPU while it reports -2; return the first other value.
+pub fn waittid(tid: usize) -> isize {
+    loop {
+        let ret = sys_waittid(tid);
+        if ret != -2 { return ret; }
+        yield_();
+    }
+}
diff --git a/user/src/syscall.rs b/user/src/syscall.rs
index 4863bd3d..63c10781 100644
--- a/user/src/syscall.rs
+++ b/user/src/syscall.rs
@@ -11,6 +11,9 @@ const SYSCALL_GETPID: usize = 172;
 const SYSCALL_FORK: usize = 220;
 const SYSCALL_EXEC: usize = 221;
 const SYSCALL_WAITPID: usize = 260;
+const SYSCALL_THREAD_CREATE: usize = 1000;
+const SYSCALL_GETTID: usize = 1001;
+const SYSCALL_WAITTID: usize = 1002;
 
 fn syscall(id: usize, args: [usize; 3]) -> isize {
     let mut ret: isize;
@@ -77,4 +80,16 @@ pub fn sys_exec(path: &str, args: &[*const u8]) -> isize {
 
 pub fn sys_waitpid(pid: isize, exit_code: *mut i32) -> isize {
     syscall(SYSCALL_WAITPID, [pid as usize, exit_code as usize, 0])
-}
\ No newline at end of file
+}
+
+pub fn sys_thread_create(entry: usize) -> isize {
+    syscall(SYSCALL_THREAD_CREATE, [entry, 0, 0])
+}
+
+pub fn sys_gettid() -> isize {
+    syscall(SYSCALL_GETTID, [0; 3])
+}
+
+pub fn sys_waittid(tid: usize) -> isize {
+    syscall(SYSCALL_WAITTID, [tid, 0, 0])
+}