@@ -23,50 +23,46 @@ struct devcd_entry {
 	void *data;
 	size_t datalen;
 	/*
-	 * Here, mutex is required to serialize the calls to del_wk work between
-	 * user/kernel space which happens when devcd is added with device_add()
-	 * and that sends uevent to user space. User space reads the uevents,
-	 * and calls to devcd_data_write() which try to modify the work which is
-	 * not even initialized/queued from devcoredump.
+	 * There are 2 races for which mutex is required.
 	 *
+	 * The first race is between device creation and userspace writing to
+	 * schedule immediate destruction.
 	 *
+	 * This race is handled by arming the timer before device creation, but
+	 * when device creation fails the timer still exists.
 	 *
-	 *        cpu0(X)                                 cpu1(Y)
+	 * To solve this, hold the mutex during device_add(), and set
+	 * init_completed on success before releasing the mutex.
 	 *
-	 *        dev_coredump() uevent sent to user space
-	 *        device_add()  ======================> user space process Y reads the
-	 *                                              uevents writes to devcd fd
-	 *                                              which results into writes to
+	 * That way the timer will never fire until device_add() is called;
+	 * it will do nothing if init_completed is not set. The timer is also
+	 * cancelled in that case.
 	 *
-	 *                                             devcd_data_write()
-	 *                                               mod_delayed_work()
-	 *                                                 try_to_grab_pending()
-	 *                                                   timer_delete()
-	 *                                                     debug_assert_init()
-	 *       INIT_DELAYED_WORK()
-	 *       schedule_delayed_work()
-	 *
-	 *
-	 * Also, mutex alone would not be enough to avoid scheduling of
-	 * del_wk work after it get flush from a call to devcd_free()
-	 * mentioned as below.
-	 *
-	 *	disabled_store()
-	 *        devcd_free()
-	 *          mutex_lock()             devcd_data_write()
-	 *          flush_delayed_work()
-	 *          mutex_unlock()
-	 *                                   mutex_lock()
-	 *                                   mod_delayed_work()
-	 *                                   mutex_unlock()
-	 * So, delete_work flag is required.
+	 * The second race involves multiple parallel invocations of devcd_free();
+	 * add a deleted flag so only one caller can run the destructor.
 	 */
 	struct mutex mutex;
-	bool delete_work;
+	bool init_completed, deleted;
 	struct module *owner;
 	ssize_t (*read)(char *buffer, loff_t offset, size_t count,
 			void *data, size_t datalen);
 	void (*free)(void *data);
+	/*
+	 * If nothing interferes and device_add() returns success,
+	 * del_wk will destroy the device after the timer fires.
+	 *
+	 * Multiple userspace processes can interfere in the working of the timer:
+	 * - Writing to the coredump will reschedule the timer to run immediately,
+	 *   if still armed.
+	 *
+	 *   This is handled by using "if (cancel_delayed_work()) {
+	 *   schedule_delayed_work() }", to prevent re-arming after having
+	 *   been previously fired.
+	 * - Writing to /sys/class/devcoredump/disabled will destroy the
+	 *   coredump synchronously.
+	 *   This is handled by using disable_delayed_work_sync(), and then
+	 *   checking if the deleted flag is set with &devcd->mutex held.
+	 */
 	struct delayed_work del_wk;
 	struct device *failing_dev;
 };
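Condensed, for readers skimming the struct: the init_completed handshake described above can be sketched in a few lines. The names (obj, obj_reap, obj_create, publish) are hypothetical and only illustrate the ordering; the real code is in the devcd_del() and dev_coredumpm_timeout() hunks further down.

#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>

struct obj {
	struct mutex lock;
	bool init_completed;
	struct delayed_work del_wk;
};

static void obj_reap(struct work_struct *wk)
{
	struct obj *o = container_of(wk, struct obj, del_wk.work);
	bool live;

	mutex_lock(&o->lock);		/* waits out obj_create() if it is still running */
	live = o->init_completed;
	mutex_unlock(&o->lock);

	if (live) {
		/* device_del()/put_device() equivalent goes here */
	}
}

static int obj_create(struct obj *o, unsigned long timeout,
		      int (*publish)(struct obj *))
{
	mutex_init(&o->lock);

	mutex_lock(&o->lock);
	o->init_completed = false;
	INIT_DELAYED_WORK(&o->del_wk, obj_reap);
	schedule_delayed_work(&o->del_wk, timeout);	/* armed before publishing */

	if (publish(o)) {			/* stands in for device_add() */
		mutex_unlock(&o->lock);
		cancel_delayed_work_sync(&o->del_wk);	/* timer either never acted or is gone */
		return -ENODEV;
	}

	o->init_completed = true;		/* from now on the timer really tears down */
	mutex_unlock(&o->lock);
	return 0;
}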
@@ -95,14 +91,27 @@ static void devcd_dev_release(struct device *dev)
 	kfree(devcd);
 }
 
+static void __devcd_del(struct devcd_entry *devcd)
+{
+	devcd->deleted = true;
+	device_del(&devcd->devcd_dev);
+	put_device(&devcd->devcd_dev);
+}
+
 static void devcd_del(struct work_struct *wk)
 {
 	struct devcd_entry *devcd;
+	bool init_completed;
 
 	devcd = container_of(wk, struct devcd_entry, del_wk.work);
 
-	device_del(&devcd->devcd_dev);
-	put_device(&devcd->devcd_dev);
+	/* devcd->mutex serializes against dev_coredumpm_timeout */
+	mutex_lock(&devcd->mutex);
+	init_completed = devcd->init_completed;
+	mutex_unlock(&devcd->mutex);
+
+	if (init_completed)
+		__devcd_del(devcd);
 }
 
 static ssize_t devcd_data_read(struct file *filp, struct kobject *kobj,
@@ -122,12 +131,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
 	struct device *dev = kobj_to_dev(kobj);
 	struct devcd_entry *devcd = dev_to_devcd(dev);
 
-	mutex_lock(&devcd->mutex);
-	if (!devcd->delete_work) {
-		devcd->delete_work = true;
-		mod_delayed_work(system_wq, &devcd->del_wk, 0);
-	}
-	mutex_unlock(&devcd->mutex);
+	/*
+	 * Although it's tempting to use mod_delayed_work() here,
+	 * that would cause a reschedule if the timer already fired.
+	 */
+	if (cancel_delayed_work(&devcd->del_wk))
+		schedule_delayed_work(&devcd->del_wk, 0);
 
 	return count;
 }
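The return-value semantics are what make this safe: cancel_delayed_work() returns true only while the work is still pending, so the re-queue happens only if the destructor has not already started (and the work has not been disabled). A self-contained illustration of the same idiom, with hypothetical names (pending_obj, expedite_teardown):

#include <linux/workqueue.h>

struct pending_obj {
	struct delayed_work teardown;	/* one-shot destructor, armed elsewhere */
};

static void expedite_teardown(struct pending_obj *obj)
{
	/*
	 * True only if the teardown was still queued: pull it forward to now.
	 * If it already ran or was disabled, do nothing rather than re-arm a
	 * finished one-shot timer, which is what mod_delayed_work() would do.
	 */
	if (cancel_delayed_work(&obj->teardown))
		schedule_delayed_work(&obj->teardown, 0);
}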
@@ -151,11 +160,21 @@ static int devcd_free(struct device *dev, void *data)
 {
 	struct devcd_entry *devcd = dev_to_devcd(dev);
 
+	/*
+	 * To prevent a race with devcd_data_write(), disable the work and
+	 * complete the teardown manually instead.
+	 *
+	 * We cannot rely on the return value of disable_delayed_work_sync()
+	 * here, because devcd_data_write() might be in the middle of a
+	 * cancel_delayed_work() + schedule_delayed_work() pair.
+	 *
+	 * devcd->mutex here guards against multiple parallel invocations
+	 * of devcd_free().
+	 */
+	disable_delayed_work_sync(&devcd->del_wk);
 	mutex_lock(&devcd->mutex);
-	if (!devcd->delete_work)
-		devcd->delete_work = true;
-
-	flush_delayed_work(&devcd->del_wk);
+	if (!devcd->deleted)
+		__devcd_del(devcd);
 	mutex_unlock(&devcd->mutex);
 	return 0;
 }
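For context only, and as an assumption about the surrounding file rather than part of this hunk: devcd_free() is the per-device callback run when userspace writes to the class's disabled attribute, so two concurrent writers can reach it in parallel for the same entry, which is exactly what the deleted flag arbitrates. Roughly (hypothetical reduction, error handling omitted):

#include <linux/device.h>

static ssize_t disable_all_coredumps(const struct class *class,
				     const char *buf, size_t count)
{
	/* every concurrent writer performs the same walk over the class */
	class_for_each_device(class, NULL, NULL, devcd_free);
	return count;
}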
@@ -179,12 +198,10 @@ static ssize_t disabled_show(const struct class *class, const struct class_attri
  *                                                                 put_device() <- last reference
  *             error = fn(dev, data)                           devcd_dev_release()
  *             devcd_free(dev, data)                           kfree(devcd)
- *             mutex_lock(&devcd->mutex);
  *
  *
  * In the above diagram, it looks like disabled_store() would be racing with a parallel
- * running devcd_del() and result in memory abort while acquiring devcd->mutex which
- * is called after kfree of devcd memory after dropping its last reference with
+ * running devcd_del() and result in a memory abort after dropping its last reference with
  * put_device(). However, this will not happen as fn(dev, data) runs
  * with its own reference to the device via klist_node, so it is not its last reference.
  * So, the above situation would not occur.
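The klist_node argument can be made concrete. The per-class walk that drives fn(dev, data) pins each device while the callback runs, so devcd_free()'s put_device() is never the final drop. A rough sketch of that shape, assuming the class_dev_iter API (illustrative, not code from this file):

#include <linux/device.h>

static void walk_class_devices(const struct class *class,
			       int (*fn)(struct device *, void *), void *data)
{
	struct class_dev_iter iter;
	struct device *dev;

	class_dev_iter_init(&iter, class, NULL, NULL);
	while ((dev = class_dev_iter_next(&iter)))
		fn(dev, data);	/* dev stays pinned by the iterator across fn() */
	class_dev_iter_exit(&iter);
}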
@@ -374,7 +391,7 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner,
 	devcd->read = read;
 	devcd->free = free;
 	devcd->failing_dev = get_device(dev);
-	devcd->delete_work = false;
+	devcd->deleted = false;
 
 	mutex_init(&devcd->mutex);
 	device_initialize(&devcd->devcd_dev);
@@ -383,8 +400,14 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner,
 		     atomic_inc_return(&devcd_count));
 	devcd->devcd_dev.class = &devcd_class;
 
-	mutex_lock(&devcd->mutex);
 	dev_set_uevent_suppress(&devcd->devcd_dev, true);
+
+	/* devcd->mutex prevents devcd_del() completing until init finishes */
+	mutex_lock(&devcd->mutex);
+	devcd->init_completed = false;
+	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
+	schedule_delayed_work(&devcd->del_wk, timeout);
+
 	if (device_add(&devcd->devcd_dev))
 		goto put_device;
 
@@ -401,13 +424,20 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner,
 
 	dev_set_uevent_suppress(&devcd->devcd_dev, false);
 	kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD);
-	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
-	schedule_delayed_work(&devcd->del_wk, timeout);
+
+	/*
+	 * Safe to run devcd_del() now that we are done with devcd_dev.
+	 * Alternatively we could have taken a ref on devcd_dev before
+	 * dropping the lock.
+	 */
+	devcd->init_completed = true;
 	mutex_unlock(&devcd->mutex);
 	return;
  put_device:
-	put_device(&devcd->devcd_dev);
 	mutex_unlock(&devcd->mutex);
+	cancel_delayed_work_sync(&devcd->del_wk);
+	put_device(&devcd->devcd_dev);
+
  put_module:
 	module_put(owner);
  free:
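As a closing usage note rather than part of the fix: drivers normally reach dev_coredumpm_timeout() through wrappers such as dev_coredumpv(), handing devcoredump a vmalloc'ed buffer that it frees once the entry above is torn down. A minimal caller sketch (report_crash is a hypothetical driver function):

#include <linux/devcoredump.h>
#include <linux/device.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

static void report_crash(struct device *dev, const void *snapshot, size_t len)
{
	void *copy = vmalloc(len);	/* devcoredump vfree()s it on teardown */

	if (!copy)
		return;

	memcpy(copy, snapshot, len);
	/*
	 * Ownership of 'copy' passes to devcoredump; the new entry destroys
	 * itself when del_wk fires, or sooner if userspace writes to its
	 * data file or disables devcoredump entirely.
	 */
	dev_coredumpv(dev, copy, len, GFP_KERNEL);
}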