@@ -23,50 +23,46 @@ struct devcd_entry {
 	void *data;
 	size_t datalen;
 	/*
-	 * Here, mutex is required to serialize the calls to del_wk work between
-	 * user/kernel space which happens when devcd is added with device_add()
-	 * and that sends uevent to user space. User space reads the uevents,
-	 * and calls to devcd_data_write() which try to modify the work which is
-	 * not even initialized/queued from devcoredump.
+	 * There are two races for which the mutex is required.
 	 *
+	 * The first race is between device creation and userspace writing to
+	 * schedule immediate destruction.
 	 *
+	 * This race is handled by arming the timer before device creation, but
+	 * when device creation fails the timer still exists.
 	 *
-	 *        cpu0(X)                                cpu1(Y)
+	 * To solve this, hold the mutex during device_add(), and set
+	 * init_completed on success before releasing the mutex.
 	 *
-	 * dev_coredump() uevent sent to user space
-	 * device_add()  ======================> user space process Y reads the
-	 *                                        uevents writes to devcd fd
-	 *                                        which results into writes to
+	 * That way the timer will never fire until device_add() is called;
+	 * it will do nothing if init_completed is not set. The timer is also
+	 * cancelled in that case.
 	 *
-	 *                                        devcd_data_write()
-	 *                                          mod_delayed_work()
-	 *                                            try_to_grab_pending()
-	 *                                              timer_delete()
-	 *                                                debug_assert_init()
-	 * INIT_DELAYED_WORK()
-	 * schedule_delayed_work()
-	 *
-	 *
-	 * Also, mutex alone would not be enough to avoid scheduling of
-	 * del_wk work after it get flush from a call to devcd_free()
-	 * mentioned as below.
-	 *
-	 *	disabled_store()
-	 *	  devcd_free()
-	 *	    mutex_lock()                    devcd_data_write()
-	 *	    flush_delayed_work()
-	 *	    mutex_unlock()
-	 *	                                    mutex_lock()
-	 *	                                    mod_delayed_work()
-	 *	                                    mutex_unlock()
-	 * So, delete_work flag is required.
+	 * The second race is between multiple parallel invocations of devcd_free();
+	 * a deleted flag ensures only one of them calls the destructor.
 	 */
 	struct mutex mutex;
-	bool delete_work;
+	bool init_completed, deleted;
 	struct module *owner;
 	ssize_t (*read)(char *buffer, loff_t offset, size_t count,
 			void *data, size_t datalen);
 	void (*free)(void *data);
+	/*
+	 * If nothing interferes and device_add() returns success,
+	 * del_wk will destroy the device after the timer fires.
+	 *
+	 * Multiple userspace processes can interfere in the working of the timer:
+	 * - Writing to the coredump will reschedule the timer to run immediately,
+	 *   if still armed.
+	 *
+	 *   This is handled by using "if (cancel_delayed_work()) {
+	 *   schedule_delayed_work() }", to prevent re-arming after having
+	 *   been previously fired.
+	 * - Writing to /sys/class/devcoredump/disabled will destroy the
+	 *   coredump synchronously.
+	 *   This is handled by using disable_delayed_work_sync(), and then
+	 *   checking if the deleted flag is set with &devcd->mutex held.
+	 */
 	struct delayed_work del_wk;
 	struct device *failing_dev;
 };
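The comment block above describes the first race in prose; the following is a minimal, self-contained sketch of that init_completed pattern, using a hypothetical struct my_entry rather than the devcoredump code itself: the delayed work is armed before the step that can fail (device_add() in the real driver), and its callback only acts once it has observed init_completed as true under the mutex.

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/printk.h>
#include <linux/types.h>
#include <linux/workqueue.h>

struct my_entry {
	struct mutex mutex;
	bool init_completed, deleted;
	struct delayed_work del_wk;
};

/* Timeout handler: only tear down once initialization is known to have
 * completed; otherwise the failed creation path cancels the work instead. */
static void my_del(struct work_struct *wk)
{
	struct my_entry *e = container_of(wk, struct my_entry, del_wk.work);
	bool init_completed;

	mutex_lock(&e->mutex);
	init_completed = e->init_completed;
	mutex_unlock(&e->mutex);

	if (init_completed)
		pr_info("timed out, tearing down\n");	/* __devcd_del() in the real code */
}

/* Creation path: arm the timer first, do the fallible setup while holding
 * the mutex, and only then allow the timer callback to act. */
static void my_create(struct my_entry *e, unsigned long timeout)
{
	mutex_init(&e->mutex);

	mutex_lock(&e->mutex);
	e->init_completed = false;
	e->deleted = false;
	INIT_DELAYED_WORK(&e->del_wk, my_del);
	schedule_delayed_work(&e->del_wk, timeout);

	/* ... device_add() and the rest of the setup happen here ... */

	e->init_completed = true;	/* the timer may act once it fires */
	mutex_unlock(&e->mutex);
}

The failure case, where the equivalent of device_add() fails while the timer is already armed, is sketched after the last hunk below.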
@@ -95,14 +91,27 @@ static void devcd_dev_release(struct device *dev)
 	kfree(devcd);
 }
 
+static void __devcd_del(struct devcd_entry *devcd)
+{
+	devcd->deleted = true;
+	device_del(&devcd->devcd_dev);
+	put_device(&devcd->devcd_dev);
+}
+
 static void devcd_del(struct work_struct *wk)
 {
 	struct devcd_entry *devcd;
+	bool init_completed;
 
 	devcd = container_of(wk, struct devcd_entry, del_wk.work);
 
-	device_del(&devcd->devcd_dev);
-	put_device(&devcd->devcd_dev);
+	/* devcd->mutex serializes against dev_coredumpm_timeout */
+	mutex_lock(&devcd->mutex);
+	init_completed = devcd->init_completed;
+	mutex_unlock(&devcd->mutex);
+
+	if (init_completed)
+		__devcd_del(devcd);
 }
 
 static ssize_t devcd_data_read(struct file *filp, struct kobject *kobj,
@@ -122,12 +131,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
 	struct device *dev = kobj_to_dev(kobj);
 	struct devcd_entry *devcd = dev_to_devcd(dev);
 
-	mutex_lock(&devcd->mutex);
-	if (!devcd->delete_work) {
-		devcd->delete_work = true;
-		mod_delayed_work(system_wq, &devcd->del_wk, 0);
-	}
-	mutex_unlock(&devcd->mutex);
+	/*
+	 * Although it's tempting to use mod_delayed_work() here,
+	 * that would cause a reschedule if the timer has already fired.
+	 */
+	if (cancel_delayed_work(&devcd->del_wk))
+		schedule_delayed_work(&devcd->del_wk, 0);
 
 	return count;
 }
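The hunk above avoids mod_delayed_work() because that would queue the work even after the timer has already fired and the entry is on its way out. A sketch of the same idiom, reusing the hypothetical my_entry from earlier: cancel_delayed_work() returns true only while the work is still pending, so an expired (or disabled) timer is never re-armed.

/* Request immediate teardown, but only if the teardown timer is still armed. */
static void my_request_immediate_teardown(struct my_entry *e)
{
	if (cancel_delayed_work(&e->del_wk))
		schedule_delayed_work(&e->del_wk, 0);
}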
@@ -151,11 +160,21 @@ static int devcd_free(struct device *dev, void *data)
 {
 	struct devcd_entry *devcd = dev_to_devcd(dev);
 
+	/*
+	 * To prevent a race with devcd_data_write(), disable the work and
+	 * perform the removal manually instead.
+	 *
+	 * We cannot rely on the return value of
+	 * disable_delayed_work_sync() here, because it might be in the
+	 * middle of a cancel_delayed_work + schedule_delayed_work pair.
+	 *
+	 * devcd->mutex here guards against multiple parallel invocations
+	 * of devcd_free().
+	 */
+	disable_delayed_work_sync(&devcd->del_wk);
 	mutex_lock(&devcd->mutex);
-	if (!devcd->delete_work)
-		devcd->delete_work = true;
-
-	flush_delayed_work(&devcd->del_wk);
+	if (!devcd->deleted)
+		__devcd_del(devcd);
 	mutex_unlock(&devcd->mutex);
 	return 0;
 }
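A sketch of the synchronous path the new devcd_free() takes, again with the hypothetical my_entry: disable_delayed_work_sync() waits for a callback that is already running and rejects any further queueing (including the cancel/reschedule pair above), while the deleted flag, checked under the mutex, keeps concurrent callers from tearing the entry down twice.

/* Synchronous destruction: stop the timer for good, then tear down once. */
static void my_free(struct my_entry *e)
{
	disable_delayed_work_sync(&e->del_wk);

	mutex_lock(&e->mutex);
	if (!e->deleted) {
		e->deleted = true;
		pr_info("tearing down\n");	/* __devcd_del() in the real code */
	}
	mutex_unlock(&e->mutex);
}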
@@ -179,12 +198,10 @@ static ssize_t disabled_show(const struct class *class, const struct class_attri
  *                                                     put_device() <- last reference
  *     error = fn(dev, data)                         devcd_dev_release()
  *       devcd_free(dev, data)                         kfree(devcd)
- *         mutex_lock(&devcd->mutex);
  *
  *
  * In the above diagram, it looks like disabled_store() would be racing with parallelly
- * running devcd_del() and result in memory abort while acquiring devcd->mutex which
- * is called after kfree of devcd memory after dropping its last reference with
+ * running devcd_del() and result in a memory abort after dropping its last reference with
  * put_device(). However, this will not happens as fn(dev, data) runs
  * with its own reference to device via klist_node so it is not its last reference.
  * so, above situation would not occur.
@@ -374,7 +391,7 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner,
 	devcd->read = read;
 	devcd->free = free;
 	devcd->failing_dev = get_device(dev);
-	devcd->delete_work = false;
+	devcd->deleted = false;
 
 	mutex_init(&devcd->mutex);
 	device_initialize(&devcd->devcd_dev);
@@ -383,8 +400,14 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner,
 		     atomic_inc_return(&devcd_count));
 	devcd->devcd_dev.class = &devcd_class;
 
-	mutex_lock(&devcd->mutex);
 	dev_set_uevent_suppress(&devcd->devcd_dev, true);
+
+	/* devcd->mutex prevents devcd_del() completing until init finishes */
+	mutex_lock(&devcd->mutex);
+	devcd->init_completed = false;
+	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
+	schedule_delayed_work(&devcd->del_wk, timeout);
+
 	if (device_add(&devcd->devcd_dev))
 		goto put_device;
 
@@ -401,13 +424,20 @@ void dev_coredumpm_timeout(struct device *dev, struct module *owner,
 
 	dev_set_uevent_suppress(&devcd->devcd_dev, false);
 	kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD);
-	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
-	schedule_delayed_work(&devcd->del_wk, timeout);
+
+	/*
+	 * Safe to run devcd_del() now that we are done with devcd_dev.
+	 * Alternatively we could have taken a ref on devcd_dev before
+	 * dropping the lock.
+	 */
+	devcd->init_completed = true;
 	mutex_unlock(&devcd->mutex);
 	return;
 put_device:
-	put_device(&devcd->devcd_dev);
 	mutex_unlock(&devcd->mutex);
+	cancel_delayed_work_sync(&devcd->del_wk);
+	put_device(&devcd->devcd_dev);
+
 put_module:
 	module_put(owner);
 free:
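One point the reordered error path above relies on, in this reading, but does not spell out: devcd_del() takes devcd->mutex, so the synchronous cancel has to happen after the mutex is dropped (cancelling while still holding it could wait forever for a callback that is itself waiting for the mutex), and put_device() comes last so the timer can no longer touch the structure when the last reference may go away. A sketch of that unwind order for the hypothetical my_entry, assuming the caller still holds e->mutex from the creation path:

/* Unwind a failed creation: release the lock, kill the timer, then drop
 * the reference that keeps the entry alive. */
static void my_unwind_failed_create(struct my_entry *e)
{
	mutex_unlock(&e->mutex);		/* let a concurrently running my_del() finish */
	cancel_delayed_work_sync(&e->del_wk);	/* after this, my_del() cannot touch *e */
	pr_info("dropping last reference\n");	/* put_device() in the real code */
}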