|
| 1 | +@inproceedings{satheesan2022toward, |
| 2 | + author = {Puthanveetil Satheesan, Sandeep and Bhavya and Davies, Adam and Craig, Alan B. and Zhang, Yu and Zhai, ChengXiang}, |
| 3 | + title = {Toward a Big Data Analysis System for Historical Newspaper Collections Research}, |
| 4 | + year = {2022}, |
| 5 | + isbn = {9781450394109}, |
| 6 | + publisher = {Association for Computing Machinery}, |
| 7 | + address = {New York, NY, USA}, |
| 8 | + url = {https://doi.org/10.1145/3539781.3539795}, |
| 9 | + doi = {10.1145/3539781.3539795}, |
| 10 | + abstract = {The availability and generation of digitized newspaper collections have provided researchers in several domains with a powerful tool to advance their research. More specifically, digitized historical newspapers give us a magnifying glass into the past. In this paper, we propose a scalable and customizable big data analysis system that enables researchers to study complex questions about our society as depicted in news media for the past few centuries by applying cutting-edge text analysis tools to large historical newspaper collections. We discuss our experience with building a preliminary version of such a system, including how we have addressed the following challenges: processing millions of digitized newspaper pages from various publications worldwide, which amount to hundreds of terabytes of data; applying article segmentation and Optical Character Recognition (OCR) to historical newspapers, which vary between and within publications over time; retrieving relevant information to answer research questions from such data collections by applying human-in-the-loop machine learning; and enabling users to analyze topic evolution and semantic dynamics with multiple compatible analysis operators. We also present some preliminary results of using the proposed system to study the social construction of juvenile delinquency in the United States and discuss important remaining challenges to be tackled in the future.}, |
| 11 | + booktitle = {Proceedings of the Platform for Advanced Scientific Computing Conference}, |
| 12 | + articleno = {12}, |
| 13 | + numpages = {11}, |
| 14 | + keywords = {newspaper article segmentation, text analysis, historical newspapers, social science research, juvenile delinquency, natural language processing, social construction, data visualization, big data analysis system, image analysis, information retrieval}, |
| 15 | + location = {Basel, Switzerland}, |
| 16 | + series = {PASC '22} |
| 17 | +} |
| 18 | + |
1 | 19 | @inproceedings{kuhn2014movie,
|
2 | 20 | title = {MOVIE: Large Scale Automated Analysis of MOVing ImagEs},
|
3 | 21 | author = {Kuhn, Virginia and Marini, Luigi and Simeone, Michael and Craig, Alan and Diesendruck, Liana and Puthanveetil Satheesan, Sandeep and Bock, David},
|
@@ -114,7 +132,7 @@ @inproceedings{marini2019clowder
|
114 | 132 | }
|
115 | 133 |
|
116 | 134 | @inproceedings{satheesan2017brown,
|
117 |
| - author = {{Puthanveetil Satheesan}, S. and {Alameda}, J. and {Bradley}, S. and {Dietze}, M. and {Jansen}, G. and {Kooper}, R. and {Kumar}, P. and {Lee}, J. and {Marciano}, R. and {Marini}, L. and {Minsker}, B.~S. and {Navarro}, C. and {Roeder}, E. and {Schmidt}, A. and {Slavenas}, M. and {Sullivan}, W. and {Zhang}, B. and {Zhao}, Y. and {Zharnitsky}, I. and {McHenry}, K.}, |
| 135 | + author = {Puthanveetil Satheesan, S. and Alameda, J. and Bradley, S. and Dietze, M. and Jansen, G. and Kooper, R. and Kumar, P. and Lee, J. and Marciano, R. and Marini, L. and Minsker, B.~S. and Navarro, C. and Roeder, E. and Schmidt, A. and Slavenas, M. and Sullivan, W. and Zhang, B. and Zhao, Y. and Zharnitsky, I. and McHenry, K.}, |
118 | 136 | title = {Brown Dog: A Data Transformation Ecosystem for Research - Advancing from Beta to 1.0},
|
119 | 137 | keywords = {1916 Data and information discovery, INFORMATICS, 1946 Metadata, INFORMATICS, 1950 Metadata: Quality, INFORMATICS},
|
120 | 138 | booktitle = {AGU Fall Meeting Abstracts},
|
|
0 commit comments