This repository has been archived by the owner on Oct 29, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extras.bib
31 lines (28 loc) · 4.39 KB
/
extras.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
% Chen, Shrivastava and Steorts (2018), Annals of Applied Statistics 12(2):
% unique entity estimation via locality sensitive hashing, applied to the Syrian conflict.
@article{chen2018,
  author       = {Chen, Beidi and Shrivastava, Anshumali and Steorts, Rebecca C.},
  title        = {Unique entity estimation with application to the {Syrian} conflict},
  journal      = {The Annals of Applied Statistics},
  shortjournal = {Ann. Appl. Stat.},
  fjournal     = {The Annals of Applied Statistics},
  volume       = {12},
  number       = {2},
  pages        = {1039--1067},
  month        = jun,
  year         = {2018},
  publisher    = {The Institute of Mathematical Statistics},
  doi          = {10.1214/18-AOAS1163},
  url          = {https://doi.org/10.1214/18-AOAS1163},
  abstract     = {Entity resolution identifies and removes duplicate entities in large, noisy databases and has grown in both usage and new developments as a result of increased data availability. Nevertheless, entity resolution has tradeoffs regarding assumptions of the data generation process, error rates, and computational scalability that make it a difficult task for real applications. In this paper, we focus on a related problem of unique entity estimation, which is the task of estimating the unique number of entities and associated standard errors in a data set with duplicate entities. Unique entity estimation shares many fundamental challenges of entity resolution, namely, that the computational cost of all-to-all entity comparisons is intractable for large databases. To circumvent this computational barrier, we propose an efficient (near-linear time) estimation algorithm based on locality sensitive hashing. Our estimator, under realistic assumptions, is unbiased and has provably low variance compared to existing random sampling based approaches. In addition, we empirically show its superiority over the state-of-the-art estimators on three real applications. The motivation for our work is to derive an accurate estimate of the documented, identifiable deaths in the ongoing Syrian conflict. Our methodology, when applied to the Syrian data set, provides an estimate of 191,874±1,772 documented, identifiable deaths, which is very close to the Human Rights Data Analysis Group (HRDAG) estimate of 191,369. Our work provides an example of challenges and efforts involved in solving a real, noisy challenging problem where modeling assumptions may not hold.
This project was started when Shrivastava and Steorts were funded by NCRN grants to Cornell and CMU, respectively.}
}
% Weinberg et al. (2018), Journal of Survey Statistics and Methodology:
% summary paper on the research outcomes of the NSF-Census Bureau Research Network (NCRN).
@article{ncrn-summary,
  author   = {Weinberg, Daniel H. and Abowd, John M. and Belli, Robert F. and Cressie, Noel and Folch, David C. and Holan, Scott H. and Levenstein, Margaret C. and Olson, Kristen M. and Reiter, Jerome P. and Shapiro, Matthew D. and Smyth, Jolene and Soh, Leen-Kiat and Spencer, Bruce D. and Spielman, Seth E. and Vilhuber, Lars and Wikle, Christopher K.},
  title    = {Effects of a Government-Academic Partnership: Has the {NSF-Census Bureau Research Network} Helped Improve the {U.S.} Statistical System?},
  journal  = {Journal of Survey Statistics and Methodology},
  pages    = {smy023},
  year     = {2018},
  doi      = {10.1093/jssam/smy023},
  url      = {https://doi.org/10.1093/jssam/smy023},
  abstract = {The National Science Foundation-Census Bureau Research Network (NCRN) was established in 2011 to create interdisciplinary research nodes on methodological questions of interest and significance to the broader research community and to the Federal Statistical System (FSS), particularly the Census Bureau. The activities to date have covered both fundamental and applied statistical research and have focused at least in part on the training of current and future generations of researchers in skills of relevance to surveys and alternative measurement of economic units, households, and persons. This paper discusses some of the key research findings of the eight nodes, organized into six topics: (1) Improving census and survey data collection methods; (2) Using alternative sources of data; (3) Protecting privacy and confidentiality by improving disclosure avoidance; (4) Using spatial and spatio-temporal statistical modeling to improve estimates; (5) Assessing data cost and quality tradeoffs; and (6) Combining information from multiple sources. It also reports on collaborations across nodes and with federal agencies, new software developed, and educational activities and outcomes. The paper concludes with an evaluation of the ability of the FSS to apply the NCRN’s research outcomes and suggests some next steps, as well as the implications of this research-network model for future federal government renewal initiatives.}
}