-
Notifications
You must be signed in to change notification settings - Fork 34
/
pens_data.html
531 lines (288 loc) · 25.1 KB
/
pens_data.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Site made with Mobirise Website Builder v5.3.10, https://mobirise.com -->
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="generator" content="Mobirise v5.3.10, mobirise.com">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1">
<link rel="shortcut icon" href="assets/images/output-onlinepngtools-160x160.png" type="image/x-icon">
<!-- Page-specific description (replaces the site generator's placeholder text). -->
<meta name="description" content="PENS dataset: a benchmark for personalized news headline generation. Overview and file formats of the training and test sets.">
<title>PENS Dataset</title>
<!-- Local stylesheets: icon font, Bootstrap, plugin styles, then the theme. -->
<link rel="stylesheet" href="assets/web/assets/mobirise-icons/mobirise-icons.css">
<link rel="stylesheet" href="assets/bootstrap/css/bootstrap.min.css">
<link rel="stylesheet" href="assets/bootstrap/css/bootstrap-grid.min.css">
<link rel="stylesheet" href="assets/bootstrap/css/bootstrap-reboot.min.css">
<link rel="stylesheet" href="assets/tether/tether.min.css">
<link rel="stylesheet" href="assets/datatables/data-tables.bootstrap4.min.css">
<link rel="stylesheet" href="assets/dropdown/css/style.css">
<link rel="stylesheet" href="assets/theme/css/style.css">
<!-- Rubik web font: fetched as a preload and switched to a stylesheet by the onload handler;
     the <noscript> copy loads it as a normal stylesheet when scripting is disabled. -->
<link rel="preload" href="https://fonts.googleapis.com/css?family=Rubik:300,400,500,600,700,800,900,300i,400i,500i,600i,700i,800i,900i&display=swap" as="style" onload="this.onload=null;this.rel='stylesheet'">
<noscript><link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Rubik:300,400,500,600,700,800,900,300i,400i,500i,600i,700i,800i,900i&display=swap"></noscript>
<!-- Page-specific overrides; listed last among the stylesheets in this head. -->
<link rel="preload" as="style" href="assets/mobirise/css/mbr-additional.css"><link rel="stylesheet" href="assets/mobirise/css/mbr-additional.css" type="text/css">
</head>
<body>
<section class="header1 cid-sxfKWCXd0a" id="header16-i">
<div class="container">
<div class="row justify-content-md-center">
<div class="col-md-10 align-center">
<h1 class="mbr-section-title mbr-bold pb-3 mbr-fonts-style display-2">PENS DATASET</h1>
<p class="mbr-text pb-3 mbr-fonts-style display-5">A benchmark to evaluate the performance of <br>personalized news headline generation approaches</p>
</div>
</div>
</div>
</section>
<section class="mbr-section article content9 cid-sxfM5qKJSV" id="content9-k">
<div class="container">
<div class="inner-container" style="width: 100%;">
<hr class="line" style="width: 96%;">
<div class="section-text align-center mbr-fonts-style display-5">Training Dataset</div>
<hr class="line" style="width: 96%;">
</div>
</div>
</section>
<section class="mbr-section article content1 cid-sxgzBYtK2z" id="content1-19">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-5 col-md-12"><p><strong>Overall Description</strong></p></div>
</div>
</div>
</section>
<section class="mbr-section article content1 cid-sxfNbnjmIf" id="content1-n">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-7 col-md-12"><p><span style="font-size: 1rem;">The PENS dataset contains 113,762 pieces of </span><strong>News</strong><span style="font-size: 1rem;"> whose topics are distributed into 15 categories. </span><span style="font-size: 1rem;">Each news article includes a news ID, a title, a body and a category manually tagged by editors. The average lengths of the news title and news body are 10.5 and 549.0, respectively. Entities from each news title are extracted and then linked to those in <a href="https://www.wikidata.org/wiki/Wikidata:MainPage" class="text-primary">Wikidata</a>.</span><br></p><p><span style="font-size: 1rem;">We sample 500,000 user-news impressions from June 13, 2019, to July 3, 2019, as the <strong>training set</strong>. An impression log records the news articles displayed to a user as well as the click behaviors on these news articles when he/she visits the news website at a specific time. The format of each labeled sample in our training set is [<em>uID, tmp, clkNews, uclkNews, clkedHis</em>], where <em>uID</em> indicates the anonymous ID of a user and <em>tmp</em> denotes the timestamp of this impression record. <em>clkNews</em> and <em>uclkNews</em> are the clicked news and un-clicked news in this impression, respectively. <em>clkedHis</em> represents the news articles previously clicked by this user. All the samples in <em>clkNews, uclkNews</em> and <em>clkedHis</em> are sorted by the user’s click time.</span></p></div>
</div>
</div>
</section>
<section class="cid-sxfOoJT8Af" id="content13-s">
<div class="container">
<div class="media-container-row" style="width: 95%;">
<div class="img-item item1" style="width: 203%;">
<img src="assets/images/1-902x737.png" alt="">
</div>
<div class="img-item">
<img src="assets/images/2-458x758.png" alt="">
</div>
</div>
</div>
</section>
<section class="mbr-section article content1 cid-sxgACQFY4s" id="content1-1e">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-5 col-md-12"><p><strong>Dataset Format</strong></p></div>
</div>
</div>
</section>
<section class="mbr-section article content1 cid-sxgzGcvlRi" id="content1-1c">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-7 col-md-12"><p>Our provided training dataset contains the following files:</p></div>
</div>
</div>
</section>
<section class="section-table cid-sxgy9V1G5A" id="table1-12">
<div class="container container-table">
<div class="table-wrapper">
<div class="container">
</div>
<div class="container scroll">
<table class="table" cellspacing="0" data-empty="No matching records found">
<thead>
<tr class="table-heads ">
<th class="head-item mbr-fonts-style display-7">File Name</th><th class="head-item mbr-fonts-style display-7">
Description</th></tr>
</thead>
<tbody>
<tr>
<td class="body-item mbr-fonts-style display-7">news.tsv</td><td class="body-item mbr-fonts-style display-7">The information of news articles</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">train.tsv</td><td class="body-item mbr-fonts-style display-7">The click histories and impression logs of users for training</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">valid.tsv</td><td class="body-item mbr-fonts-style display-7">The click histories and impression logs of users for validation</td></tr></tbody>
</table>
</div>
<div class="container table-info-container">
</div>
</div>
</div>
</section>
<section class="mbr-section article content1 cid-sxgADuf8GS" id="content1-1f">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-5 col-md-12"><p>
news.tsv <span style="font-size: 1rem;">contains the detailed information of news articles involved in the train.tsv and valid.tsv files. It has 7 columns divided by the tab symbol: </span></p></div>
</div>
</div>
</section>
<section class="section-table cid-sxgSiwnPrY" id="table1-1g">
<div class="container container-table">
<div class="table-wrapper">
<div class="container">
</div>
<div class="container scroll">
<table class="table" cellspacing="0" data-empty="No matching records found">
<thead>
<tr class="table-heads ">
<th class="head-item mbr-fonts-style display-7">Column</th><th class="head-item mbr-fonts-style display-7">
Example Context</th><th class="head-item mbr-fonts-style display-7">
Description</th></tr>
</thead>
<tbody>
<tr>
<td class="body-item mbr-fonts-style display-7">News ID</td><td class="body-item mbr-fonts-style display-7">N10000</td><td class="body-item mbr-fonts-style display-7">Unique ID of news</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">Category</td><td class="body-item mbr-fonts-style display-7">sports</td><td class="body-item mbr-fonts-style display-7">Belong to one of 15 categories</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">Topic </td><td class="body-item mbr-fonts-style display-7">soccer</td><td class="body-item mbr-fonts-style display-7">Specific topic of news</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">Headline</td><td class="body-item mbr-fonts-style display-7">Predicting Atlanta United's lineup against Columbus Crew in the U.S. Open Cup</td><td class="body-item mbr-fonts-style display-7"></td></tr><tr>
<td class="body-item mbr-fonts-style display-7">News body</td><td class="body-item mbr-fonts-style display-7">Only FIVE internationals allowed, count em, FIVE! So first off we should say, per our usual Atlanta United lineup predictions, this will be wrong...</td><td class="body-item mbr-fonts-style display-7"></td></tr><tr>
<td class="body-item mbr-fonts-style display-7">Title entity</td><td class="body-item mbr-fonts-style display-7">{"Atlanta United's": 'Atlanta United FC'}</td><td class="body-item mbr-fonts-style display-7">The mapping between the phrase in title and the entity in wikidata</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">Entity content</td><td class="body-item mbr-fonts-style display-7">{'Atlanta United FC': {
<br>'type': 'item',
<br>'id': 'Q16836317',
<br>'labels': {'en': {'language': 'en', 'value': 'Atlanta United FC'}, ...},
<br>'descriptions': {'en': {'language': 'en', 'value': 'Football team in the city of Atlanta, Georgia, United States'}, ...},
<br>'aliases': {'en': [{'language': 'en', 'value': 'Atlanta United'}, {'language': 'en', 'value': 'ATL UTD'}, {'language': 'en', 'value': 'ATL UTD FC'}, ...], ...},
<br>'claims': {'P31': [{'mainsnak': {'snaktype': 'value', 'property': 'P31', 'datavalue': {'value': {'entity-type': 'item', 'numeric-id': 476028, 'id': 'Q476028'}, 'type': 'wikibase-entityid'}, 'datatype': 'wikibase-item'}, 'type': 'statement', 'id': 'Q16836317$2462E96F-B25E-4BE9-9CAC-876FF99CD5DA', 'rank': 'normal'}, ... ], ...},
<br>'sitelinks': {'zhwiki': {'site': 'zhwiki', 'title': '阿特蘭大聯足球會', 'badges': []}, ...},
<br>'lastrevid': 1452771827}, ...}</td><td class="body-item mbr-fonts-style display-7">The mapping between the entity name and the entity content in wikidata. For detailed data structure, please refer to the <a href="https://www.mediawiki.org/wiki/Wikibase/DataModel#Wikidata_Object_Notation" class="text-primary" target="_blank">official documents</a>.</td></tr></tbody>
</table>
</div>
<div class="container table-info-container">
</div>
</div>
</div>
</section>
<section class="mbr-section article content1 cid-sxgTiv55iy" id="content1-1h">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-5 col-md-12"><p>train.tsv &amp; valid.tsv <span style="font-size: 1rem;">contain the impression logs and users' news click histories. They have 9 columns divided by the tab symbol: </span></p></div>
</div>
</div>
</section>
<section class="section-table cid-sxgTkJ7Cgh" id="table1-1i">
<div class="container container-table">
<div class="table-wrapper">
<div class="container">
</div>
<div class="container scroll">
<table class="table" cellspacing="0" data-empty="No matching records found">
<thead>
<tr class="table-heads ">
<th class="head-item mbr-fonts-style display-7">Column</th><th class="head-item mbr-fonts-style display-7">
Example Context</th><th class="head-item mbr-fonts-style display-7">
Description</th></tr>
</thead>
<tbody>
<tr>
<td class="body-item mbr-fonts-style display-7">UserID</td><td class="body-item mbr-fonts-style display-7">U335175</td><td class="body-item mbr-fonts-style display-7">Unique ID of users</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">ClicknewsID</td><td class="body-item mbr-fonts-style display-7">N41340 N27570 N83288 ...</td><td class="body-item mbr-fonts-style display-7">The user’s historical clicked news</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">dwelltime</td><td class="body-item mbr-fonts-style display-7">116 23 59 ...</td><td class="body-item mbr-fonts-style display-7">The duration of browsing historical clicked news</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">exposure_time</td><td class="body-item mbr-fonts-style display-7">6/19/2019 5:10:01 AM#TAB#...</td><td class="body-item mbr-fonts-style display-7">The exposure time of historical clicked news and can be split by '#TAB#'</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">pos</td><td class="body-item mbr-fonts-style display-7">N55476 N103556 N52756 ...</td><td class="body-item mbr-fonts-style display-7">The clicked news in this impression</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">neg</td><td class="body-item mbr-fonts-style display-7">N48119 N92507 N92467 ...</td><td class="body-item mbr-fonts-style display-7">The unclicked news in this impression</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">start</td><td class="body-item mbr-fonts-style display-7">7/3/2019 6:43:49 AM</td><td class="body-item mbr-fonts-style display-7">Start time of this impression</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">end</td><td class="body-item mbr-fonts-style display-7">7/3/2019 7:06:06 AM</td><td class="body-item mbr-fonts-style display-7">End time of this impression</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">dwelltime_pos</td><td class="body-item mbr-fonts-style display-7">34 83 79 ...</td><td class="body-item mbr-fonts-style display-7">The duration of browsing clicked news in this impression</td></tr></tbody>
</table>
</div>
<div class="container table-info-container">
</div>
</div>
</div>
</section>
<section class="mbr-section article content9 cid-sxfUHvQ3qT" id="content9-t">
<div class="container">
<div class="inner-container" style="width: 100%;">
<hr class="line" style="width: 98%;">
<div class="section-text align-center mbr-fonts-style display-5">
Test Dataset</div>
<hr class="line" style="width: 98%;">
</div>
</div>
</section>
<section class="mbr-section article content1 cid-sxi8cdF5hH" id="content1-1m">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-5 col-md-12"><p><strong>Overall Description</strong></p></div>
</div>
</div>
</section>
<section class="cid-sxfOiS03oo" id="image1-r">
<!-- Figure for the test-set "Overall Description".
     NOTE(review): the alt text is inferred from the image file name ("process") and the
     adjacent "construction process of test set" paragraph; confirm it matches the diagram. -->
<figure class="mbr-figure container">
<div class="image-block" style="width: 91%;">
<img src="assets/images/process-1322x268.png" width="1400" alt="Construction process of the PENS test set">
</div>
</figure>
</section>
<section class="mbr-section article content1 cid-sxfNbzfXCT" id="content1-o">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-7 col-md-12"><p>
<strong>The construction process of the test set: </strong> To provide an offline testbed, we invited 103 native English speakers (all college students) to manually create a test set in two stages. </p><p><strong>At the first stage</strong>, each person browses 1,000 news headlines and marks at least 50 pieces he/she is interested in. These exhibited headlines were randomly selected from our news corpus and were arranged by their first exposure time. </p><p><strong>At the second stage</strong>, everyone is asked to write down their preferred headlines for another 200 unseen news articles from our dataset without being shown the original news titles, while highlighting some important segments in the original news articles as well. These unseen news articles are evenly sampled, and we redundantly assign them to make sure each news article is exhibited to four people on average. The quality of these manually-written headlines was checked by professional editors from the perspective of the factual aspect of media frame. Low-quality headlines, e.g. those containing wrong factual information, inconsistent with the news body, or too short or overlong, are excluded. The rest are regarded as the personalized reading focuses of these annotators on the articles, and are taken as gold-standard headlines in our dataset.</p></div>
</div>
</div>
</section>
<section class="mbr-section article content1 cid-sxi8dPZbSp" id="content1-1n">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-5 col-md-12"><p><strong>Dataset Format</strong></p></div>
</div>
</div>
</section>
<section class="mbr-section article content1 cid-sxgzFUCgCz" id="content1-1a">
<div class="container">
<div class="media-container-row">
<div class="mbr-text col-12 mbr-fonts-style display-7 col-md-12"><p>Our provided test dataset contains the following columns:</p></div>
</div>
</div>
</section>
<section class="section-table cid-sxialodHJK" id="table1-1o">
<div class="container container-table">
<div class="table-wrapper">
<div class="container">
</div>
<div class="container scroll">
<table class="table" cellspacing="0" data-empty="No matching records found">
<thead>
<tr class="table-heads ">
<th class="head-item mbr-fonts-style display-7">Column</th><th class="head-item mbr-fonts-style display-7">Example Context</th><th class="head-item mbr-fonts-style display-7">Description</th></tr>
</thead>
<tbody>
<tr>
<td class="body-item mbr-fonts-style display-7">userid</td><td class="body-item mbr-fonts-style display-7">NT1</td><td class="body-item mbr-fonts-style display-7">The unique ID of 103 users</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">clicknewsID </td><td class="body-item mbr-fonts-style display-7">N108480,N38238,N35068, ...</td><td class="body-item mbr-fonts-style display-7">The user’s historical clicked news collected at the first stage</td></tr><tr>
<td class="body-item mbr-fonts-style display-7">posnewID</td><td class="body-item mbr-fonts-style display-7">N24110,N62769,N36186, ...<br></td><td class="body-item mbr-fonts-style display-7">The exhibited news for each user at the second stage<br></td></tr><tr>
<td class="body-item mbr-fonts-style display-7">rewrite_titles</td><td class="body-item mbr-fonts-style display-7">'Legal battle looms over Trump EPA\'s rule change of Obama\'s Clean Power Plan rule ...</td><td class="body-item mbr-fonts-style display-7">The manually-written news headlines for the exhibited news articles and can be split by '#TAB#'</td></tr></tbody>
</table>
</div>
<div class="container table-info-container">
</div>
</div>
</div>
</section>
<section class="menu cid-sykHwG5qtz" once="menu" id="menu2-28">
<!-- Site navigation bar: collapse toggler, logo link to pens.html, and the "About PENS" dropdown. -->
<nav class="navbar navbar-expand beta-menu navbar-dropdown align-items-center navbar-fixed-top navbar-toggleable-sm bg-color transparent">
<button class="navbar-toggler navbar-toggler-right" type="button" data-toggle="collapse" data-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
<div class="hamburger">
<span></span>
<span></span>
<span></span>
<span></span>
</div>
</button>
<div class="menu-logo">
<div class="navbar-brand">
<span class="navbar-logo">
<a href="pens.html">
<!-- The image is the link's only content, so its alt text supplies the link's accessible name. -->
<img src="assets/images/output-onlinepngtools-160x160.png" alt="PENS" style="height: 5rem;">
</a>
</span>
<!-- Empty caption link kept in the DOM for the theme's layout, but removed from the tab order
     and the accessibility tree because it has no text to announce. -->
<span class="navbar-caption-wrap"><a class="navbar-caption text-black display-4" href="https://mobiri.se" tabindex="-1" aria-hidden="true"></a></span>
</div>
</div>
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav nav-dropdown nav-right" data-app-modern-menu="true"><li class="nav-item dropdown"><a class="nav-link link text-black dropdown-toggle display-4" href="#" aria-expanded="false" data-toggle="dropdown-submenu">
About PENS </a><div class="dropdown-menu"><a class="text-black dropdown-item text-primary display-4" href="pens_model.html" aria-expanded="false">MODEL</a><a class="text-black dropdown-item text-primary display-4" href="pens_data.html" aria-expanded="false" aria-current="page">DATA</a></div></li></ul>
</div>
</nav>
</section><section style="background-color: #fff; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif; color:#aaa; font-size:12px; padding: 0; align-items: center; display: flex;"><a href="https://mobirise.site/n" style="flex: 1 1; height: 3rem; padding-left: 1rem;"></a><p style="flex: 0 0 auto; margin:0; padding-right:1rem;"><a href="https://mobirise.site/d" style="color:#aaa;">The web page</a> was started with Mobirise</p></section><script src="assets/web/assets/jquery/jquery.min.js"></script> <script src="assets/popper/popper.min.js"></script> <script src="assets/bootstrap/js/bootstrap.min.js"></script> <script src="assets/tether/tether.min.js"></script> <script src="assets/smoothscroll/smooth-scroll.js"></script> <script src="assets/datatables/jquery.data-tables.min.js"></script> <script src="assets/datatables/data-tables.bootstrap4.min.js"></script> <script src="assets/dropdown/js/nav-dropdown.js"></script> <script src="assets/dropdown/js/navbar-dropdown.js"></script> <script src="assets/touchswipe/jquery.touch-swipe.min.js"></script> <script src="assets/theme/js/script.js"></script>
</body>
</html>