From 4aa9431ea3b1900b4832e3ee6fe3f23523e855f5 Mon Sep 17 00:00:00 2001 From: Ulf Gebhardt Date: Tue, 15 Jan 2019 13:04:18 +0100 Subject: [PATCH 01/10] cleanup for production --- package.json | 4 +-- src/browser/DeputyProfileBrowser.ts | 2 +- src/index.ts | 34 +++++--------------------- src/parser/DeputyProfileParser.ts | 7 +++--- src/run.ts | 38 +++++++++++++++++++++++++++++ tsconfig.json | 12 ++++----- yarn.lock | 4 +-- 7 files changed, 58 insertions(+), 43 deletions(-) create mode 100644 src/run.ts diff --git a/package.json b/package.json index ba22484..b433ab1 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,7 @@ }, "scripts": { "build": "tsc", - "dev": "nodemon .\\src\\index.ts", + "dev": "nodemon .\\src\\run.ts", "lint": "tslint --project tsconfig.json && yarn typecheck", "typecheck": "tsc --noEmit", "test": "mocha -r ts-node/register test/**/*.test.ts" @@ -47,4 +47,4 @@ "xmldom": "^0.1.27", "xpath": "^0.0.27" } -} +} \ No newline at end of file diff --git a/src/browser/DeputyProfileBrowser.ts b/src/browser/DeputyProfileBrowser.ts index 3e1e32b..0806d02 100644 --- a/src/browser/DeputyProfileBrowser.ts +++ b/src/browser/DeputyProfileBrowser.ts @@ -57,7 +57,7 @@ namespace Deputy_Browser { if (blobUrl == undefined) { throw new Error("URL stack is empty."); } - console.log(blobUrl.toString()); + // console.log(blobUrl.toString()); let response = await axios.default.get( blobUrl.toString(), diff --git a/src/index.ts b/src/index.ts index 4687406..3d3a4a3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,35 +1,13 @@ -import * as fs from 'fs'; -import * as util from 'util'; - -import { Scraper } from 'scapacra'; - import { IProtocolScraperConfigurationOptions } from './config/ProtocolScraperConfiguration'; import { ProtocolSpeechScraperConfiguration } from './config/ProtocolSpeechScraperConfiguration'; import { ProtocolVotingScraperConfiguration } from './config/ProtocolVotingScraperConfiguration'; import { ProposedDecisionScraperConfiguration } from './config/ProposedDecisionScraperConfiguration'; import { DeputyProfileScraperConfiguration } from './config/DeputyProfileScraperConfiguration'; -async function scrape() { - let options: IProtocolScraperConfigurationOptions = { - maxCount: 2 - }; - - await Scraper.scrape([ - // new ProtocolSpeechScraperConfiguration(options), - new ProtocolVotingScraperConfiguration(options), - // new ProposedDecisionScraperConfiguration() - // new DeputyProfileScraperConfiguration() - ], ((dataPackages) => { - console.log(util.inspect(dataPackages, false, null, true)) - for (const dataPackage of dataPackages) { - let id = dataPackage.data.id; - if (id == null) { - id = dataPackage.data["top-id"]; - } - - fs.writeFileSync('out/scraperResult/deputies/' + id + '.json', JSON.stringify(dataPackage.data)); - } - })); +export { + IProtocolScraperConfigurationOptions, + ProtocolSpeechScraperConfiguration, + ProtocolVotingScraperConfiguration, + ProposedDecisionScraperConfiguration, + DeputyProfileScraperConfiguration } - -scrape().then(c => { }); \ No newline at end of file diff --git a/src/parser/DeputyProfileParser.ts b/src/parser/DeputyProfileParser.ts index cca56dd..afe2dab 100644 --- a/src/parser/DeputyProfileParser.ts +++ b/src/parser/DeputyProfileParser.ts @@ -102,7 +102,7 @@ namespace Deputy_Parser { } // ID - let mdb_id: string = ''; + let id: string = ''; const regex_id = / { if (group === 1) { - mdb_id = match; + id = match; } }); } @@ -351,8 +351,7 @@ namespace Deputy_Parser { }); } - const id = `${mdb_id}_${name}`.replace(/(\.|\/| |,)/g, '_'); - const result: any = { id, img, name, party, job, buero, links, bio, wk, wk_name, aemter, speeches, votes, publication_requirement, mdb_id }; + const result: any = { id, img, name, party, job, buero, links, bio, wk, wk_name, aemter, speeches, votes, publication_requirement }; return [{ metadata: data.metadata, diff --git a/src/run.ts b/src/run.ts new file mode 100644 index 0000000..958a5cf --- /dev/null +++ b/src/run.ts @@ -0,0 +1,38 @@ +import * as fs from 'fs'; +import * as util from 'util'; + +import { Scraper } from 'scapacra'; + +import { + IProtocolScraperConfigurationOptions, + ProtocolSpeechScraperConfiguration, + ProtocolVotingScraperConfiguration, + ProposedDecisionScraperConfiguration, + DeputyProfileScraperConfiguration +} from './'; + +async function scrape() { + let options: IProtocolScraperConfigurationOptions = { + maxCount: 2 + }; + + await Scraper.scrape([ + // new ProtocolSpeechScraperConfiguration(options), + // new ProtocolVotingScraperConfiguration(options), + // new ProposedDecisionScraperConfiguration() + new DeputyProfileScraperConfiguration() + ], ((dataPackages) => { + console.log(util.inspect(dataPackages, false, null, true)) + for (const dataPackage of dataPackages) { + let id = dataPackage.data.id; + if (id == null) { + id = dataPackage.data["top-id"]; + } + + const file_id = `${dataPackage.data.id}_${dataPackage.data.name}`.replace(/(\.|\/| |,)/g, '_'); + fs.writeFileSync('out/scraperResult/deputies/' + file_id + '.json', JSON.stringify(dataPackage)); + } + })); +} + +scrape().then(c => { }); \ No newline at end of file diff --git a/tsconfig.json b/tsconfig.json index 65059fe..f0b8809 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,17 +1,17 @@ { "compilerOptions": { /* Basic Options */ - "target": "es6", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', or 'ESNEXT'. */ - "module": "commonjs", /* Specify module code generation: 'commonjs', 'amd', 'system', 'umd' or 'es2015'. */ - "watch": true, + "target": "es6", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', or 'ESNEXT'. */ + "module": "commonjs", /* Specify module code generation: 'commonjs', 'amd', 'system', 'umd' or 'es2015'. */ + "watch": false, // "lib": [], /* Specify library files to be included in the compilation: */ // "allowJs": true, /* Allow javascript files to be compiled. */ // "checkJs": true, /* Report errors in .js files. */ // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ - "declaration": true, /* Generates corresponding '.d.ts' file. */ + "declaration": true, /* Generates corresponding '.d.ts' file. */ // "sourceMap": true, /* Generates corresponding '.map' file. */ // "outFile": "./", /* Concatenate and emit output to single file. */ - "outDir": "./dist", /* Redirect output structure to the directory. */ + "outDir": "./dist", /* Redirect output structure to the directory. */ // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ // "removeComments": true, /* Do not emit comments to output. */ // "noEmit": true, /* Do not emit outputs. */ @@ -19,7 +19,7 @@ // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ /* Strict Type-Checking Options */ - "strict": true /* Enable all strict type-checking options. */ + "strict": true /* Enable all strict type-checking options. */ // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ // "strictNullChecks": true, /* Enable strict null checks. */ // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ diff --git a/yarn.lock b/yarn.lock index b447954..ee0c8a1 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1601,9 +1601,9 @@ sax@>=0.6.0, sax@^1.2.4: resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw== -"scapacra@https://github.com/demokratie-live/scapacra.git#pre-release/1.0.1": +"scapacra@https://github.com/demokratie-live/scapacra.git#master": version "1.0.0" - resolved "https://github.com/demokratie-live/scapacra.git#424cb420ba18627ae2ffe0a60d93b67cafa0547b" + resolved "https://github.com/demokratie-live/scapacra.git#b60ff4b95e431420606a4f77c1945fccfac6d46a" dependencies: "@types/xml2js" "^0.4.3" "@types/xmldom" "^0.1.29" From c59ec0fdb18efbb7cde298957a30a4567e30a132 Mon Sep 17 00:00:00 2001 From: Ulf Gebhardt Date: Tue, 15 Jan 2019 13:07:28 +0100 Subject: [PATCH 02/10] referenced scapacra 1.0.0 instead of master --- package.json | 2 +- yarn.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index b433ab1..2e1cf0a 100644 --- a/package.json +++ b/package.json @@ -40,7 +40,7 @@ "@types/xmldom": "^0.1.29", "axios": "^0.18.0", "jsonschema": "^1.2.4", - "scapacra": "https://github.com/demokratie-live/scapacra.git#master", + "scapacra": "https://github.com/demokratie-live/scapacra.git#1.0.0", "typescript": "^3.1.6", "url": "^0.11.0", "xml2js": "^0.4.19", diff --git a/yarn.lock b/yarn.lock index ee0c8a1..3220656 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1601,7 +1601,7 @@ sax@>=0.6.0, sax@^1.2.4: resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9" integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw== -"scapacra@https://github.com/demokratie-live/scapacra.git#master": +"scapacra@https://github.com/demokratie-live/scapacra.git#1.0.0": version "1.0.0" resolved "https://github.com/demokratie-live/scapacra.git#b60ff4b95e431420606a4f77c1945fccfac6d46a" dependencies: From 9392441d9111e89374fc83b29c74507083270a77 Mon Sep 17 00:00:00 2001 From: Ulf Gebhardt Date: Tue, 15 Jan 2019 13:33:59 +0100 Subject: [PATCH 03/10] corrected package name, included prepare script, version bump --- package.json | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/package.json b/package.json index 2e1cf0a..c4792a6 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { - "name": "bt-scapacra", - "version": "1.0.0", - "description": "Bundestag scapacra", + "name": "scapacra-bt", + "version": "1.0.1", + "description": "Scapacra Bundestag", "main": "./dist/index.js", "typings": "./dist/index.d.ts", "author": "DEMOCRACY Deutschland e.V.", @@ -17,7 +17,8 @@ "dev": "nodemon .\\src\\run.ts", "lint": "tslint --project tsconfig.json && yarn typecheck", "typecheck": "tsc --noEmit", - "test": "mocha -r ts-node/register test/**/*.test.ts" + "test": "mocha -r ts-node/register test/**/*.test.ts", + "prepare": "yarn run build" }, "bugs": { "url": "https://github.com/demokratie-live/bt-scapacra/issues" From a41c7ef582402c43e46c0bf1a9ccef5f4485b87c Mon Sep 17 00:00:00 2001 From: Ulf Gebhardt Date: Tue, 15 Jan 2019 14:05:53 +0100 Subject: [PATCH 04/10] adjusted compile options to not include test & run.ts --- tsconfig.json | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tsconfig.json b/tsconfig.json index f0b8809..1f1ad58 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -45,5 +45,11 @@ /* Experimental Options */ // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ - } + }, + "include": [ + "src/**/*.ts" + ], + "exclude": [ + "src/run.ts" + ] } \ No newline at end of file From 972349128da5852cb8f18b18c584b3216c5e9d91 Mon Sep 17 00:00:00 2001 From: Ulf Gebhardt Date: Thu, 17 Jan 2019 11:53:44 +0100 Subject: [PATCH 05/10] cleanup result, included directCandidate field --- src/parser/DeputyProfileParser.ts | 221 +++++++++++++++++------------- yarn.lock | 36 ++--- 2 files changed, 140 insertions(+), 117 deletions(-) diff --git a/src/parser/DeputyProfileParser.ts b/src/parser/DeputyProfileParser.ts index afe2dab..c7bbeb5 100644 --- a/src/parser/DeputyProfileParser.ts +++ b/src/parser/DeputyProfileParser.ts @@ -28,19 +28,19 @@ namespace Deputy_Parser { let m; - //Img & Name - let img: string = ''; + //ImgURL & Name + let imgURL: string = ''; let name: string = ''; - const regex_img_name = /
[\s\S]*?[\s\S]*?/gm; - while ((m = regex_img_name.exec(string)) !== null) { + const regex_imgURL_name = /
[\s\S]*?[\s\S]*?/gm; + while ((m = regex_imgURL_name.exec(string)) !== null) { // This is necessary to avoid infinite loops with zero-width matches - if (m.index === regex_img_name.lastIndex) { - regex_img_name.lastIndex++; + if (m.index === regex_imgURL_name.lastIndex) { + regex_imgURL_name.lastIndex++; } // The result can be accessed through the `m`-variable. m.forEach((match, group) => { if (group === 1) { - img = base_url + match; + imgURL = base_url + match; } if (group === 2) { name = match; @@ -84,19 +84,19 @@ namespace Deputy_Parser { }); } - // Büro - let buero: string[] = []; - const regex_buero = /
Abgeordnetenbüro<\/h5>[\s\S]*?

([\s\S]*?)<\/p>/gm; - while ((m = regex_buero.exec(string)) !== null) { + // Office + let office: string[] = []; + const regex_office = /

Abgeordnetenbüro<\/h5>[\s\S]*?

([\s\S]*?)<\/p>/gm; + while ((m = regex_office.exec(string)) !== null) { // This is necessary to avoid infinite loops with zero-width matches - if (m.index === regex_buero.lastIndex) { - regex_buero.lastIndex++; + if (m.index === regex_office.lastIndex) { + regex_office.lastIndex++; } // The result can be accessed through the `m`-variable. m.forEach((match, group) => { if (group === 1) { - buero = match.split(/
|/); + office = match.split(/
|/); } }); } @@ -156,202 +156,225 @@ namespace Deputy_Parser { } - // Bio - let bio_sel: string = ''; - let bio: string[] = []; - const regex_bio_sel = /

Biografie<\/h4>[\s\S]*?
[\s\S]*?
([\s\S]*?)<\/div>/gm; - const regex_bio = /

([\s\S]*?)<\/p>/gm; - while ((m = regex_bio_sel.exec(string)) !== null) { + // Biography + let biography_sel: string = ''; + let biography: string[] = []; + const regex_biography_sel = /

Biografie<\/h4>[\s\S]*?
[\s\S]*?
([\s\S]*?)<\/div>/gm; + const regex_biography = /

([\s\S]*?)<\/p>/gm; + while ((m = regex_biography_sel.exec(string)) !== null) { // This is necessary to avoid infinite loops with zero-width matches - if (m.index === regex_bio_sel.lastIndex) { - regex_bio_sel.lastIndex++; + if (m.index === regex_biography_sel.lastIndex) { + regex_biography_sel.lastIndex++; } // The result can be accessed through the `m`-variable. m.forEach((match, group) => { if (group === 1) { - bio_sel = match; + biography_sel = match; } }); } - while ((m = regex_bio.exec(bio_sel)) !== null) { + while ((m = regex_biography.exec(biography_sel)) !== null) { // This is necessary to avoid infinite loops with zero-width matches - if (m.index === regex_bio.lastIndex) { - regex_bio.lastIndex++; + if (m.index === regex_biography.lastIndex) { + regex_biography.lastIndex++; } // The result can be accessed through the `m`-variable. m.forEach((match, group) => { if (group === 1) { - bio.push(match.replace(/<\/?[^>]+(>|$)/g, '')); + biography.push(match.replace(/<\/?[^>]+(>|$)/g, '')); } }); } - // WK - let wk: string = ''; - let wk_name: string = ''; - const regex_wk = /Gewählt über Landesliste<\/h4>/ + const regex_directCandidate = /

Direkt gewählt<\/h4>/; + if (regex_directCandidate.exec(string) !== null) { + directCandidate = true; + } + + // Constituency + let constituency: string = ''; + let constituencyName: string = ''; + const regex_constituency = / { if (group === 1) { - wk = match; + constituency = match; } if (group === 2) { - wk_name = match; + constituencyName = match; } }); } - // Ämter - let aemter_sel: string = ''; - let aemter_raw: string[] = []; - let aemter: any[] = []; - const regex_aemter_sel = /
([\s\S]*?)

Veröffentlichungspflichtige Angaben<\/h4>/gm; - const regex_aemter = /

([\s\S]*?)<\/h5>[\s\S]*?