Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#124 PageQueryIterable doesn't escape special characters #272

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.dkpro.jwpl.util.ApiUtilities;
import org.dkpro.jwpl.util.StringUtils;
import org.hibernate.Session;
import org.hibernate.query.Query;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -48,6 +49,7 @@ public PageQueryIterable(Wikipedia wiki, PageQuery q) throws WikiApiException

this.wiki = wiki;
this.pageIdList = new ArrayList<>();
boolean hasTitlePattern = false;

// get a list with all pageIDs of the pages conforming with the query
String hql = "select p.pageId from Page as p ";
Expand All @@ -58,18 +60,23 @@ public PageQueryIterable(Wikipedia wiki, PageQuery q) throws WikiApiException
if (q.onlyArticlePages()) {
conditions.add("p.isDisambiguation = 0");
}
if (!"".equals(q.getTitlePattern())) {
conditions.add("p.name like '" + q.getTitlePattern() + "'");
if (q.getTitlePattern() != null && !q.getTitlePattern().isBlank()) {
conditions.add("p.name like :name");
hasTitlePattern = true;
}

String conditionString = StringUtils.join(conditions, " AND ");
if (conditionString.length() > 0) {
if (!conditionString.isEmpty()) {
hql += "where " + conditionString;
}

Session session = this.wiki.__getHibernateSession();
session.beginTransaction();
List<Integer> idList = session.createQuery(hql, Integer.class).list();
Query<Integer> query = session.createQuery(hql, Integer.class);
if (hasTitlePattern) {
query.setParameter("name", q.getTitlePattern());
}
List<Integer> idList = query.list();
session.getTransaction().commit();

int progress = 0;
Expand All @@ -95,14 +102,10 @@ public PageQueryIterable(Wikipedia wiki, PageQuery q) throws WikiApiException
page = wiki.getPage(pageID);
}
catch (WikiPageNotFoundException e) {
logger.error("Page with pageID {} could not be found. Fatal error. Terminating.",
logger.warn("Page with pageID {} could not be found. Fatal error. Terminating.",
pageID);
e.printStackTrace();
System.exit(1);
}

String[] tokens = page.getPlainText().split(" ");

if (!(q.getMinIndegree() >= 0 && q.getMaxIndegree() >= 0
&& q.getMinIndegree() <= q.getMaxIndegree())) {
q.setMinIndegree(0);
Expand Down Expand Up @@ -157,6 +160,8 @@ public PageQueryIterable(Wikipedia wiki, PageQuery q) throws WikiApiException
if (categoriesSize < q.getMinCategories() || categoriesSize > q.getMaxCategories()) {
continue;
}

String[] tokens = page.getPlainText().split(" ");
if (tokens.length < q.getMinTokens() || tokens.length > q.getMaxTokens()) {
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public void test_pageIteratorTest()
assertNotNull(p);
nrOfPages++;
}
assertEquals(33, nrOfPages, "Number of pages == 33");
assertEquals(34, nrOfPages, "Number of pages == 34");

while (articleIter.hasNext()) {
Page p = articleIter.next();
Expand All @@ -73,7 +73,7 @@ public void test_pageIteratorTest()
}

// Assuming 34 is the correct number now
assertEquals(33, nrOfArticles, "Number of articles == 33");
assertEquals(34, nrOfArticles, "Number of articles == 34");

}

Expand All @@ -92,7 +92,7 @@ public void test_pageIteratorTestBufferSize()
Page p = pageIter.next();
nrOfPages++;
}
assertEquals(33, nrOfPages, "Number of pages == 33");
assertEquals(34, nrOfPages, "Number of pages == 34");
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dkpro.jwpl.api;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

import java.util.Iterator;

import org.dkpro.jwpl.api.exception.WikiApiException;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.NullAndEmptySource;
import org.junit.jupiter.params.provider.ValueSource;

/**
 * Tests for {@link PageQueryIterable}, covering title patterns that contain SQL wildcard
 * characters as well as a single quote (issue #124), and null/blank title patterns.
 */
public class PageQueryIterableTest
    extends BaseJWPLTest
{

    /** Fresh query object, re-created before every test so settings cannot leak between tests. */
    private PageQuery pq;

    /**
     * Initializes the shared {@code wiki} instance from the HSQLDB test configuration.
     * Fails the whole test class if the instance cannot be created.
     */
    @BeforeAll
    public static void setupWikipedia()
    {
        DatabaseConfiguration db = obtainHSDLDBConfiguration();
        try {
            wiki = new Wikipedia(db);
        }
        catch (Exception e) {
            // Pass the exception along as cause so the original stack trace is reported.
            fail("Wikipedia could not be initialized: " + e.getLocalizedMessage(), e);
        }
    }

    /** Creates an unconfigured {@link PageQuery} for each test. */
    @BeforeEach
    public void setup()
    {
        pq = new PageQuery();
    }

    /**
     * Shows that no exception occurs during creation with a default query.
     *
     * @throws WikiApiException
     *             if the iterable could not be created
     */
    @Test
    public void testCreatePageQueryIterable() throws WikiApiException
    {
        PageQueryIterable pqi = new PageQueryIterable(wiki, pq);
        assertNotNull(pqi);
    }

    /**
     * The example with a {@code '} character in the title pattern verifies issue #124.
     *
     * @param input
     *            the title pattern to query for
     * @throws WikiApiException
     *             if the iterable could not be created
     */
    @ParameterizedTest
    @ValueSource(strings = {"Wikipedia%", "Wiki_edia%", "Moore'%"})
    public void testIteratorWithValidTitlePattern(String input) throws WikiApiException
    {
        pq.setTitlePattern(input);
        pq.setOnlyArticlePages(true);
        int count = countPages(new PageQueryIterable(wiki, pq));
        assertTrue(count >= 1, "Expected at least one page for title pattern: " + input);
    }

    /**
     * A null or blank title pattern should fetch all IDs in the database.
     *
     * @param input
     *            the null, empty or whitespace-only title pattern
     * @throws WikiApiException
     *             if the iterable could not be created
     */
    @ParameterizedTest
    @NullAndEmptySource
    @ValueSource(strings = {" ", "\t", "\n"})
    public void testIteratorWithEmptyTitlePattern(String input) throws WikiApiException
    {
        pq.setTitlePattern(input);
        int count = countPages(new PageQueryIterable(wiki, pq));
        assertEquals(34, count);
    }

    /**
     * Combines a title pattern with a maximum token count of 20 and expects at least one hit.
     *
     * @param input
     *            the title pattern to query for
     * @throws WikiApiException
     *             if the iterable could not be created
     */
    @ParameterizedTest
    @ValueSource(strings = {"Wikipedia%", "Wiki_edia%", "Moore'%"})
    public void testIteratorWithMaTokens(String input) throws WikiApiException
    {
        pq.setTitlePattern(input);
        pq.setMaxTokens(20);
        int count = countPages(new PageQueryIterable(wiki, pq));
        assertTrue(count >= 1, "Expected at least one page for title pattern: " + input);
    }

    /**
     * Walks the given iterable to its end, asserting that the iterable, its iterator and every
     * returned page are non-null.
     *
     * @param pqi
     *            the iterable under test
     * @return the number of pages the iterator produced
     */
    private static int countPages(PageQueryIterable pqi)
    {
        assertNotNull(pqi);
        Iterator<Page> it = pqi.iterator();
        assertNotNull(it);
        int count = 0;
        while (it.hasNext()) {
            count++;
            assertNotNull(it.next());
        }
        return count;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public void test_titleIteratorTest()
assertNotNull(t);
nrOfTitles++;
}
assertEquals(39, nrOfTitles, "Number of titles == 39");
assertEquals(40, nrOfTitles, "Number of titles == 40");

}
}
8 changes: 4 additions & 4 deletions dkpro-jwpl-api/src/test/resources/db/wikiapi_test.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#HSQL Database Engine 2.4.0
#Tue Aug 21 12:43:37 CEST 2018
version=2.4.0
#HSQL Database Engine 2.7.2
#Mon Nov 06 15:01:36 CET 2023
tx_timestamp=119
modified=no
tx_timestamp=0
version=2.7.2
25 changes: 14 additions & 11 deletions dkpro-jwpl-api/src/test/resources/db/wikiapi_test.script
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
SET DATABASE UNIQUE NAME HSQLDB64408D16F3
SET DATABASE GC 0
SET DATABASE DEFAULT RESULT MEMORY ROWS 0
SET DATABASE EVENT LOG LEVEL 0
SET DATABASE TRANSACTION CONTROL LOCKS
SET DATABASE DEFAULT ISOLATION LEVEL READ COMMITTED
SET DATABASE TRANSACTION ROLLBACK ON CONFLICT TRUE
SET DATABASE TEXT TABLE DEFAULTS ''
SET DATABASE SQL NAMES FALSE
SET DATABASE SQL RESTRICT EXEC FALSE
SET DATABASE SQL REFERENCES FALSE
SET DATABASE SQL SIZE FALSE
SET DATABASE SQL TYPES FALSE
SET DATABASE SQL TDC DELETE TRUE
SET DATABASE SQL TDC UPDATE TRUE
SET DATABASE SQL SYS INDEX NAMES FALSE
SET DATABASE SQL CONCAT NULLS TRUE
SET DATABASE SQL UNIQUE NULLS TRUE
SET DATABASE SQL CONVERT TRUNCATE TRUE
SET DATABASE SQL AVG SCALE 0
SET DATABASE SQL DOUBLE NAN TRUE
SET FILES WRITE DELAY 0
SET FILES BACKUP INCREMENT FALSE
SET FILES BACKUP INCREMENT TRUE
SET FILES CACHE SIZE 10000
SET FILES CACHE ROWS 50000
SET FILES SCALE 1
Expand All @@ -28,19 +29,19 @@ SET FILES NIO TRUE
SET FILES NIO SIZE 256
SET FILES LOG TRUE
SET FILES LOG SIZE 200
SET FILES CHECK 119
SET DATABASE COLLATION "German" NO PAD
CREATE USER SA PASSWORD DIGEST 'd41d8cd98f00b204e9800998ecf8427e'
CREATE SCHEMA PUBLIC AUTHORIZATION DBA
SET SCHEMA PUBLIC
CREATE MEMORY TABLE PUBLIC.CATEGORY(ID BIGINT GENERATED BY DEFAULT AS IDENTITY(START WITH 1) NOT NULL PRIMARY KEY,PAGEID INTEGER,NAME VARCHAR(255),UNIQUE(PAGEID))
ALTER TABLE PUBLIC.CATEGORY ALTER COLUMN ID RESTART WITH 18
CREATE INDEX NAMEINDEX ON PUBLIC.CATEGORY(NAME)
CREATE MEMORY TABLE PUBLIC.METADATA(ID BIGINT GENERATED BY DEFAULT AS IDENTITY(START WITH 1) NOT NULL PRIMARY KEY,LANGUAGE VARCHAR(255),DISAMBIGUATIONCATEGORY VARCHAR(255),MAINCATEGORY VARCHAR(255),NROFPAGES BIGINT,NROFREDIRECTS BIGINT,NROFDISAMBIGUATIONPAGES BIGINT,NROFCATEGORIES BIGINT,VERSION VARCHAR(255))
ALTER TABLE PUBLIC.METADATA ALTER COLUMN ID RESTART WITH 2
CREATE MEMORY TABLE PUBLIC.PAGE(ID BIGINT GENERATED BY DEFAULT AS IDENTITY(START WITH 1) NOT NULL PRIMARY KEY,PAGEID INTEGER,NAME VARCHAR(255),TEXT VARCHAR(16777216),ISDISAMBIGUATION BOOLEAN,UNIQUE(PAGEID))
ALTER TABLE PUBLIC.PAGE ALTER COLUMN ID RESTART WITH 31
ALTER TABLE PUBLIC.PAGE ALTER COLUMN ID RESTART WITH 35
CREATE MEMORY TABLE PUBLIC.PAGEMAPLINE(ID BIGINT GENERATED BY DEFAULT AS IDENTITY(START WITH 1) NOT NULL PRIMARY KEY,NAME VARCHAR(255),PAGEID INTEGER,STEM VARCHAR(255),LEMMA VARCHAR(255))
ALTER TABLE PUBLIC.PAGEMAPLINE ALTER COLUMN ID RESTART WITH 37
ALTER TABLE PUBLIC.PAGEMAPLINE ALTER COLUMN ID RESTART WITH 41
CREATE INDEX NAME_INDEX ON PUBLIC.PAGEMAPLINE(NAME)
CREATE MEMORY TABLE PUBLIC.CATEGORY_INLINKS(ID BIGINT NOT NULL,INLINKS INTEGER,CONSTRAINT FK3F4337732A72A718 FOREIGN KEY(ID) REFERENCES PUBLIC.CATEGORY(ID))
CREATE MEMORY TABLE PUBLIC.CATEGORY_OUTLINKS(ID BIGINT NOT NULL,OUTLINKS INTEGER,CONSTRAINT FK9885334C2A72A718 FOREIGN KEY(ID) REFERENCES PUBLIC.CATEGORY(ID))
Expand All @@ -51,11 +52,11 @@ CREATE MEMORY TABLE PUBLIC.PAGE_OUTLINKS(ID BIGINT NOT NULL,OUTLINKS INTEGER,CON
CREATE MEMORY TABLE PUBLIC.PAGE_REDIRECTS(ID BIGINT NOT NULL,REDIRECTS VARCHAR(255),CONSTRAINT FK1484BA67F9E0A429 FOREIGN KEY(ID) REFERENCES PUBLIC.PAGE(ID))
ALTER SEQUENCE SYSTEM_LOBS.LOB_ID RESTART WITH 1
SET DATABASE DEFAULT INITIAL SCHEMA PUBLIC
GRANT USAGE ON DOMAIN INFORMATION_SCHEMA.SQL_IDENTIFIER TO PUBLIC
GRANT USAGE ON DOMAIN INFORMATION_SCHEMA.YES_OR_NO TO PUBLIC
GRANT USAGE ON DOMAIN INFORMATION_SCHEMA.TIME_STAMP TO PUBLIC
GRANT USAGE ON DOMAIN INFORMATION_SCHEMA.CARDINAL_NUMBER TO PUBLIC
GRANT USAGE ON DOMAIN INFORMATION_SCHEMA.YES_OR_NO TO PUBLIC
GRANT USAGE ON DOMAIN INFORMATION_SCHEMA.CHARACTER_DATA TO PUBLIC
GRANT USAGE ON DOMAIN INFORMATION_SCHEMA.SQL_IDENTIFIER TO PUBLIC
GRANT USAGE ON DOMAIN INFORMATION_SCHEMA.TIME_STAMP TO PUBLIC
GRANT DBA TO SA
SET SCHEMA SYSTEM_LOBS
INSERT INTO BLOCKS VALUES(0,2147483647,0)
Expand All @@ -78,7 +79,7 @@ INSERT INTO CATEGORY VALUES(15,10,'SIR')
INSERT INTO CATEGORY VALUES(16,5,'People_of_Telecooperation')
INSERT INTO CATEGORY VALUES(17,30,'Unconnected_category')
INSERT INTO METADATA VALUES(1,'test','Disambiguation','Telecooperation',36,6,2,17,'1.0')
INSERT INTO PAGE VALUES(1,1014,'Wikipedia_API','Wikipedia API ist die wichtigste [[Software]] \u00fcberhaupt.\u000D\u000A[[JWPL|Wikipedia API]].\u000D\u000A\u000D\u000A*Nicht zu \u00fcbertreffen.\u000D\u000A\u000D\u000A*Unglaublich\u000D\u000A\u000D\u000A*[[http://www.ukp.tu-darmstadt.de]]\u000D\u000A\u000D\u000A[[en:Wikipedia API]] [[fi:WikipediaAPI]]',FALSE)
INSERT INTO PAGE VALUES(1,1014,'Wikipedia_API','Wikipedia API ist die wichtigste [[Software]] \u00fcberhaupt.\u000d\u000a[[JWPL|Wikipedia API]].\u000d\u000a\u000d\u000a*Nicht zu \u00fcbertreffen.\u000d\u000a\u000d\u000a*Unglaublich\u000d\u000a\u000d\u000a*[[http://www.ukp.tu-darmstadt.de]]\u000d\u000a\u000d\u000a[[en:Wikipedia API]] [[fi:WikipediaAPI]]',FALSE)
INSERT INTO PAGE VALUES(2,1019,'Christoph_Mueller','Christoph_Mueller',FALSE)
INSERT INTO PAGE VALUES(3,107,'TK3','TK3',FALSE)
INSERT INTO PAGE VALUES(4,1022,'Torsten_Zesch','Torsten_Zesch',FALSE)
Expand Down Expand Up @@ -109,8 +110,9 @@ INSERT INTO PAGE VALUES(28,1031,'Max_Muehlhaeuser','Max_Muehlhaeuser',FALSE)
INSERT INTO PAGE VALUES(29,1032,'Semantic_information_retrieval_(computer_science)','This is semantic information retrieval in the sense of computer science.',FALSE)
INSERT INTO PAGE VALUES(30,2000,'Unconnected_page','This page is unconnected from all other pages',FALSE)
INSERT INTO PAGE VALUES(31,4000,'Discussion:Wikipedia_API','This page is a discussion page for Wikipedia_API.',FALSE)
INSERT INTO PAGE VALUES(32,6000,'Humanbiologie','Die Attraktivit\u00E4t der Fachrichtungen Humanbiologie beziehungsweise Biomedizin als Studienf\u00E4cher ist in j\u00fcngerer Zeit deutlich gestiegen.\u000D\u000A {| cellspacing="5"\u000D\u000A !align="left"|Studiengang\u000D\u000A!align="left"|besteht seit\u000D\u000A!align="left"|Abschluss\u000D\u000A!align="left"|Hochschule\u000D\u000A|-\u000D\u000A|Humanbiologie (Biomedical Science)\u000D\u000A|1979\u000D\u000A|[[Bachelor#Der Bachelor-Abschluss in Europa Bachelor]] / Master\u000D\u000A|[[Philipps-Universit\u00E6t Marburg|Marburg (U)]]\u000D\u000A|-\u000D\u000A|Molekulare Biomedizin\u000D\u000A|2014\u000D\u000A|Bachelor\u000D\u000A|[[Rheinische Fachhochschule K\u00f6ln]]\u000D\u000A|}\u000D\u000A[[Kategorie:Biologie]] [[Kategorie:Medizin]] [[Kategorie:Humangenetik]] [[Kategorie:Studienfach]]', FALSE)
INSERT INTO PAGE VALUES(33,6001,'Liste_von_Materia_Medica_der_traditionellen_uigurischen_Medizin','Dies ist eine Liste von [[Materia Medica]] der traditionellen [[Uiguren|uigurischen]] [[Medizin]]. Die uigurische Medizin entwickelte sich aus der [[Arabische Medizin|arabischen Medizin]], der [[Medizin_des_Altertums#Medizin im Antiken Griechenland|antiken griechischen Medizin]] und der [[Traditionelle chinesische Medizin|traditionellen chinesischen Medizin]].<ref>[http://www.cintcm.com/e_cintcm/e_cmm/Uigur%20drugs.htm cintcm.com: The traditional Uigur drugs] – gefunden am 13. Juli 2010</ref> \u00DCbersicht <small><center>Quellen: [http://www.cintcm.com/e_cintcm/e_cmm/Uigur%20drugs.htm cintcm.com], [http://www.tcm-resources.com/xiangguan/zhongdianziyuan/minzuyaozhonglei.doc tcm-resources.com]</center></small>', FALSE)
INSERT INTO PAGE VALUES(32,6000,'Humanbiologie','Die Attraktivit\u00e4t der Fachrichtungen Humanbiologie beziehungsweise Biomedizin als Studienf\u00e4cher ist in j\u00fcngerer Zeit deutlich gestiegen.\u000d\u000a {| cellspacing="5"\u000d\u000a !align="left"|Studiengang\u000d\u000a!align="left"|besteht seit\u000d\u000a!align="left"|Abschluss\u000d\u000a!align="left"|Hochschule\u000d\u000a|-\u000d\u000a|Humanbiologie (Biomedical Science)\u000d\u000a|1979\u000d\u000a|[[Bachelor#Der Bachelor-Abschluss in Europa Bachelor]] / Master\u000d\u000a|[[Philipps-Universit\u00e6t Marburg|Marburg (U)]]\u000d\u000a|-\u000d\u000a|Molekulare Biomedizin\u000d\u000a|2014\u000d\u000a|Bachelor\u000d\u000a|[[Rheinische Fachhochschule K\u00f6ln]]\u000d\u000a|}\u000d\u000a[[Kategorie:Biologie]] [[Kategorie:Medizin]] [[Kategorie:Humangenetik]] [[Kategorie:Studienfach]]',FALSE)
INSERT INTO PAGE VALUES(33,6001,'Liste_von_Materia_Medica_der_traditionellen_uigurischen_Medizin','Dies ist eine Liste von [[Materia Medica]] der traditionellen [[Uiguren|uigurischen]] [[Medizin]]. Die uigurische Medizin entwickelte sich aus der [[Arabische Medizin|arabischen Medizin]], der [[Medizin_des_Altertums#Medizin im Antiken Griechenland|antiken griechischen Medizin]] und der [[Traditionelle chinesische Medizin|traditionellen chinesischen Medizin]].<ref>[http://www.cintcm.com/e_cintcm/e_cmm/Uigur%20drugs.htm cintcm.com: The traditional Uigur drugs] \u00e2\u0080\u0093 gefunden am 13. Juli 2010</ref> \u00dcbersicht <small><center>Quellen: [http://www.cintcm.com/e_cintcm/e_cmm/Uigur%20drugs.htm cintcm.com], [http://www.tcm-resources.com/xiangguan/zhongdianziyuan/minzuyaozhonglei.doc tcm-resources.com]</center></small>',FALSE)
INSERT INTO PAGE VALUES(34,6002,'Moore''s_law','Moore''s_law',FALSE)
INSERT INTO PAGEMAPLINE VALUES(1,'Net_Centric_Systems',101,NULL,NULL)
INSERT INTO PAGEMAPLINE VALUES(2,'TK1',103,NULL,NULL)
INSERT INTO PAGEMAPLINE VALUES(3,'TK2',105,NULL,NULL)
Expand Down Expand Up @@ -150,6 +152,7 @@ INSERT INTO PAGEMAPLINE VALUES(36,'Unconnected_page',2000,NULL,NULL)
INSERT INTO PAGEMAPLINE VALUES(37,'Discussion:Wikipedia_API',4000,NULL,NULL)
INSERT INTO PAGEMAPLINE VALUES(38,'Humanbiologie',6000,NULL,NULL)
INSERT INTO PAGEMAPLINE VALUES(39,'Liste_von_Materia_Medica_der_traditionellen_uigurischen_Medizin',6001,NULL,NULL)
INSERT INTO PAGEMAPLINE VALUES(40,'Moore''s_law',6002,NULL,NULL)
INSERT INTO CATEGORY_INLINKS VALUES(2,8)
INSERT INTO CATEGORY_INLINKS VALUES(3,1)
INSERT INTO CATEGORY_INLINKS VALUES(4,6)
Expand Down