From 1d59b4872956922d2f42d5dbdc6d4256ee8501ba Mon Sep 17 00:00:00 2001 From: Martin Wiesner Date: Mon, 30 Oct 2023 10:54:45 +0100 Subject: [PATCH] #251 Fix inconsistent indentation and formatting of code - corrects indentation - removes irrelevant whitespaces or empty lines - adds Override annotation where applicable / missing - fixes typos along the path - simplifies expressions for better readability --- .../java/org/dkpro/jwpl/api/Category.java | 693 +-- .../jwpl/api/CategoryDescendantsIterable.java | 61 +- .../jwpl/api/CategoryDescendantsIterator.java | 258 +- .../org/dkpro/jwpl/api/CategoryGraph.java | 2453 ++++++----- .../dkpro/jwpl/api/CategoryGraphManager.java | 152 +- .../org/dkpro/jwpl/api/CategoryIterable.java | 55 +- .../org/dkpro/jwpl/api/CategoryIterator.java | 303 +- .../jwpl/api/CategoryTitleComparator.java | 26 +- .../java/org/dkpro/jwpl/api/CycleHandler.java | 340 +- .../dkpro/jwpl/api/DatabaseConfiguration.java | 300 +- .../java/org/dkpro/jwpl/api/MetaData.java | 250 +- .../main/java/org/dkpro/jwpl/api/Page.java | 328 +- .../java/org/dkpro/jwpl/api/PageIterable.java | 68 +- .../java/org/dkpro/jwpl/api/PageIterator.java | 438 +- .../java/org/dkpro/jwpl/api/PageQuery.java | 494 ++- .../org/dkpro/jwpl/api/PageQueryIterable.java | 279 +- .../org/dkpro/jwpl/api/PageQueryIterator.java | 64 +- .../dkpro/jwpl/api/PageTitleComparator.java | 23 +- .../main/java/org/dkpro/jwpl/api/Title.java | 275 +- .../org/dkpro/jwpl/api/TitleIterable.java | 55 +- .../org/dkpro/jwpl/api/TitleIterator.java | 226 +- .../org/dkpro/jwpl/api/WikiConstants.java | 604 +-- .../java/org/dkpro/jwpl/api/Wikipedia.java | 1393 +++--- .../org/dkpro/jwpl/api/WikipediaInfo.java | 661 +-- .../jwpl/api/exception/WikiApiException.java | 32 +- .../jwpl/api/exception/WikiException.java | 32 +- .../WikiInitializationException.java | 34 +- .../exception/WikiPageNotFoundException.java | 34 +- .../exception/WikiRelatednessException.java | 40 - .../exception/WikiTitleParsingException.java | 33 +- 
.../dkpro/jwpl/api/hibernate/Category.java | 97 +- .../dkpro/jwpl/api/hibernate/CategoryDAO.java | 82 +- .../dkpro/jwpl/api/hibernate/GenericDAO.java | 168 +- .../dkpro/jwpl/api/hibernate/MetaData.java | 139 +- .../dkpro/jwpl/api/hibernate/MetaDataDAO.java | 82 +- .../org/dkpro/jwpl/api/hibernate/Page.java | 156 +- .../org/dkpro/jwpl/api/hibernate/PageDAO.java | 78 +- .../dkpro/jwpl/api/hibernate/PageMapLine.java | 83 +- .../jwpl/api/hibernate/WikiHibernateUtil.java | 2 +- .../jwpl/api/sweble/PlainTextConverter.java | 1008 ++--- .../api/sweble/TemplateNameExtractor.java | 136 +- .../jwpl/api/util/GraphSerialization.java | 159 +- .../api/util/SerializableDirectedGraph.java | 55 +- .../org/dkpro/jwpl/util/ApiUtilities.java | 106 +- .../org/dkpro/jwpl/util/CommonUtilities.java | 72 +- .../java/org/dkpro/jwpl/util/DbUtilities.java | 49 +- .../org/dkpro/jwpl/util/GraphUtilities.java | 86 +- .../dkpro/jwpl/util/HibernateUtilities.java | 78 +- .../src/main/java/org/dkpro/jwpl/util/OS.java | 59 +- .../java/org/dkpro/jwpl/util/StringUtils.java | 157 +- .../dkpro/jwpl/util/UnmodifiableArraySet.java | 213 +- .../distance/LevenshteinStringDistance.java | 115 +- .../jwpl/util/distance/StringDistance.java | 8 +- .../datamachine/domain/DataMachineFiles.java | 316 +- .../domain/DataMachineGenerator.java | 257 +- .../datamachine/domain/JWPLDataMachine.java | 101 +- .../version/SingleDumpVersionJDKGeneric.java | 454 +- .../SingleDumpVersionJDKIntKeyFactory.java | 27 +- .../SingleDumpVersionJDKLongKeyFactory.java | 3 +- .../SingleDumpVersionJDKStringKeyFactory.java | 3 +- .../version/SingleDumpVersionOriginal.java | 714 ++- .../dump/xml/BinaryDumpTableInputStream.java | 27 +- .../dump/xml/DataMachineRevisionParser.java | 27 +- .../dump/xml/SimpleBinaryDumpWriter.java | 220 +- .../dump/xml/SimpleXmlDumpReader.java | 50 +- .../jwpl/datamachine/dump/xml/XML2Binary.java | 46 +- .../file/DeleteFilesAtShutdown.java | 59 +- .../dkpro/jwpl/mwdumper/dumper/Dumper.java | 412 +- 
.../jwpl/mwdumper/dumper/ProgressFilter.java | 105 +- .../org/dkpro/jwpl/mwdumper/dumper/Tools.java | 92 +- .../importer/AfterTimeStampFilter.java | 16 +- .../importer/BeforeTimeStampFilter.java | 16 +- .../dkpro/jwpl/mwdumper/importer/Buffer.java | 51 +- .../jwpl/mwdumper/importer/Contributor.java | 24 +- .../jwpl/mwdumper/importer/DumpWriter.java | 26 +- .../mwdumper/importer/ExactListFilter.java | 14 +- .../jwpl/mwdumper/importer/LatestFilter.java | 76 +- .../jwpl/mwdumper/importer/ListFilter.java | 54 +- .../jwpl/mwdumper/importer/MultiWriter.java | 116 +- .../mwdumper/importer/NamespaceFilter.java | 96 +- .../jwpl/mwdumper/importer/NamespaceSet.java | 78 +- .../jwpl/mwdumper/importer/NotalkFilter.java | 12 +- .../dkpro/jwpl/mwdumper/importer/Page.java | 20 +- .../jwpl/mwdumper/importer/PageFilter.java | 84 +- .../jwpl/mwdumper/importer/Revision.java | 34 +- .../mwdumper/importer/RevisionListFilter.java | 114 +- .../jwpl/mwdumper/importer/Siteinfo.java | 10 +- .../jwpl/mwdumper/importer/SphinxWriter.java | 116 +- .../jwpl/mwdumper/importer/SqlFileStream.java | 44 +- .../jwpl/mwdumper/importer/SqlLiteral.java | 18 +- .../mwdumper/importer/SqlServerStream.java | 66 +- .../jwpl/mwdumper/importer/SqlStream.java | 14 +- .../jwpl/mwdumper/importer/SqlWriter.java | 652 +-- .../jwpl/mwdumper/importer/SqlWriter14.java | 126 +- .../jwpl/mwdumper/importer/SqlWriter15.java | 192 +- .../mwdumper/importer/TimeStampFilter.java | 74 +- .../dkpro/jwpl/mwdumper/importer/Title.java | 152 +- .../mwdumper/importer/TitleMatchFilter.java | 20 +- .../jwpl/mwdumper/importer/XmlDumpReader.java | 720 +-- .../jwpl/mwdumper/importer/XmlDumpWriter.java | 241 +- .../jwpl/mwdumper/importer/XmlWriter.java | 352 +- .../java/org/dkpro/jwpl/parser/Content.java | 170 +- .../dkpro/jwpl/parser/ContentContainer.java | 469 +- .../org/dkpro/jwpl/parser/ContentElement.java | 569 +-- .../org/dkpro/jwpl/parser/DefinitionList.java | 176 +- .../main/java/org/dkpro/jwpl/parser/Link.java | 238 +- 
.../jwpl/parser/LinkAnchorExtractor.java | 172 +- .../org/dkpro/jwpl/parser/NestedList.java | 12 +- .../jwpl/parser/NestedListContainer.java | 101 +- .../dkpro/jwpl/parser/NestedListElement.java | 13 +- .../java/org/dkpro/jwpl/parser/Paragraph.java | 63 +- .../org/dkpro/jwpl/parser/ParsedPage.java | 606 +-- .../dkpro/jwpl/parser/ParsedPageObject.java | 27 +- .../java/org/dkpro/jwpl/parser/Section.java | 254 +- .../dkpro/jwpl/parser/SectionContainer.java | 327 +- .../org/dkpro/jwpl/parser/SectionContent.java | 276 +- .../main/java/org/dkpro/jwpl/parser/Span.java | 307 +- .../java/org/dkpro/jwpl/parser/SrcSpan.java | 84 +- .../java/org/dkpro/jwpl/parser/Table.java | 298 +- .../org/dkpro/jwpl/parser/TableElement.java | 143 +- .../java/org/dkpro/jwpl/parser/Template.java | 98 +- .../dkpro/jwpl/parser/html/HtmlWriter.java | 858 ++-- .../org/dkpro/jwpl/parser/html/ParsedPage.css | 261 +- .../dkpro/jwpl/parser/html/ParsedPageCSS.java | 186 +- .../mediawiki/EmptyStructureRemover.java | 246 +- .../jwpl/parser/mediawiki/FlushTemplates.java | 29 +- .../mediawiki/GermanTemplateParser.java | 204 +- .../MediaWikiContentElementParser.java | 15 +- .../parser/mediawiki/MediaWikiParser.java | 31 +- .../mediawiki/MediaWikiParserFactory.java | 1020 +++-- .../mediawiki/MediaWikiTemplateParser.java | 33 +- .../jwpl/parser/mediawiki/ModularParser.java | 3911 ++++++++--------- .../parser/mediawiki/ParserConstants.java | 14 +- .../parser/mediawiki/ResolvedTemplate.java | 160 +- .../ShowTemplateNamesAndParameters.java | 57 +- .../jwpl/parser/mediawiki/SpanManager.java | 450 +- .../parser/mediawiki/SrcPosRangeChecker.java | 233 +- .../parser/selectiveaccess/ConfigLoader.java | 202 +- .../SelectiveAccessHandler.java | 707 ++- .../api/AbstractRevisionService.java | 101 +- .../api/ChronoRevisionIterator.java | 633 +-- .../jwpl/revisionmachine/api/Contributor.java | 86 +- .../jwpl/revisionmachine/api/Revision.java | 856 ++-- .../api/RevisionAPIConfiguration.java | 394 +- 
.../jwpl/revisionmachine/api/RevisionApi.java | 3446 +++++++-------- .../api/RevisionDataInterface.java | 105 +- .../revisionmachine/api/RevisionIterator.java | 861 ++-- .../api/RevisionIteratorInterface.java | 28 +- .../api/chrono/ChronoFullRevision.java | 663 ++- .../api/chrono/ChronoIterator.java | 462 +- .../api/chrono/ChronoStorage.java | 632 ++- .../api/chrono/ChronoStorageBlock.java | 383 +- .../archivers/Bzip2Archiver.java | 305 +- .../exceptions/ArticleReaderException.java | 66 +- .../exceptions/ConfigurationException.java | 115 +- .../common/exceptions/DecodingException.java | 69 +- .../common/exceptions/DiffException.java | 69 +- .../common/exceptions/EncodingException.java | 69 +- .../common/exceptions/ErrorFactory.java | 619 ++- .../common/exceptions/ErrorKeys.java | 70 +- .../common/exceptions/LoggingException.java | 69 +- .../exceptions/SQLConsumerException.java | 69 +- .../common/exceptions/TimeoutException.java | 69 +- .../common/logging/Logger.java | 483 +- .../common/logging/LoggerType.java | 70 +- .../common/logging/LoggingFactory.java | 130 +- .../logging/messages/DiffToolLogMessages.java | 160 +- .../consumer/ArticleConsumerLogMessages.java | 462 +- .../consumer/ConsumerLogMessages.java | 308 +- .../consumer/DiffConsumerLogMessages.java | 252 +- .../consumer/SQLConsumerLogMessages.java | 134 +- .../common/util/LetterNode.java | 244 +- .../common/util/MathUtilities.java | 163 +- .../common/util/MultipleKeywordTree.java | 193 +- .../common/util/SingleKeywordTree.java | 142 +- .../common/util/Surrogates.java | 102 +- .../revisionmachine/common/util/Time.java | 338 +- .../common/util/WikipediaXMLKeys.java | 246 +- .../common/util/WikipediaXMLWriter.java | 697 ++- .../revisionmachine/difftool/DiffTool.java | 96 +- .../difftool/DiffToolThread.java | 588 ++- .../difftool/config/ConfigurationKeys.java | 555 ++- .../difftool/config/ConfigurationManager.java | 193 +- .../difftool/config/ConfigurationReader.java | 1529 +++---- 
.../difftool/config/OutputTypes.java | 30 +- .../difftool/config/gui/ConfigGUI.java | 79 +- .../difftool/config/gui/ConfigMenuBar.java | 83 +- .../config/gui/control/ArchiveRegistry.java | 263 +- .../config/gui/control/ComponentRegistry.java | 189 +- .../config/gui/control/ConfigController.java | 1146 +++-- .../config/gui/control/ConfigSettings.java | 433 +- .../gui/control/ConfigVerification.java | 206 +- .../difftool/config/gui/data/ConfigEnum.java | 24 +- .../config/gui/data/ConfigErrorKeys.java | 48 +- .../difftool/config/gui/data/ConfigItem.java | 111 +- .../config/gui/data/ConfigItemTypes.java | 24 +- .../gui/data/OutputCompressionEnum.java | 30 +- .../difftool/config/gui/data/PanelKeys.java | 70 +- .../config/gui/dialogs/ConfigDialog.java | 291 +- .../config/gui/dialogs/InputDialog.java | 387 +- .../config/gui/dialogs/XMLFileChooser.java | 64 +- .../config/gui/panels/AbstractPanel.java | 123 +- .../config/gui/panels/CachePanel.java | 518 ++- .../config/gui/panels/ConfigPanel.java | 234 +- .../config/gui/panels/DebugPanel.java | 449 +- .../gui/panels/ExternalProgramsPanel.java | 318 +- .../config/gui/panels/FilterPanel.java | 389 +- .../config/gui/panels/InputPanel.java | 811 ++-- .../config/gui/panels/LoggingPanel.java | 287 +- .../difftool/config/gui/panels/ModePanel.java | 411 +- .../config/gui/panels/OutputPanel.java | 692 ++- .../difftool/config/gui/panels/SQLPanel.java | 544 ++- .../config/simpleconfig/SimpleConfig.java | 33 +- .../article/ArticleReaderInterface.java | 78 +- .../article/reader/ArticleFilter.java | 272 +- .../consumer/article/reader/InputFactory.java | 386 +- .../reader/TimedWikipediaXMLReader.java | 356 +- .../article/reader/WikipediaXMLReader.java | 1343 +++--- .../diff/DiffCalculatorInterface.java | 70 +- .../diff/TaskTransmitterInterface.java | 73 +- .../diff/calculation/BlockManagement.java | 594 ++- .../calculation/BlockManagementInterface.java | 40 +- .../consumer/diff/calculation/DiffBlock.java | 273 +- 
.../diff/calculation/DiffCalculator.java | 1371 +++--- .../diff/calculation/TimedDiffCalculator.java | 311 +- .../difftool/consumer/dump/SQLEscape.java | 102 +- .../consumer/dump/WriterInterface.java | 65 +- .../consumer/dump/codec/DataFileEncoder.java | 250 +- .../consumer/dump/codec/SQLEncoder.java | 941 ++-- .../dump/codec/SQLEncoderInterface.java | 131 +- .../consumer/dump/codec/SQLEncoding.java | 228 +- .../consumer/dump/codec/TimedSQLEncoder.java | 257 +- .../dump/writer/DataFileArchiveWriter.java | 366 +- .../consumer/dump/writer/DataFileWriter.java | 377 +- .../consumer/dump/writer/OutputFactory.java | 129 +- .../dump/writer/SQLArchiveWriter.java | 417 +- .../dump/writer/SQLDatabaseWriter.java | 341 +- .../consumer/dump/writer/SQLFileWriter.java | 398 +- .../dump/writer/TimedSQLArchiveWriter.java | 243 +- .../dump/writer/TimedSQLDatabaseWriter.java | 241 +- .../dump/writer/TimedSQLFileWriter.java | 239 +- .../difftool/data/OutputType.java | 90 +- .../difftool/data/SurrogateModes.java | 100 +- .../data/archive/ArchiveDescription.java | 141 +- .../difftool/data/archive/ArchiveManager.java | 111 +- .../difftool/data/archive/InputType.java | 88 +- .../difftool/data/codec/BitReader.java | 255 +- .../difftool/data/codec/BitWriter.java | 306 +- .../data/codec/RevisionCodecData.java | 464 +- .../difftool/data/codec/RevisionDecoder.java | 1042 ++--- .../difftool/data/codec/RevisionEncoder.java | 766 ++-- .../data/codec/RevisionEncoderInterface.java | 68 +- .../difftool/data/tasks/ISizeable.java | 22 +- .../difftool/data/tasks/Task.java | 476 +- .../difftool/data/tasks/TaskTypes.java | 68 +- .../difftool/data/tasks/content/Diff.java | 679 ++- .../data/tasks/content/DiffAction.java | 171 +- .../difftool/data/tasks/content/DiffPart.java | 317 +- .../data/tasks/info/ArticleInformation.java | 875 ++-- .../revisionmachine/index/IndexGenerator.java | 367 +- .../revisionmachine/index/IndexIterator.java | 281 +- .../jwpl/revisionmachine/index/Indexer.java | 427 +- 
.../index/indices/AbstractIndex.java | 229 +- .../index/indices/ArticleIndex.java | 196 +- .../index/indices/ArticleIndexData.java | 176 +- .../index/indices/ChronoIndex.java | 285 +- .../index/indices/ChronoIndexData.java | 268 +- .../index/indices/RevisionIndex.java | 92 +- .../index/writer/DataFileWriter.java | 183 +- .../index/writer/DatabaseWriter.java | 260 +- .../index/writer/IndexWriterInterface.java | 67 +- .../index/writer/SQLFileWriter.java | 211 +- .../wikiapi_simple_20090119_stripped.script | 2 +- .../timemachine/domain/JWPLTimeMachine.java | 101 +- .../jwpl/timemachine/domain/Revision.java | 119 +- .../jwpl/timemachine/domain/SettingsXML.java | 177 +- .../timemachine/domain/TimeMachineFiles.java | 161 +- .../domain/TimeMachineGenerator.java | 208 +- .../version/DumpVersionFastUtilIntKey.java | 648 ++- .../dump/version/DumpVersionJDKGeneric.java | 693 ++- .../version/DumpVersionJDKIntKeyFactory.java | 27 +- .../version/DumpVersionJDKLongKeyFactory.java | 27 +- .../DumpVersionJDKStringKeyFactory.java | 27 +- .../dump/version/OriginalDumpVersion.java | 941 ++-- .../jwpl/timemachine/dump/xml/PageReader.java | 58 +- .../jwpl/timemachine/dump/xml/PageWriter.java | 96 +- .../timemachine/dump/xml/RevisionReader.java | 58 +- .../timemachine/dump/xml/RevisionWriter.java | 72 +- .../jwpl/timemachine/dump/xml/TextReader.java | 56 +- .../jwpl/timemachine/dump/xml/TextWriter.java | 64 +- .../dump/xml/TimeMachineRevisionParser.java | 29 +- .../dump/xml/XMLDumpTableInputStream.java | 124 +- .../xml/XMLDumpTableInputStreamThread.java | 125 +- .../jwpl/tutorial/api/T1a_HelloWorld.java | 38 +- .../jwpl/tutorial/api/T1b_HelloWorld.java | 43 +- .../jwpl/tutorial/api/T1c_HelloWorld.java | 62 +- .../dkpro/jwpl/tutorial/api/T2_PageInfo.java | 74 +- .../jwpl/tutorial/api/T3_PageDetails.java | 102 +- .../jwpl/tutorial/api/T4_Categories.java | 100 +- .../dkpro/jwpl/tutorial/api/T5_TownList.java | 80 +- .../jwpl/tutorial/api/T6_HelperMethods.java | 42 +- 
.../tutorial/parser/T1_SimpleParserDemo.java | 49 +- .../tutorial/parser/T2_InternalLinks.java | 58 +- .../jwpl/tutorial/parser/T3_LinkContexts.java | 41 +- .../parser/T4_InterfacingWithWikipedia.java | 60 +- .../parser/T5_CleaningTemplateImage.java | 69 +- .../jwpl/tutorial/parser/T6_NestedLists.java | 104 +- .../jwpl/tutorial/parser/T7_HtmlFileDemo.java | 43 +- .../dkpro/jwpl/tutorial/parser/TestFile.java | 424 +- .../jwpl/util/revisions/RevisionUtils.java | 182 +- .../jwpl/util/templates/RevisionPair.java | 304 +- .../dkpro/jwpl/util/templates/TextPair.java | 536 ++- .../util/templates/WikipediaTemplateInfo.java | 2856 ++++++------ .../generator/GeneratorConstants.java | 15 +- .../generator/simple/GeneratorMode.java | 21 +- .../generator/simple/TemplateFilter.java | 204 +- .../simple/TemplateInfoGeneratorStarter.java | 422 +- .../WikipediaTemplateInfoDumpWriter.java | 381 +- .../WikipediaTemplateInfoGenerator.java | 652 ++- .../util/templates/parser/ParseUtils.java | 215 +- .../parser/SectionExtractionTest.java | 69 +- .../templates/parser/SectionExtractor.java | 525 ++- .../wikimachine/debug/AbstractLogger.java | 79 +- .../wikimachine/debug/CompositeLogger.java | 26 +- .../wikimachine/debug/FileMemoryLogger.java | 76 +- .../dkpro/jwpl/wikimachine/debug/ILogger.java | 20 +- .../wikimachine/debug/InputStreamSpy.java | 78 +- .../jwpl/wikimachine/debug/Slf4JLogger.java | 26 +- .../decompression/BZip2Decompressor.java | 35 +- .../decompression/GZipDecompressor.java | 20 +- .../decompression/IDecompressor.java | 10 +- .../decompression/UniversalDecompressor.java | 358 +- .../domain/AbstractSnapshotGenerator.java | 46 +- .../wikimachine/domain/Configuration.java | 181 +- .../domain/DumpVersionProcessor.java | 311 +- .../dkpro/jwpl/wikimachine/domain/Files.java | 203 +- .../domain/ISnapshotGenerator.java | 12 +- .../jwpl/wikimachine/domain/MetaData.java | 334 +- .../dump/sql/CategorylinksParser.java | 139 +- .../wikimachine/dump/sql/PagelinksParser.java | 155 +- 
.../jwpl/wikimachine/dump/sql/SQLEscape.java | 112 +- .../wikimachine/dump/sql/SQLFileParser.java | 116 +- .../dump/version/AbstractDumpVersion.java | 358 +- .../dump/version/IDumpVersion.java | 80 +- .../dump/version/IDumpVersionFactory.java | 9 +- .../dump/xml/AbstractXmlDumpReader.java | 1045 +++-- .../wikimachine/dump/xml/DumpTableEnum.java | 10 +- .../dump/xml/DumpTableInputStream.java | 9 +- .../jwpl/wikimachine/dump/xml/PageParser.java | 128 +- .../wikimachine/dump/xml/RevisionParser.java | 80 +- .../jwpl/wikimachine/dump/xml/TextParser.java | 96 +- .../dump/xml/WikiXMLDumpReader.java | 98 +- .../factory/IEnvironmentFactory.java | 24 +- .../wikimachine/factory/SpringFactory.java | 124 +- .../wikimachine/hashing/IStringHashCode.java | 8 +- .../hashing/StringHashCodeDisabled.java | 20 +- .../hashing/StringHashCodeJBoss.java | 59 +- .../hashing/StringHashCodeJDK.java | 20 +- .../jwpl/wikimachine/util/Redirects.java | 203 +- .../dkpro/jwpl/wikimachine/util/Strings.java | 329 +- .../jwpl/wikimachine/util/TimestampUtil.java | 63 +- .../jwpl/wikimachine/util/TxtFileWriter.java | 62 +- .../wikimachine/util/UTFDataInputStream.java | 82 +- .../wikimachine/util/UTFDataOutputStream.java | 100 +- 364 files changed, 45701 insertions(+), 48564 deletions(-) delete mode 100644 dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiRelatednessException.java diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Category.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Category.java index 7f818c9a..e4dea46b 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Category.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Category.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,376 +20,385 @@ import java.util.HashSet; import java.util.Set; -import org.hibernate.LockMode; -import org.hibernate.Session; - import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; import org.dkpro.jwpl.api.exception.WikiTitleParsingException; import org.dkpro.jwpl.api.hibernate.CategoryDAO; +import org.hibernate.LockMode; +import org.hibernate.Session; import org.hibernate.type.StandardBasicTypes; public class Category implements WikiConstants { - private final CategoryDAO catDAO; - private org.dkpro.jwpl.api.hibernate.Category hibernateCategory; - private final Wikipedia wiki; - - - /** - * Creates a category object. - * @param wiki The wikipedia object. - * @param id The hibernate id of the category. - * @throws WikiPageNotFoundException If the category does not exist. - */ - protected Category (Wikipedia wiki, long id) throws WikiPageNotFoundException { - this.wiki = wiki; - catDAO = new CategoryDAO(wiki); - createCategory(id); - } - - /** - * Creates a category object. - * @param wiki The wikipedia object. - * @param pageID The pageID of the category. - * @throws WikiPageNotFoundException If the category does not exist. - */ - protected Category (Wikipedia wiki, int pageID) throws WikiPageNotFoundException { - this.wiki = wiki; - catDAO = new CategoryDAO(wiki); - createCategory(pageID); - } - - /** - * Creates a category object. - * @param wiki The wikipedia object. - * @param pName The name of the category. 
- * @throws WikiPageNotFoundException If the category does not exist. - */ - public Category(Wikipedia wiki, String pName) throws WikiApiException { - if (pName == null || pName.length() == 0) { - throw new WikiPageNotFoundException(); - } - this.wiki = wiki; - catDAO = new CategoryDAO(wiki); - Title catTitle = new Title(pName); - createCategory(catTitle); - } - - /** - * @see Category#Category(Wikipedia, long) - */ - private void createCategory(long id) throws WikiPageNotFoundException { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - hibernateCategory = catDAO.findById(id); - session.getTransaction().commit(); - - if (hibernateCategory == null) { - throw new WikiPageNotFoundException("No category with id " + id + " was found."); - } + private final CategoryDAO catDAO; + private org.dkpro.jwpl.api.hibernate.Category hibernateCategory; + private final Wikipedia wiki; + + + /** + * Creates a category object. + * + * @param wiki The wikipedia object. + * @param id The hibernate id of the category. + * @throws WikiPageNotFoundException If the category does not exist. + */ + protected Category(Wikipedia wiki, long id) throws WikiPageNotFoundException { + this.wiki = wiki; + catDAO = new CategoryDAO(wiki); + createCategory(id); + } + + /** + * Creates a category object. + * + * @param wiki The wikipedia object. + * @param pageID The pageID of the category. + * @throws WikiPageNotFoundException If the category does not exist. + */ + protected Category(Wikipedia wiki, int pageID) throws WikiPageNotFoundException { + this.wiki = wiki; + catDAO = new CategoryDAO(wiki); + createCategory(pageID); + } + + /** + * Creates a category object. + * + * @param wiki The wikipedia object. + * @param pName The name of the category. + * @throws WikiPageNotFoundException If the category does not exist. 
+ */ + public Category(Wikipedia wiki, String pName) throws WikiApiException { + if (pName == null || pName.length() == 0) { + throw new WikiPageNotFoundException(); } - - /** - * @see Category#Category(Wikipedia, int) - */ - private void createCategory(int pageID) throws WikiPageNotFoundException { - createCategory( wiki.__getCategoryHibernateId(pageID)); + this.wiki = wiki; + catDAO = new CategoryDAO(wiki); + Title catTitle = new Title(pName); + createCategory(catTitle); + } + + /** + * @see Category#Category(Wikipedia, long) + */ + private void createCategory(long id) throws WikiPageNotFoundException { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + hibernateCategory = catDAO.findById(id); + session.getTransaction().commit(); + + if (hibernateCategory == null) { + throw new WikiPageNotFoundException("No category with id " + id + " was found."); } - - /** - * @see Category#Category(Wikipedia, String) - */ - private void createCategory(Title title) throws WikiPageNotFoundException { - String name = title.getWikiStyleTitle(); - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - - Integer returnValue; - - String query = "select cat.pageId from Category as cat where cat.name = :name"; - if(wiki.getDatabaseConfiguration().supportsCollation()) { - query += Wikipedia.SQL_COLLATION; - } - returnValue = session.createNativeQuery(query, Integer.class) - .setParameter("name", name, StandardBasicTypes.STRING) - .uniqueResult(); - session.getTransaction().commit(); - - // if there is no category with this name, the hibernateCategory is null - if (returnValue == null) { - hibernateCategory = null; - throw new WikiPageNotFoundException("No category with name " + name + " was found."); - } - else { - int pageID = returnValue; - createCategory( pageID); - } - } - - /** - * This returns the internal id. Do not confuse this with the pageId. - * @return Returns the internal id. 
- */ - /* - * Note well: - * Access is limited to package-private here intentionally, as the database ID is considered framework-internal use. - */ - long __getId() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - long id = hibernateCategory.getId(); - session.getTransaction().commit(); - return id; + } + + /** + * @see Category#Category(Wikipedia, int) + */ + private void createCategory(int pageID) throws WikiPageNotFoundException { + createCategory(wiki.__getCategoryHibernateId(pageID)); + } + + /** + * @see Category#Category(Wikipedia, String) + */ + private void createCategory(Title title) throws WikiPageNotFoundException { + String name = title.getWikiStyleTitle(); + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + + Integer returnValue; + + String query = "select cat.pageId from Category as cat where cat.name = :name"; + if (wiki.getDatabaseConfiguration().supportsCollation()) { + query += Wikipedia.SQL_COLLATION; } - - /** - * @return A unique page id. - */ - public int getPageId() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - int pageID = hibernateCategory.getPageId(); - session.getTransaction().commit(); - return pageID; + returnValue = session.createNativeQuery(query, Integer.class) + .setParameter("name", name, StandardBasicTypes.STRING) + .uniqueResult(); + session.getTransaction().commit(); + + // if there is no category with this name, the hibernateCategory is null + if (returnValue == null) { + hibernateCategory = null; + throw new WikiPageNotFoundException("No category with name " + name + " was found."); + } else { + int pageID = returnValue; + createCategory(pageID); } - - /** - * @return A set containing parents (supercategories) of this category. 
- */ - public Set getParents() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getInLinks()); - session.getTransaction().commit(); - - Set categories = new HashSet<>(); - for (int pageID : tmpSet) { - categories.add(this.wiki.getCategory(pageID)); - } - return categories; + } + + /** + * This returns the internal id. Do not confuse this with the pageId. + * + * @return Returns the internal id. + */ + /* + * Note well: + * Access is limited to package-private here intentionally, as the database ID is considered framework-internal use. + */ + long __getId() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + long id = hibernateCategory.getId(); + session.getTransaction().commit(); + return id; + } + + /** + * @return A unique page id. + */ + public int getPageId() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + int pageID = hibernateCategory.getPageId(); + session.getTransaction().commit(); + return pageID; + } + + /** + * @return A set containing parents (super categories) of this category. + */ + public Set getParents() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getInLinks()); + session.getTransaction().commit(); + + Set categories = new HashSet<>(); + for (int pageID : tmpSet) { + categories.add(this.wiki.getCategory(pageID)); } - - /** - * This is a more efficient shortcut for writing "getParents().size()", as that would require to load all the parents first. - * @return The number of parents of this category. 
- */ - public int getNumberOfParents() { - int nrOfInlinks = 0; - - long id = this.__getId(); - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select count(inLinks) from category_inlinks where id = :id"; - Long returnValue = session.createNativeQuery(sql, Long.class) + return categories; + } + + /** + * This is a more efficient shortcut for writing "getParents().size()", as that would require to load all the parents first. + * + * @return The number of parents of this category. + */ + public int getNumberOfParents() { + int nrOfInlinks = 0; + + long id = this.__getId(); + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select count(inLinks) from category_inlinks where id = :id"; + Long returnValue = session.createNativeQuery(sql, Long.class) .setParameter("id", id, StandardBasicTypes.LONG) .uniqueResult(); - session.getTransaction().commit(); + session.getTransaction().commit(); - if (returnValue != null) { - nrOfInlinks = returnValue.intValue(); - } - return nrOfInlinks; + if (returnValue != null) { + nrOfInlinks = returnValue.intValue(); } - - /** - * @return A set containing the IDs of the parents of this category. - */ - public Set getParentIDs() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getInLinks()); - session.getTransaction().commit(); - return tmpSet; + return nrOfInlinks; + } + + /** + * @return A set containing the IDs of the parents of this category. 
+ */ + public Set getParentIDs() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getInLinks()); + session.getTransaction().commit(); + return tmpSet; + } + + /** + * @return A set containing the children (subcategories) of this category. + */ + public Set getChildren() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getOutLinks()); + session.getTransaction().commit(); + + Set categories = new HashSet<>(); + for (int pageID : tmpSet) { + categories.add(this.wiki.getCategory(pageID)); } - - /** - * @return A set containing the children (subcategories) of this category. - */ - public Set getChildren() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getOutLinks()); - session.getTransaction().commit(); - - Set categories = new HashSet<>(); - for (int pageID : tmpSet) { - categories.add(this.wiki.getCategory(pageID)); - } - return categories; - } - - /** - * This is a more efficient shortcut for writing "getChildren().size()", as that would require to load all the children first. - * @return The number of children of this category. - */ - public int getNumberOfChildren() { - int nrOfOutlinks = 0; - - long id = this.__getId(); - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select count(outLinks) from category_outlinks where id = :id"; - Long returnValue = session.createNativeQuery(sql, Long.class) + return categories; + } + + /** + * This is a more efficient shortcut for writing "getChildren().size()", as that would require to load all the children first. + * + * @return The number of children of this category. 
+ */ + public int getNumberOfChildren() { + int nrOfOutlinks = 0; + + long id = this.__getId(); + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select count(outLinks) from category_outlinks where id = :id"; + Long returnValue = session.createNativeQuery(sql, Long.class) .setParameter("id", id, StandardBasicTypes.LONG) .uniqueResult(); - session.getTransaction().commit(); - - if (returnValue != null) { - nrOfOutlinks = returnValue.intValue(); - } - return nrOfOutlinks; - } + session.getTransaction().commit(); - /** - * @return A set containing the IDs of the children of this category. - */ - public Set getChildrenIDs() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getOutLinks()); - session.getTransaction().commit(); - return tmpSet; + if (returnValue != null) { + nrOfOutlinks = returnValue.intValue(); } - - /** - * @return The title of the category. - * @throws WikiTitleParsingException - */ - public Title getTitle() throws WikiTitleParsingException { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - String name = hibernateCategory.getName(); - session.getTransaction().commit(); - Title title = new Title(name); - return title; - } - - /** - * @return The set of articles that are categorized under this category. - * @throws WikiApiException - */ - public Set getArticles() throws WikiApiException { - Set tmpSet = getArticleIds(); - Set pages = new HashSet<>(); - for (int pageID : tmpSet) { - pages.add(this.wiki.getPage(pageID)); - } - return pages; - } - - /** - * @return The set of article ids that are categorized under this category. 
- */ - public Set getArticleIds() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getPages()); - session.getTransaction().commit(); - - return tmpSet; + return nrOfOutlinks; + } + + /** + * @return A set containing the IDs of the children of this category. + */ + public Set getChildrenIDs() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getOutLinks()); + session.getTransaction().commit(); + return tmpSet; + } + + /** + * @return The title of the category. + * @throws WikiTitleParsingException Thrown if errors occurred. + */ + public Title getTitle() throws WikiTitleParsingException { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + String name = hibernateCategory.getName(); + session.getTransaction().commit(); + Title title = new Title(name); + return title; + } + + /** + * @return The set of articles that are categorized under this category. + * @throws WikiApiException Thrown if errors occurred. + */ + public Set getArticles() throws WikiApiException { + Set tmpSet = getArticleIds(); + Set pages = new HashSet<>(); + for (int pageID : tmpSet) { + pages.add(this.wiki.getPage(pageID)); } - - /** - * This is a more efficient shortcut for writing "getPages().size()", as that would require to load all the pages first. - * @return The number of pages. 
- */ - public int getNumberOfPages() { - int nrOfPages = 0; - - long id = this.__getId(); - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select count(pages) from category_pages where id = :id"; - Long returnValue = session.createNativeQuery(sql, Long.class) + return pages; + } + + /** + * @return The set of article ids that are categorized under this category. + */ + public Set getArticleIds() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getPages()); + session.getTransaction().commit(); + + return tmpSet; + } + + /** + * This is a more efficient shortcut for writing "getPages().size()", as that would require to load all the pages first. + * + * @return The number of pages. + */ + public int getNumberOfPages() { + int nrOfPages = 0; + + long id = this.__getId(); + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select count(pages) from category_pages where id = :id"; + Long returnValue = session.createNativeQuery(sql, Long.class) .setParameter("id", id, StandardBasicTypes.LONG) .uniqueResult(); - session.getTransaction().commit(); + session.getTransaction().commit(); - if (returnValue != null) { - nrOfPages = returnValue.intValue(); - } - return nrOfPages; + if (returnValue != null) { + nrOfPages = returnValue.intValue(); } - - /** - * This method exposes implementation details and should not be made public. - * It is used for performance tuning. - * @return The set of pages that are categorized under this category. - */ - /* - * Note well: - * Access is limited to package-private here intentionally, as it is API-internal use only. - */ - Set __getPages() { - return getArticleIds(); + return nrOfPages; + } + + /** + * This method exposes implementation details and should not be made public. 
+ * It is used for performance tuning. + * + * @return The set of pages that are categorized under this category. + */ + /* + * Note well: + * Access is limited to package-private here intentionally, as it is API-internal use only. + */ + Set __getPages() { + return getArticleIds(); + } + + /** + * Returns *all* recursively collected descendants (=subcategories) of this category. + * + * @return An iterable of all descendants (=subcategories) of this category. + */ + public Iterable getDescendants() { + return new CategoryDescendantsIterable(wiki, this); + } + + /** + * Returns *all* recursively collected descendants (=subcategories) of this category. + * + * @return An iterable of all descendants (=subcategories) of this category. + */ + protected Iterable getDescendants(int bufferSize) { + return new CategoryDescendantsIterable(wiki, bufferSize, this); + } + + /** + * Returns the siblings of this category. + * + * @return Returns the siblings of this category or null, if there are none. + */ + public Set getSiblings() { + Set siblings = new HashSet<>(); + + // add siblings + for (Category parent : this.getParents()) { + siblings.addAll(parent.getChildren()); } - /** - * Returns *all* recursively collected descendants (=subcategories) of this category. - * @return An iterable of all descendants (=subcategories) of this category. - */ - public Iterable getDescendants() { - return new CategoryDescendantsIterable(wiki, this); + // remove this category from list + siblings.remove(this); + + return siblings; + } + + /** + * @return A string with information about a {@link Category}. + * @throws WikiApiException Thrown if errors occurred. 
+ */ + protected String getCategoryInfo() throws WikiApiException { + StringBuilder sb = new StringBuilder(1000); + + sb.append("ID : ").append(__getId()).append(LF); + sb.append("PageID : ").append(getPageId()).append(LF); + sb.append("Name : ").append(getTitle()).append(LF); + sb.append("In-Links").append(LF); + for (Category parent : getParents()) { + sb.append(" ").append(parent.getTitle()).append(LF); } - - /** - * Returns *all* recursively collected descendants (=subcategories) of this category. - * @return An iterable of all descendants (=subcategories) of this category. - */ - protected Iterable getDescendants(int bufferSize) { - return new CategoryDescendantsIterable(wiki, bufferSize, this); + sb.append("Out-Links").append(LF); + for (Category child : getChildren()) { + sb.append(" ").append(child.getTitle()).append(LF); } - - /** - * Returns the siblings of this category. - * @return Returns the siblings of this category or null, if there are none. - */ - public Set getSiblings() { - Set siblings = new HashSet<>(); - - // add siblings - for (Category parent : this.getParents()) { - siblings.addAll(parent.getChildren()); - } - - // remove this category from list - siblings.remove(this); - - return siblings; - } - - /** - * @return A string with information about a {@link Category}. 
- * @throws WikiApiException - */ - protected String getCategoryInfo() throws WikiApiException { - StringBuilder sb = new StringBuilder(1000); - - sb.append("ID : ").append(__getId()).append(LF); - sb.append("PageID : ").append(getPageId()).append(LF); - sb.append("Name : ").append(getTitle()).append(LF); - sb.append("In-Links").append(LF); - for (Category parent : getParents()) { - sb.append(" ").append(parent.getTitle()).append(LF); - } - sb.append("Out-Links").append(LF); - for (Category child : getChildren()) { - sb.append(" ").append(child.getTitle()).append(LF); - } - sb.append("Pages").append(LF); - for (Page page : getArticles()) { - sb.append(" ").append(page.getTitle()).append(LF); - } - return sb.toString(); + sb.append("Pages").append(LF); + for (Page page : getArticles()) { + sb.append(" ").append(page.getTitle()).append(LF); } + return sb.toString(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterable.java index cafa4213..347f3087 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterable.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,36 +19,35 @@ import java.util.Iterator; - /** - * An iterable over category objects retrieved by Category.getDescendants() - * + * An {@link Iterable} over category objects retrieved by {@link Category#getDescendants()}. */ public class CategoryDescendantsIterable implements Iterable { - private final Wikipedia wiki; - private final Category startCategory; - - /** - * The size of the page buffer. - * With bufferSize = 1, a database connection is needed for retrieving a single article. - * Higher bufferSize gives better performance, but needs memory. - * Initialize it with 25. - */ - private int bufferSize = 25; - - public CategoryDescendantsIterable(Wikipedia wiki, Category startCategory) { - this.wiki = wiki; - this.startCategory = startCategory; - } - - public CategoryDescendantsIterable(Wikipedia wiki, int bufferSize, Category startCategory) { - this.wiki = wiki; - this.bufferSize = bufferSize; - this.startCategory = startCategory; - } - - public Iterator iterator() { - return new CategoryDescendantsIterator(wiki, bufferSize, startCategory); - } + private final Wikipedia wiki; + private final Category startCategory; + + /* + * The size of the page buffer. + * With bufferSize = 1, a database connection is needed for retrieving a single article. + * Higher bufferSize gives better performance, but needs memory. + * Initialize it with 25. 
+ */ + private int bufferSize = 25; + + public CategoryDescendantsIterable(Wikipedia wiki, Category startCategory) { + this.wiki = wiki; + this.startCategory = startCategory; + } + + public CategoryDescendantsIterable(Wikipedia wiki, int bufferSize, Category startCategory) { + this.wiki = wiki; + this.bufferSize = bufferSize; + this.startCategory = startCategory; + } + + @Override + public Iterator iterator() { + return new CategoryDescendantsIterator(wiki, bufferSize, startCategory); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterator.java index cfaebb0c..165e1f3c 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,155 +28,153 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; - /** - * An iterator over category objects retrieved by Category.getDescendants() - * + * An {@link Iterator} over category objects retrieved by {@link Category#getDescendants()}. 
*/ public class CategoryDescendantsIterator implements Iterator { - private final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private final Wikipedia wiki; + private final Wikipedia wiki; - private final CategoryBuffer buffer; + private final CategoryBuffer buffer; - /** Contains all category ids that have not been expanded, yet. */ - private final Set notExpandedCategories; + /** + * Contains all category ids that have not been expanded, yet. + */ + private final Set notExpandedCategories; - /** As we do not inspect the whole graph at once now, we need a way to check whether a node was already expanded, to avoid infinite loops. */ - private final Set expandedCategoryIds; + /** + * As we do not inspect the whole graph at once now, we need a way to check whether a node was already expanded, to avoid infinite loops. + */ + private final Set expandedCategoryIds; - public CategoryDescendantsIterator(Wikipedia wiki, int bufferSize, Category startCategory) { - this.wiki = wiki; - buffer = new CategoryBuffer(bufferSize); - notExpandedCategories = new HashSet<>(); - // initialize with children of start category - for (Category catItem : startCategory.getChildren()) { - notExpandedCategories.add(catItem.getPageId()); - } + public CategoryDescendantsIterator(Wikipedia wiki, int bufferSize, Category startCategory) { + this.wiki = wiki; + buffer = new CategoryBuffer(bufferSize); + notExpandedCategories = new HashSet<>(); + // initialize with children of start category + for (Category catItem : startCategory.getChildren()) { + notExpandedCategories.add(catItem.getPageId()); + } - expandedCategoryIds = new HashSet<>(); + expandedCategoryIds = new HashSet<>(); + } + + @Override + public boolean hasNext() { + return buffer.hasNext(); + } + + @Override + public Category next() { + return buffer.next(); + } + + @Override + public void remove() { + throw 
new UnsupportedOperationException(); + } + + /** + * Buffers categories in a list. + */ + class CategoryBuffer { + + private final List buffer; + private final int maxBufferSize; // the number of pages to be buffered after a query to the database. + private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements + private int bufferOffset; // the offset in the buffer + private int dataOffset; // the overall offset in the data + + public CategoryBuffer(int bufferSize) { + this.maxBufferSize = bufferSize; + this.buffer = new ArrayList<>(); + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.dataOffset = 0; + + //TODO test whether this works when zero pages are retrieved + // we can test this here using a unit test that retrieves no descendants! } - public boolean hasNext(){ - return buffer.hasNext(); + /** + * If there are elements in the buffer left, then return true. + * If the end of the filled buffer is reached, then try to load new buffer. + * + * @return True, if there are pages left. False otherwise. + */ + public boolean hasNext() { + if (bufferOffset < bufferFillSize) { + return true; + } else { + return this.fillBuffer(); + } } - public Category next(){ - return buffer.next(); + /** + * @return The next Category or null if no more categories are available. + */ + public Category next() { + // if there are still elements in the buffer, just retrieve the next one + if (bufferOffset < bufferFillSize) { + return this.getBufferElement(); + } + // if there are no more elements => try to fill a new buffer + else if (this.fillBuffer()) { + return this.getBufferElement(); + } else { + // if it cannot be filled => return null + return null; + } } - public void remove() { - throw new UnsupportedOperationException(); + private Category getBufferElement() { + Category cat = buffer.get(bufferOffset); + bufferOffset++; + dataOffset++; + return cat; } - /** - * Buffers categories in a list. 
- * - * - */ - class CategoryBuffer{ - - private final List buffer; - private final int maxBufferSize; // the number of pages to be buffered after a query to the database. - private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements - private int bufferOffset; // the offset in the buffer - private int dataOffset; // the overall offset in the data - - public CategoryBuffer(int bufferSize){ - this.maxBufferSize = bufferSize; - this.buffer = new ArrayList<>(); - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.dataOffset = 0; - -//TODO test whether this works when zero pages are retrieved -// we can test this here using a unit test that retrieves no descendants! - } + private boolean fillBuffer() { - /** - * If there are elements in the buffer left, then return true. - * If the end of the filled buffer is reached, then try to load new buffer. - * @return True, if there are pages left. False otherwise. - */ - public boolean hasNext(){ - if (bufferOffset < bufferFillSize) { - return true; - } - else { - return this.fillBuffer(); - } - } + // clear the old buffer and all variables regarding the state of the buffer + buffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; - /** - * - * @return The next Category or null if no more categories are available. 
- */ - public Category next(){ - // if there are still elements in the buffer, just retrieve the next one - if (bufferOffset < bufferFillSize) { - return this.getBufferElement(); - } - // if there are no more elements => try to fill a new buffer - else if (this.fillBuffer()) { - return this.getBufferElement(); - } - else { - // if it cannot be filled => return null - return null; - } - } + // add not expanded categories to queue + List queue = new LinkedList<>(notExpandedCategories); - private Category getBufferElement() { - Category cat = buffer.get(bufferOffset); - bufferOffset++; - dataOffset++; - return cat; - } + // expand until buffer size is reached + while (!queue.isEmpty() && buffer.size() < maxBufferSize) { + // remove first element from queue + Category currentCat = wiki.getCategory(queue.get(0)); + queue.remove(0); + + // if the node was not previously expanded + if (!expandedCategoryIds.contains(currentCat.getPageId())) { + buffer.add(currentCat); + notExpandedCategories.remove(currentCat.getPageId()); + expandedCategoryIds.add(currentCat.getPageId()); + + logger.debug("buf: " + buffer.size()); + logger.debug("notExp: " + notExpandedCategories); + logger.debug("exp: " + expandedCategoryIds); - private boolean fillBuffer() { - - // clear the old buffer and all variables regarding the state of the buffer - buffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; - - List queue = new LinkedList<>(); - - // add not expanded categories to queue - queue.addAll(notExpandedCategories); - - // expand until buffer size is reached - while (!queue.isEmpty() && buffer.size() < maxBufferSize) { - // remove first element from queue - Category currentCat = wiki.getCategory(queue.get(0)); - queue.remove(0); - - // if the node was not previously expanded - if (!expandedCategoryIds.contains(currentCat.getPageId())) { - buffer.add(currentCat); - notExpandedCategories.remove(currentCat.getPageId()); - expandedCategoryIds.add(currentCat.getPageId()); - - logger.debug("buf: 
" + buffer.size()); - logger.debug("notExp: " + notExpandedCategories); - logger.debug("exp: " + expandedCategoryIds); - - for (Category child : currentCat.getChildren()) { - queue.add(child.getPageId()); - notExpandedCategories.add(child.getPageId()); - } - } - } - - if (buffer.size() > 0) { - bufferFillSize = buffer.size(); - return true; - } - else { - return false; - } + for (Category child : currentCat.getChildren()) { + queue.add(child.getPageId()); + notExpandedCategories.add(child.getPageId()); + } } + } + + if (buffer.size() > 0) { + bufferFillSize = buffer.size(); + return true; + } else { + return false; + } } + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraph.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraph.java index 2067140f..92edf585 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraph.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraph.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -36,13 +36,6 @@ import java.util.Map; import java.util.Set; -import org.jgrapht.GraphPath; -import org.jgrapht.alg.connectivity.ConnectivityInspector; -import org.jgrapht.alg.shortestpath.DijkstraShortestPath; -import org.jgrapht.graph.AsUndirectedGraph; -import org.jgrapht.graph.DefaultDirectedGraph; -import org.jgrapht.graph.DefaultEdge; - import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; import org.dkpro.jwpl.api.exception.WikiTitleParsingException; @@ -50,243 +43,252 @@ import org.dkpro.jwpl.util.ApiUtilities; import org.dkpro.jwpl.util.CommonUtilities; import org.dkpro.jwpl.util.OS; +import org.jgrapht.GraphPath; +import org.jgrapht.alg.connectivity.ConnectivityInspector; +import org.jgrapht.alg.shortestpath.DijkstraShortestPath; +import org.jgrapht.graph.AsUndirectedGraph; +import org.jgrapht.graph.DefaultDirectedGraph; +import org.jgrapht.graph.DefaultEdge; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * The category graph is constructed from the links connecting Wikipedia categories. * It provides various accessors and graph algorithms. - * */ public class CategoryGraph implements WikiConstants, Serializable { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - static final long serialVersionUID = 1L; - - // the wikipedia object - private Wikipedia wiki; - - // the category graph - private DefaultDirectedGraph graph; - // the category graph - private AsUndirectedGraph undirectedGraph; - - // a map holding the degree distribution of the graph - private Map degreeDistribution; - - // number of nodes in the graph - private int numberOfNodes; - - // number of edges in the graph - private int numberOfEdges; - - // A map holding the (recursive) number of hyponyms for each node. - // Recursive means that the hyponyms of hyponyms are also taken into account. 
- private Map hyponymCountMap = null; - private final String hyponymCountMapFilename = "hypoCountMap"; - - // a mapping from all nodes to a list of nodes on the path to the root - private Map> rootPathMap = null; - private final String rootPathMapFilename = "rootPathMap"; - - private double averageShortestPathLength = Double.NEGATIVE_INFINITY; - private double diameter = Double.NEGATIVE_INFINITY; - private double averageDegree = Double.NEGATIVE_INFINITY; - private double clusterCoefficient = Double.NEGATIVE_INFINITY; - private double depth = Double.NEGATIVE_INFINITY; - - /** - * Creates an empty {@link CategoryGraph}. You cannot do much with such a graph. - * Sometimes an empty category graph can be useful if you just need a CategoryGraph object, but do not care about its content. - */ - public CategoryGraph() throws WikiApiException { - logger.warn("Attention. You created an empty category graph. Intentionally?"); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + static final long serialVersionUID = 1L; + + // the wikipedia object + private Wikipedia wiki; + + // the category graph + private DefaultDirectedGraph graph; + // the category graph + private AsUndirectedGraph undirectedGraph; + + // a map holding the degree distribution of the graph + private Map degreeDistribution; + + // number of nodes in the graph + private int numberOfNodes; + + // number of edges in the graph + private int numberOfEdges; + + // A map holding the (recursive) number of hyponyms for each node. + // Recursive means that the hyponyms of hyponyms are also taken into account. 
+ private Map hyponymCountMap = null; + private final String hyponymCountMapFilename = "hypoCountMap"; + + // a mapping from all nodes to a list of nodes on the path to the root + private Map> rootPathMap = null; + private final String rootPathMapFilename = "rootPathMap"; + + private double averageShortestPathLength = Double.NEGATIVE_INFINITY; + private double diameter = Double.NEGATIVE_INFINITY; + private double averageDegree = Double.NEGATIVE_INFINITY; + private double clusterCoefficient = Double.NEGATIVE_INFINITY; + private double depth = Double.NEGATIVE_INFINITY; + + + /** + * Creates an empty {@link CategoryGraph}. You cannot do much with such a graph. + * Sometimes an empty category graph can be useful if you just need a CategoryGraph object, but do not care about its content. + */ + public CategoryGraph() throws WikiApiException { + logger.warn("Attention. You created an empty category graph. Intentionally?"); + } + + /** + * Creates an {@link CategoryGraph} using a serialized DirectedGraph object. + * + * @param pWiki A {@link Wikipedia} object. + * @param location The location of the serialized graph + * @throws WikiApiException Thrown if errors occurred. + */ + public CategoryGraph(Wikipedia pWiki, File location) throws WikiApiException { + try { + constructCategoryGraph(pWiki, GraphSerialization.loadGraph(location)); + } catch (IOException | ClassNotFoundException e) { + throw new WikiApiException(e); } - - /** - * Creates an {@link CategoryGraph} using a serialized DirectedGraph object. - * @param pWiki A {@link Wikipedia} object. - * @param location The location of the serialized graph - * @throws WikiApiException Thrown if errors occurred. 
- */ - public CategoryGraph(Wikipedia pWiki, File location) throws WikiApiException{ - try { - constructCategoryGraph(pWiki, GraphSerialization.loadGraph(location)); - } catch (IOException | ClassNotFoundException e) { - throw new WikiApiException(e); - } + } + + /** + * Creates a {@link CategoryGraph} object using all categories of the given Wikipedia. + * + * @param pWiki A {@link Wikipedia} object. + * @throws WikiApiException Thrown if errors occurred. + */ + public CategoryGraph(Wikipedia pWiki) throws WikiApiException { + constructCategoryGraph(pWiki, pWiki.__getCategories(), null); + } + + /** + * Creates a CategoryGraph object using all categories, but filters all categories starting with strings contained in the filterList. + * + * @param pWiki The Wikipedia object. + * @param filterList A list of strings. All categories starting with or matching such a string are not added to the category graph. + * @throws WikiApiException Thrown if errors occurred. + */ + public CategoryGraph(Wikipedia pWiki, List filterList) throws WikiApiException { + constructCategoryGraph(pWiki, pWiki.__getCategories(), filterList); + } + + /** + * Creates a CategoryGraph object using the categories given by the iterable + * + * @param pWiki The Wikipedia object. + * @param categories An iterable of the categories to use for construction of the category graph. + * @throws WikiApiException Thrown if errors occurred. + */ + public CategoryGraph(Wikipedia pWiki, Iterable categories) throws WikiApiException { + Set pageIDs = new HashSet<>(); + while (categories.iterator().hasNext()) { + pageIDs.add(categories.iterator().next().getPageId()); } - - /** - * Creates a {@link CategoryGraph} object using all categories of the given Wikipedia. - * @param pWiki A {@link Wikipedia} object. - * @throws WikiApiException Thrown if errors occurred. 
- */ - public CategoryGraph(Wikipedia pWiki) throws WikiApiException { - constructCategoryGraph(pWiki, pWiki.__getCategories(), null); + constructCategoryGraph(pWiki, pageIDs, null); + } + + /** + * Creates a CategoryGraph object using the categories given by the iterable, but filters all categories starting with strings contained in the filterList + * + * @param pWiki The Wikipedia object. + * @param categories An iterable of the categories to use for construction of the category graph. + * @param filterList A list of strings. All categories starting with or matching such a string are not added to the category graph. + * @throws WikiApiException Thrown if errors occurred. + */ + public CategoryGraph(Wikipedia pWiki, Iterable categories, List filterList) throws WikiApiException { + Set pageIDs = new HashSet<>(); + while (categories.iterator().hasNext()) { + pageIDs.add(categories.iterator().next().getPageId()); } - - /** - * Creates a CategoryGraph object using all categories, but filters all categories starting with strings contained in the filterList. - * @param pWiki The Wikipedia object. - * @param filterList A list of strings. All categories starting with or matching such a string are not added to the category graph. - * @throws WikiApiException Thrown if errors occurred. - */ - public CategoryGraph(Wikipedia pWiki, List filterList) throws WikiApiException { - constructCategoryGraph(pWiki, pWiki.__getCategories(), filterList); - } - - /** - * Creates a CategoryGraph object using the categories given by the iterable - * @param pWiki The Wikipedia object. - * @param categories An iterable of the categories to use for construction of the category graph. - * @throws WikiApiException Thrown if errors occurred. 
- */ - public CategoryGraph(Wikipedia pWiki, Iterable categories) throws WikiApiException { - Set pageIDs = new HashSet<>(); - while (categories.iterator().hasNext()) { - pageIDs.add(categories.iterator().next().getPageId()); + constructCategoryGraph(pWiki, pageIDs, filterList); + } + + /** + * Creates a category graph using a subset (that may also be the full set :) of the categories. + * + * @param pWiki The wiki object. + * @param pPageIDs A set of pageIDs of the category pages that should be used to build the category graph. + * @throws WikiApiException Thrown if errors occurred. + */ + protected CategoryGraph(Wikipedia pWiki, Set pPageIDs) throws WikiApiException { + constructCategoryGraph(pWiki, pPageIDs, null); + } + + public CategoryGraph(Wikipedia pWiki, DefaultDirectedGraph pGraph) throws WikiApiException { + constructCategoryGraph(pWiki, pGraph); + } + + private void constructCategoryGraph(Wikipedia pWiki, DefaultDirectedGraph pGraph) throws WikiApiException { + this.wiki = pWiki; + this.graph = pGraph; + this.numberOfNodes = this.graph.vertexSet().size(); + this.numberOfEdges = this.graph.edgeSet().size(); + this.undirectedGraph = new AsUndirectedGraph<>(this.graph); + } + + private void constructCategoryGraph(Wikipedia pWiki, Set pPageIDs, List filterList) throws WikiApiException { + // create the graph as a directed Graph + // algorithms that need to be called on a undirected graph or should ignore direction + // can be called on an AsUndirectedGraph view of the directed graph + graph = new DefaultDirectedGraph<>(DefaultEdge.class); + + wiki = pWiki; + + degreeDistribution = new HashMap<>(); + + for (int pageID : pPageIDs) { + if (filterList != null) { + long hibernateID = pWiki.__getCategoryHibernateId(pageID); + if (hibernateID == -1) { + throw new WikiApiException(pageID + " is not a valid pageID"); + } + + Category cat; + try { + cat = new Category(this.wiki, hibernateID); + } catch (WikiPageNotFoundException e) { + throw new 
WikiApiException("Category not found"); } - constructCategoryGraph(pWiki, pageIDs, null); - } - /** - * Creates a CategoryGraph object using the categories given by the iterable, but filters all categories starting with strings contained in the filterList - * @param pWiki The Wikipedia object. - * @param categories An iterable of the categories to use for construction of the category graph. - * @param filterList A list of strings. All categories starting with or matching such a string are not added to the category graph. - * @throws WikiApiException Thrown if errors occurred. - */ - public CategoryGraph(Wikipedia pWiki, Iterable categories, List filterList) throws WikiApiException { - Set pageIDs = new HashSet<>(); - while (categories.iterator().hasNext()) { - pageIDs.add(categories.iterator().next().getPageId()); + if (matchesFilter(cat, filterList)) { + continue; } - constructCategoryGraph(pWiki, pageIDs, filterList); - } + } - /** - * Creates a category graph using a subset (that may also be the full set :) of the categories. - * @param pWiki The wiki object. - * @param pPageIDs A set of pageIDs of the category pages that should be used to build the category graph. - * @throws WikiApiException Thrown if errors occurred. 
- */ - protected CategoryGraph(Wikipedia pWiki, Set pPageIDs) throws WikiApiException { - constructCategoryGraph(pWiki, pPageIDs, null); + graph.addVertex(pageID); } - public CategoryGraph(Wikipedia pWiki, DefaultDirectedGraph pGraph) throws WikiApiException { - constructCategoryGraph(pWiki, pGraph); - } - private void constructCategoryGraph(Wikipedia pWiki, DefaultDirectedGraph pGraph) throws WikiApiException { - this.wiki = pWiki; - this.graph = pGraph; - this.numberOfNodes = this.graph.vertexSet().size(); - this.numberOfEdges = this.graph.edgeSet().size(); - this.undirectedGraph = new AsUndirectedGraph<>(this.graph); + numberOfNodes = graph.vertexSet().size(); + + // add edges + logger.info(OS.getUsedMemory() + " MB memory used."); + int progress = 0; + for (int pageID : graph.vertexSet()) { + progress++; + ApiUtilities.printProgressInfo(progress, pPageIDs.size(), 10, ApiUtilities.ProgressInfoMode.TEXT, "Adding edges"); + + long hibernateID = pWiki.__getCategoryHibernateId(pageID); + if (hibernateID == -1) { + throw new WikiApiException(pageID + " is not a valid pageID"); + } + + // get the category + Category cat; + try { + cat = new Category(this.wiki, hibernateID); + } catch (WikiPageNotFoundException e) { + throw new WikiApiException("Category not found"); + } + + // get parents and children + // if the corresponding nodes are in the graph (it could be a subset) => add them to the graph + Set inLinks = cat.getParentIDs(); + Set outLinks = cat.getChildrenIDs(); + + // add edges + // If an edge already exits, it is silenty ignored by JGraphT. So we do not have to check this. 
+ for (int inLink : inLinks) { + if (graph.vertexSet().contains(inLink)) { + if (inLink == pageID) { + logger.debug("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); + } else { + graph.addEdge(inLink, pageID); + } + } + } + for (int outLink : outLinks) { + if (graph.vertexSet().contains(outLink)) { + if (outLink == pageID) { + logger.debug("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); + } else { + graph.addEdge(pageID, outLink); + } + } + } } - private void constructCategoryGraph(Wikipedia pWiki, Set pPageIDs, List filterList) throws WikiApiException { - // create the graph as a directed Graph - // algorithms that need to be called on a undirected graph or should ignore direction - // can be called on an AsUndirectedGraph view of the directed graph - graph = new DefaultDirectedGraph<>(DefaultEdge.class); - - wiki = pWiki; - - degreeDistribution = new HashMap<>(); + numberOfEdges = graph.edgeSet().size(); - for (int pageID : pPageIDs) { - if (filterList != null) { - long hibernateID = pWiki.__getCategoryHibernateId(pageID); - if (hibernateID == -1) { - throw new WikiApiException(pageID + " is not a valid pageID"); - } + logger.info("Added " + this.getNumberOfNodes() + " nodes."); + logger.info("Added " + this.getNumberOfEdges() + " edges."); - Category cat; - try { - cat = new Category(this.wiki, hibernateID); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Category not found"); - } + CycleHandler cycleHandler = new CycleHandler(wiki, this); + logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); + cycleHandler.removeCycles(); + logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); - if (matchesFilter(cat,filterList)) { - continue; - } - } + this.numberOfEdges = this.graph.edgeSet().size(); + this.undirectedGraph = new AsUndirectedGraph<>(this.graph); - graph.addVertex(pageID); - } - - - numberOfNodes = graph.vertexSet().size(); - - // add edges - logger.info(OS.getUsedMemory() + 
" MB memory used."); - int progress = 0; - for (int pageID : graph.vertexSet()) { - progress++; - ApiUtilities.printProgressInfo(progress, pPageIDs.size(), 10, ApiUtilities.ProgressInfoMode.TEXT, "Adding edges"); - - long hibernateID = pWiki.__getCategoryHibernateId(pageID); - if (hibernateID == -1) { - throw new WikiApiException(pageID + " is not a valid pageID"); - } - - // get the category - Category cat; - try { - cat = new Category(this.wiki, hibernateID); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Category not found"); - } - - // get parents and children - // if the corresponding nodes are in the graph (it could be a subset) => add them to the graph - Set inLinks = cat.getParentIDs(); - Set outLinks = cat.getChildrenIDs(); - - // add edges - // If an edge already exits, it is silenty ignored by JGraphT. So we do not have to check this. - for (int inLink : inLinks) { - if (graph.vertexSet().contains(inLink)) { - if (inLink == pageID) { - logger.debug("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); - } - else { - graph.addEdge(inLink, pageID); - } - } - } - for (int outLink : outLinks) { - if (graph.vertexSet().contains(outLink)) { - if (outLink == pageID) { - logger.debug("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); - } - else { - graph.addEdge(pageID, outLink); - } - } - } - } - - numberOfEdges = graph.edgeSet().size(); - - logger.info("Added " + this.getNumberOfNodes() + " nodes."); - logger.info("Added " + this.getNumberOfEdges() + " edges."); - - CycleHandler cycleHandler = new CycleHandler(wiki, this); - logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); - cycleHandler.removeCycles(); - logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); - - this.numberOfEdges = this.graph.edgeSet().size(); - this.undirectedGraph = new AsUndirectedGraph<>(this.graph); - - } + } //// older version without filterList // private void constructCategoryGraph(Wikipedia pWiki, 
Set pPageIDs) throws WikiApiException { @@ -365,104 +367,108 @@ private void constructCategoryGraph(Wikipedia pWiki, Set pPageIDs, List // } - /** - * Checks whether the category title matches the filter (a filter matches a string, if the string starts with the filter expression). - * @param cat A category. - * @param filterList A list of filter strings. - * @return True, if the category title starts with or is equal to a string in the filter list. False, otherwise. - * @throws WikiTitleParsingException Thrown if errors occurred. - */ - private boolean matchesFilter(Category cat, List filterList) throws WikiTitleParsingException { - String categoryTitle = cat.getTitle().getPlainTitle(); - for (String filter : filterList) { - if (categoryTitle.startsWith(filter)) { - logger.info(categoryTitle + " starts with " + filter + " => removing"); - return true; - } - } - return false; - } - - /** - * Gets the lowest common subsumer (LCS) of two nodes. - * The LCS of two nodes is first node on the path to the root, that has both nodes as sons. - * Nodes that are not in the same connected component as the root node are defined to have no LCS. - * @param category1 The first category node. - * @param category2 The second category node. - * @return The lowest common subsumer of the two nodes, or null if there is no LCS. - */ - public Category getLCS(Category category1, Category category2) throws WikiApiException { - return getLCS(category1.getPageId(),category2.getPageId()); + /** + * Checks whether the category title matches the filter (a filter matches a string, if the string starts with the filter expression). + * + * @param cat A category. + * @param filterList A list of filter strings. + * @return True, if the category title starts with or is equal to a string in the filter list. False, otherwise. + * @throws WikiTitleParsingException Thrown if errors occurred. 
+ */ + private boolean matchesFilter(Category cat, List filterList) throws WikiTitleParsingException { + String categoryTitle = cat.getTitle().getPlainTitle(); + for (String filter : filterList) { + if (categoryTitle.startsWith(filter)) { + logger.info(categoryTitle + " starts with " + filter + " => removing"); + return true; + } } - - - /** - * Gets the lowest common subsumer (LCS) of two nodes. - * The LCS of two nodes is first node on the path to the root, that has both nodes as sons. - * Nodes that are not in the same connected component as the root node are defined to have no LCS. - * @param categoryPageId1 The pageid of the first category node. - * @param categoryPageId2 The pageid of the second category node. - * @return The pageId of the lowest common subsumer of the two nodes, or null if there is no LCS. - */ - public int getLCSId(int categoryPageId1, int categoryPageId2) throws WikiApiException { + return false; + } + + /** + * Gets the lowest common subsumer (LCS) of two nodes. + * The LCS of two nodes is first node on the path to the root, that has both nodes as sons. + * Nodes that are not in the same connected component as the root node are defined to have no LCS. + * + * @param category1 The first category node. + * @param category2 The second category node. + * @return The lowest common subsumer of the two nodes, or null if there is no LCS. + */ + public Category getLCS(Category category1, Category category2) throws WikiApiException { + return getLCS(category1.getPageId(), category2.getPageId()); + } + + + /** + * Gets the lowest common subsumer (LCS) of two nodes. + * The LCS of two nodes is first node on the path to the root, that has both nodes as sons. + * Nodes that are not in the same connected component as the root node are defined to have no LCS. + * + * @param categoryPageId1 The pageid of the first category node. + * @param categoryPageId2 The pageid of the second category node. 
+ * @return The pageId of the lowest common subsumer of the two nodes, or null if there is no LCS. + */ + public int getLCSId(int categoryPageId1, int categoryPageId2) throws WikiApiException { // TODO here might be a problem concerning multiple inheritence in the category graph, if there is more than one path of equal length to the root, the method will only find one, but the other (not found) LCS may have a higher information content // TODO is the lcs between the same node really defined or should this be handled in the measures (i.e. SR(n1,n1) = 1 per definitionem??) - if (categoryPageId1 == categoryPageId2) { - return categoryPageId1; - } + if (categoryPageId1 == categoryPageId2) { + return categoryPageId1; + } - List nodeList1 = getRootPathMap().get(categoryPageId1); - List nodeList2 = getRootPathMap().get(categoryPageId2); + List nodeList1 = getRootPathMap().get(categoryPageId1); + List nodeList2 = getRootPathMap().get(categoryPageId2); - // if one of the paths is null => return -1 - if (nodeList1 == null || nodeList2 == null || nodeList1.size() == 0 || nodeList2.size() == 0) { - logger.debug("One of the node lists is null or empty!"); - return -1; - } + // if one of the paths is null => return -1 + if (nodeList1 == null || nodeList2 == null || nodeList1.size() == 0 || nodeList2.size() == 0) { + logger.debug("One of the node lists is null or empty!"); + return -1; + } - logger.debug(nodeList1.toString()); - logger.debug(nodeList2.toString()); + logger.debug(nodeList1.toString()); + logger.debug(nodeList2.toString()); - // node 1 subsumes node 2 ? - for (int tmpNode2 : nodeList2) { - if (tmpNode2 == categoryPageId1) { - return categoryPageId1; - } - } + // node 1 subsumes node 2 ? + for (int tmpNode2 : nodeList2) { + if (tmpNode2 == categoryPageId1) { + return categoryPageId1; + } + } - // node 2 subsumes node 1 ? - for (int tmpNode1 : nodeList1) { - if (tmpNode1 == categoryPageId2) { - return categoryPageId2; - } - } - // they have a lcs ? 
- for (int tmpNode1 : nodeList1) { - for (int tmpNode2 : nodeList2) { - if (tmpNode1 == tmpNode2) { - return tmpNode1; - } - } + // node 2 subsumes node 1 ? + for (int tmpNode1 : nodeList1) { + if (tmpNode1 == categoryPageId2) { + return categoryPageId2; + } + } + // they have a lcs ? + for (int tmpNode1 : nodeList1) { + for (int tmpNode2 : nodeList2) { + if (tmpNode1 == tmpNode2) { + return tmpNode1; } + } + } - logger.debug("No lcs found."); + logger.debug("No lcs found."); - return -1; - } + return -1; + } - /** - * Gets the lowest common subsumer (LCS) of two nodes. - * The LCS of two nodes is first node on the path to the root, that has both nodes as sons. - * Nodes that are not in the same connected component as the root node are defined to have no LCS. - * @param categoryPageId1 The pageid of the first category node. - * @param categoryPageId2 The pageid of the second category node. - * @return The lowest common subsumer of the two nodes, or null if there is no LCS. - */ - public Category getLCS(int categoryPageId1, int categoryPageId2) throws WikiApiException { - int lcsid = getLCSId(categoryPageId1, categoryPageId2); - return lcsid>-1?wiki.getCategory(getLCSId(categoryPageId1, categoryPageId2)):null; - } + /** + * Gets the lowest common subsumer (LCS) of two nodes. + * The LCS of two nodes is first node on the path to the root, that has both nodes as sons. + * Nodes that are not in the same connected component as the root node are defined to have no LCS. + * + * @param categoryPageId1 The pageid of the first category node. + * @param categoryPageId2 The pageid of the second category node. + * @return The lowest common subsumer of the two nodes, or null if there is no LCS. + */ + public Category getLCS(int categoryPageId1, int categoryPageId2) throws WikiApiException { + int lcsid = getLCSId(categoryPageId1, categoryPageId2); + return lcsid > -1 ? 
wiki.getCategory(getLCSId(categoryPageId1, categoryPageId2)) : null; + } // /** @@ -571,355 +577,350 @@ public Category getLCS(int categoryPageId1, int categoryPageId2) throws WikiApiE // } - /** - * Returns the shortest path from node to root as a list of pageIds of the nodes on the path. Node and root are included in the path node list. - * @param root The root node of the graph. - * @param node A node of the graph. - * @return The shortest path from node to root as a list of pagIs of the nodes on the path; or null if no path exists - * @throws WikiApiException Thrown if errors occurred. - */ - private List getPathToRoot(int root, int node) throws WikiApiException { - List pathToRoot = new LinkedList<>(); - List shortestPath = new ArrayList<>(); + /** + * Returns the shortest path from node to root as a list of pageIds of the nodes on the path. Node and root are included in the path node list. + * + * @param root The root node of the graph. + * @param node A node of the graph. + * @return The shortest path from node to root as a list of pagIs of the nodes on the path; or null if no path exists + * @throws WikiApiException Thrown if errors occurred. 
+ */ + private List getPathToRoot(int root, int node) throws WikiApiException { + List pathToRoot = new LinkedList<>(); + List shortestPath = new ArrayList<>(); + + expandPath(root, node, pathToRoot, shortestPath); + + if (shortestPath.size() == 0) { + return null; + } else { + return shortestPath; + } + } - expandPath(root, node, pathToRoot, shortestPath); + private void expandPath(int root, int currentNode, List currentPath, List shortestPath) { - if (shortestPath.size() == 0) { - return null; - } - else { - return shortestPath; - } - } + // add the current node to the path + currentPath.add(currentNode); - private void expandPath(int root, int currentNode, List currentPath, List shortestPath) { - - // add the current node to the path - currentPath.add(currentNode); - - // if root node reached, check whether it is a shortest path - if (currentNode == root) { - logger.debug("found root"); - - if (shortestPath.size() != 0) { - if (currentPath.size() < shortestPath.size()) { - logger.debug("setting new shortest path"); - shortestPath.clear(); - shortestPath.addAll(currentPath); - } - } - else { - logger.debug("initializing shortest path"); - shortestPath.addAll(currentPath); - } - } + // if root node reached, check whether it is a shortest path + if (currentNode == root) { + logger.debug("found root"); - // do not expand paths that are longer or equal than the current shortest path - // this is a runtime efficiency optimization! 
- if (shortestPath.size() != 0 && currentPath.size() >= shortestPath.size()) { - return; + if (shortestPath.size() != 0) { + if (currentPath.size() < shortestPath.size()) { + logger.debug("setting new shortest path"); + shortestPath.clear(); + shortestPath.addAll(currentPath); } + } else { + logger.debug("initializing shortest path"); + shortestPath.addAll(currentPath); + } + } - Set incomingEdges = this.graph.incomingEdgesOf(currentNode); + // do not expand paths that are longer or equal than the current shortest path + // this is a runtime efficiency optimization! + if (shortestPath.size() != 0 && currentPath.size() >= shortestPath.size()) { + return; + } - // no incoming edges => return path without adding this node - if (incomingEdges == null || incomingEdges.size() == 0) { - logger.debug("found non-root source"); - return; - } + Set incomingEdges = this.graph.incomingEdgesOf(currentNode); - for (DefaultEdge incomingEdge : incomingEdges) { - int sourceNode = graph.getEdgeSource(incomingEdge); - - if (sourceNode == currentNode) { - logger.warn("Source node equals current node."); - System.exit(1); - } - List savedPath = new LinkedList<>(currentPath); - expandPath(root, sourceNode, currentPath, shortestPath); - currentPath.clear(); - currentPath.addAll(savedPath); - } + // no incoming edges => return path without adding this node + if (incomingEdges == null || incomingEdges.size() == 0) { + logger.debug("found non-root source"); + return; + } - return; + for (DefaultEdge incomingEdge : incomingEdges) { + int sourceNode = graph.getEdgeSource(incomingEdge); + + if (sourceNode == currentNode) { + logger.warn("Source node equals current node."); + System.exit(1); + } + List savedPath = new LinkedList<>(currentPath); + expandPath(root, sourceNode, currentPath, shortestPath); + currentPath.clear(); + currentPath.addAll(savedPath); } + return; + } + + + /** + * Gets the path length between two category nodes - measured in "edges". 
+ * + * @param node1 The first category node. + * @param node2 The second category node. + * @return The number of edges of the path between node1 and node2. 0, if the nodes are identical. -1, if no path exists. + */ + public int getPathLengthInEdges(Category node1, Category node2) { + if (this.graph.containsVertex(node1.getPageId()) && this.graph.containsVertex(node2.getPageId())) { + if (node1.getPageId() == node2.getPageId()) { + return 0; + } + + // get the path from root node to node 1 + GraphPath edgeList = DijkstraShortestPath.findPathBetween(undirectedGraph, node1.getPageId(), node2.getPageId()); + if (edgeList == null) { + return -1; + } else { + return edgeList.getLength(); + } + } + // if the given nodes are not in the category graph, return -1 + else { + return -1; + } + } + + /** + * Computing the path length in very large graphs like the Wikipedia category graph is very time consuming. + * However, we know that the graph is almost a taxonomy (it contains some cycles that can be removed). + * The path from each category to the root is stored in the rootPathMap. + * We can use this information to speed up computation dramatically. + * However, we might miss some shortest path to a node if there are multiple paths to the root. + *

+ * It is very similar to finding the LCS. + * If there is no LCS, than there also is no path. + * If one of the nodes is on the path to the root, than we already know the distance. + * Otherwise the distance can be computed as the sum of the distance of node1 to the LCS + the distance of node2 to the LCS. + * + * @param cat1 The first category. + * @param cat2 The second category. + * @return The number of edges of the path between node1 and node2. 0, if the nodes are identical. -1, if no path exists. + * @throws WikiApiException Thrown if errors occurred. + */ + public int getTaxonomicallyBoundPathLengthInEdges(Category cat1, Category cat2) throws WikiApiException { + int node1 = cat1.getPageId(); + int node2 = cat2.getPageId(); + + // if the given nodes are not in the category graph, return -1 + if (!this.graph.containsVertex(node1) || !this.graph.containsVertex(node2)) { + return -1; + } - /** - * Gets the path length between two category nodes - measured in "edges". - * @param node1 The first category node. - * @param node2 The second category node. - * @return The number of edges of the path between node1 and node2. 0, if the nodes are identical. -1, if no path exists. - */ - public int getPathLengthInEdges(Category node1, Category node2) { - if (this.graph.containsVertex(node1.getPageId()) && this.graph.containsVertex(node2.getPageId())) { - if (node1.getPageId() == node2.getPageId()) { - return 0; - } - - // get the path from root node to node 1 - GraphPath edgeList = DijkstraShortestPath.findPathBetween(undirectedGraph, node1.getPageId(), node2.getPageId()); - if (edgeList == null) { - return -1; - } - else { - return edgeList.getLength(); - } - } - // if the given nodes are not in the category graph, return -1 - else { - return -1; - } + if (node1 == node2) { + return 0; } - /** - * Computing the path length in very large graphs like the Wikipedia category graph is very time consuming. 
- * However, we know that the graph is almost a taxonomy (it contains some cycles that can be removed). - * The path from each category to the root is stored in the rootPathMap. - * We can use this information to speed up computation dramatically. - * However, we might miss some shortest path to a node if there are multiple paths to the root. - *

- * It is very similar to finding the LCS. - * If there is no LCS, than there also is no path. - * If one of the nodes is on the path to the root, than we already know the distance. - * Otherwise the distance can be computed as the sum of the distance of node1 to the LCS + the distance of node2 to the LCS. - * - * @param cat1 The first category. - * @param cat2 The second category. - * @return The number of edges of the path between node1 and node2. 0, if the nodes are identical. -1, if no path exists. - * @throws WikiApiException Thrown if errors occurred. - */ - public int getTaxonomicallyBoundPathLengthInEdges(Category cat1, Category cat2) throws WikiApiException { - int node1 = cat1.getPageId(); - int node2 = cat2.getPageId(); - - // if the given nodes are not in the category graph, return -1 - if (!this.graph.containsVertex(node1) || !this.graph.containsVertex(node2)) { - return -1; - } - if (node1 == node2) { - return 0; - } + List nodeList1 = getRootPathMap().get(node1); + List nodeList2 = getRootPathMap().get(node2); + // if one of the paths is null => return null + if (nodeList1 == null || nodeList2 == null || nodeList1.size() == 0 || nodeList2.size() == 0) { + logger.debug("One of the node lists is null or empty!"); + return -1; + } - List nodeList1 = getRootPathMap().get(node1); - List nodeList2 = getRootPathMap().get(node2); + logger.debug(nodeList1.toString()); + logger.debug(nodeList2.toString()); - // if one of the paths is null => return null - if (nodeList1 == null || nodeList2 == null || nodeList1.size() == 0 || nodeList2.size() == 0) { - logger.debug("One of the node lists is null or empty!"); - return -1; - } + // node1 is on path of node2 to the root + int distance1 = 0; + for (int tmpNode2 : nodeList2) { + if (tmpNode2 == node1) { + return distance1; + } + distance1++; + } - logger.debug(nodeList1.toString()); - logger.debug(nodeList2.toString()); + // node2 is on path of node1 to the root + int distance2 = 0; + for (int tmpNode1 : nodeList1) 
{ + if (tmpNode1 == node2) { + return distance2; + } + distance2++; + } - // node1 is on path of node2 to the root - int distance1=0; - for (int tmpNode2 : nodeList2) { - if (tmpNode2 == node1) { - return distance1; - } - distance1++; - } + // they have a lcs ? + distance1 = 0; + for (int tmpNode1 : nodeList1) { + distance2 = 0; + for (int tmpNode2 : nodeList2) { + if (tmpNode1 == tmpNode2) { + return distance1 + distance2; + } + distance2++; + } + distance1++; + } - // node2 is on path of node1 to the root - int distance2=0; - for (int tmpNode1 : nodeList1) { - if (tmpNode1 == node2) { - return distance2; - } - distance2++; - } + return -1; + } - // they have a lcs ? - distance1=0; - for (int tmpNode1 : nodeList1) { - distance2=0; - for (int tmpNode2 : nodeList2) { - if (tmpNode1 == tmpNode2) { - return distance1 + distance2; - } - distance2++; - } - distance1++; - } + public int getTaxonomicallyBoundPathLengthInNodes(Category cat1, Category cat2) throws WikiApiException { + int retValue = getTaxonomicallyBoundPathLengthInEdges(cat1, cat2); - return -1; + if (retValue == 0) { + return 0; + } else if (retValue > 0) { + return (--retValue); + } else if (retValue == -1) { + return -1; + } else { + throw new WikiApiException("Unknown return value."); + } + } + + + /** + * Gets the path length between two category nodes - measured in "nodes". + * + * @param node1 The first node. + * @param node2 The second node. + * @return The number of nodes of the path between node1 and node2. 0, if the nodes are identical or neighbors. -1, if no path exists. 
+ */ + public int getPathLengthInNodes(Category node1, Category node2) throws WikiApiException { + + int retValue = getPathLengthInEdges(node1, node2); + + if (retValue == 0) { + return 0; + } else if (retValue > 0) { + return (--retValue); + } else if (retValue == -1) { + return -1; + } else { + throw new WikiApiException("Unknown return value."); + } + } + + /** + * Creates the hyponym map, that maps from nodes to their (recursive) number of hyponyms for each node. + * "recursive" means that the hyponyms of hyponyms are also taken into account. + * + * @throws WikiApiException + */ + private void createHyponymCountMap() throws WikiApiException { + // do only create hyponymMap, if it was not already computed + if (hyponymCountMap != null) { + return; } - public int getTaxonomicallyBoundPathLengthInNodes(Category cat1, Category cat2) throws WikiApiException { - int retValue = getTaxonomicallyBoundPathLengthInEdges(cat1, cat2); + File hyponymCountMapSerializedFile = new File(wiki.getWikipediaId() + "_" + hyponymCountMapFilename); + hyponymCountMap = new HashMap<>(); - if (retValue == 0) { - return 0; - } - else if (retValue > 0) { - return (--retValue); - } - else if (retValue == -1) { - return -1; - } - else { - throw new WikiApiException("Unknown return value."); - } + if (hyponymCountMapSerializedFile.exists()) { + logger.info("Loading saved hyponymyCountMap ..."); + hyponymCountMap = (Map) this.deserializeMap(hyponymCountMapSerializedFile); + logger.info("Done loading saved hyponymyCountMap"); + return; } + // a queue holding the nodes to process - /** - * Gets the path length between two category nodes - measured in "nodes". - * @param node1 The first node. - * @param node2 The second node. - * @return The number of nodes of the path between node1 and node2. 0, if the nodes are identical or neighbors. -1, if no path exists. 
- */ - public int getPathLengthInNodes(Category node1, Category node2) throws WikiApiException { + // In the category graph a node may have more than one father. + // Thus, we check whether a node was already visited. + // Then, it is not expanded again. + Set visited = new HashSet<>(); - int retValue = getPathLengthInEdges(node1, node2); + // initialize the queue with all leaf nodes + Set leafNodes = this.__getLeafNodes(); + List queue = new ArrayList<>(leafNodes); - if (retValue == 0) { - return 0; - } - else if (retValue > 0) { - return (--retValue); - } - else if (retValue == -1) { - return -1; - } - else { - throw new WikiApiException("Unknown return value."); - } - } + logger.info(leafNodes.size() + " leaf nodes."); - /** - * Creates the hyponym map, that maps from nodes to their (recursive) number of hyponyms for each node. - * "recursive" means that the hyponyms of hyponyms are also taken into account. - * @throws WikiApiException - */ - private void createHyponymCountMap() throws WikiApiException { - // do only create hyponymMap, if it was not already computed - if (hyponymCountMap != null) { - return; - } + // while the queue is not empty + while (!queue.isEmpty()) { + // remove first element from queue + int currNode = queue.get(0); + queue.remove(0); - File hyponymCountMapSerializedFile = new File(wiki.getWikipediaId() + "_" + hyponymCountMapFilename); - hyponymCountMap = new HashMap<>(); + // logger.info(queue.size()); - if (hyponymCountMapSerializedFile.exists()) { - logger.info("Loading saved hyponymyCountMap ..."); - hyponymCountMap = (Map) this.deserializeMap(hyponymCountMapSerializedFile); - logger.info("Done loading saved hyponymyCountMap"); - return; - } + if (visited.contains(currNode)) { + continue; + } - // a queue holding the nodes to process - - // In the category graph a node may have more than one father. - // Thus, we check whether a node was already visited. - // Then, it is not expanded again. 
- Set visited = new HashSet<>(); - - // initialize the queue with all leaf nodes - Set leafNodes = this.__getLeafNodes(); - List queue = new ArrayList<>(leafNodes); - - logger.info(leafNodes.size() + " leaf nodes."); - - // while the queue is not empty - while (!queue.isEmpty()) { - // remove first element from queue - int currNode = queue.get(0); - queue.remove(0); - - // logger.info(queue.size()); - - if (visited.contains(currNode)) { - continue; - } - - Set children = __getChildren(currNode); - - int validChildren = 0; - int sumChildHyponyms = 0; - boolean invalid = false; - for (int child : children) { - if (graph.containsVertex(child)) { - if (hyponymCountMap.containsKey(child)) { - sumChildHyponyms += hyponymCountMap.get(child); - validChildren++; - } - else { - invalid = true; - } - } - } - - if (invalid) { - // One of the childs is not in the hyponymCountMap yet - // Re-Enter the node into the queue and continue with next node - queue.add(currNode); - continue; - } - - // mark as visited - visited.add(currNode); - - // number of hyponomys of current node is the number of its own hyponomies and the sum of the hyponomies of its children. - int currNodeHyponomyCount = validChildren + sumChildHyponyms; - hyponymCountMap.put(currNode, currNodeHyponomyCount); - - // add parents of current node to queue - for (int parent : __getParents(currNode)) { - if (graph.containsVertex(parent)) { - queue.add(parent); - } - } - - } // while queue not empty - - logger.info(visited.size() + " nodes visited"); - if (visited.size() != graph.vertexSet().size()) { - throw new WikiApiException("Visited only " + visited.size() + " out of " + graph.vertexSet().size() + " nodes."); - } - if (hyponymCountMap.size() != graph.vertexSet().size()) { - throw new WikiApiException("HyponymCountMap does not contain an entry for each node in the graph." 
+ hyponymCountMap.size() + "/" + graph.vertexSet().size()); + Set children = __getChildren(currNode); + + int validChildren = 0; + int sumChildHyponyms = 0; + boolean invalid = false; + for (int child : children) { + if (graph.containsVertex(child)) { + if (hyponymCountMap.containsKey(child)) { + sumChildHyponyms += hyponymCountMap.get(child); + validChildren++; + } else { + invalid = true; + } } + } - scaleHyponymCountMap(); - logger.info("Computed hyponymCountMap"); - serializeMap(hyponymCountMap, hyponymCountMapSerializedFile); - logger.info("Serialized hyponymCountMap"); - } + if (invalid) { + // One of the childs is not in the hyponymCountMap yet + // Re-Enter the node into the queue and continue with next node + queue.add(currNode); + continue; + } + // mark as visited + visited.add(currNode); - /** - * As the categoryGraph is a graph rather than a tree, the hyponymCount for top nodes can be greater than the number of nodes in the graph. - * This is due to the multiple counting of nodes having more than one parent. - * Thus, we have to scale hyponym counts to fall in [0,NumberOfNodes]. - * @throws WikiApiException Thrown if errors occurred. - */ - private void scaleHyponymCountMap() throws WikiApiException { - for (int key : getHyponymCountMap().keySet()) { - if (getHyponymCountMap().get(key) > graph.vertexSet().size()) { -// TODO scaling function is not optimal (to say the least :) - getHyponymCountMap().put(key, (graph.vertexSet().size()-1)); - } + // number of hyponomys of current node is the number of its own hyponomies and the sum of the hyponomies of its children. 
+ int currNodeHyponomyCount = validChildren + sumChildHyponyms; + hyponymCountMap.put(currNode, currNodeHyponomyCount); + + // add parents of current node to queue + for (int parent : __getParents(currNode)) { + if (graph.containsVertex(parent)) { + queue.add(parent); } + } + + } // while queue not empty + + logger.info(visited.size() + " nodes visited"); + if (visited.size() != graph.vertexSet().size()) { + throw new WikiApiException("Visited only " + visited.size() + " out of " + graph.vertexSet().size() + " nodes."); + } + if (hyponymCountMap.size() != graph.vertexSet().size()) { + throw new WikiApiException("HyponymCountMap does not contain an entry for each node in the graph." + hyponymCountMap.size() + "/" + graph.vertexSet().size()); } - /** - * @return The leaf nodes of the graph, i.e. nodes with outdegree = 0. - * @throws WikiApiException - */ - protected Set __getLeafNodes() throws WikiApiException { - Set leafNodes = new HashSet<>(); - for (int node : graph.vertexSet()) { - if (getOutDegree(node) == 0) { - leafNodes.add(node); - } - } - return leafNodes; + scaleHyponymCountMap(); + logger.info("Computed hyponymCountMap"); + serializeMap(hyponymCountMap, hyponymCountMapSerializedFile); + logger.info("Serialized hyponymCountMap"); + } + + + /** + * As the categoryGraph is a graph rather than a tree, the hyponymCount for top nodes can be greater than the number of nodes in the graph. + * This is due to the multiple counting of nodes having more than one parent. + * Thus, we have to scale hyponym counts to fall in [0,NumberOfNodes]. + * + * @throws WikiApiException Thrown if errors occurred. 
+ */ + private void scaleHyponymCountMap() throws WikiApiException { + for (int key : getHyponymCountMap().keySet()) { + if (getHyponymCountMap().get(key) > graph.vertexSet().size()) { +// TODO scaling function is not optimal (to say the least :) + getHyponymCountMap().put(key, (graph.vertexSet().size() - 1)); + } } + } + + /** + * @return The leaf nodes of the graph, i.e. nodes with outdegree = 0. + * @throws WikiApiException + */ + protected Set __getLeafNodes() throws WikiApiException { + Set leafNodes = new HashSet<>(); + for (int node : graph.vertexSet()) { + if (getOutDegree(node) == 0) { + leafNodes.add(node); + } + } + return leafNodes; + } //// The method did not consider that IC has to monotonically decrease from leaves to root node // /** @@ -935,453 +936,464 @@ protected Set __getLeafNodes() throws WikiApiException { // return (1 - (Math.log(numberOfHyponyms + 1) / Math.log(numberOfCategories)) ); // } - /** - * Intrinsic information content (Seco Etal. 2004) allows to compute information content from the structure of the taxonomy (no corpus needed). - * IC(n) = 1 - log( hypo(n) + 1) / log(#cat) - * hypo(n) is the (recursive) number of hyponyms of a node n. Recursive means that the hyponyms of hyponyms are also taken into account - * #cat is the number of categories in the graph - * @param category The category node for which the intrinsic information content should be returned. - * @return The intrinsic information content for this category node. - * @throws WikiApiException Thrown if errors occurred. - */ - public double getIntrinsicInformationContent(Category category) throws WikiApiException { - int node = category.getPageId(); - - int hyponymCount = getHyponymCountMap().get(node); - int numberOfNodes = this.getNumberOfNodes(); - - if (hyponymCount > numberOfNodes) { - throw new WikiApiException("Something is wrong with the hyponymCountMap. 
" + hyponymCount + " hyponyms, but only " + numberOfNodes + " nodes."); - } - - logger.debug(category.getTitle().getPlainTitle() + " has # hyponyms: " + hyponymCount); - - double intrinsicIC = -1; - if (hyponymCount >= 0) { - intrinsicIC = (1 - ( Math.log(hyponymCount + 1) / Math.log(numberOfNodes) ) ); - } - return intrinsicIC; + /** + * Intrinsic information content (Seco Etal. 2004) allows to compute information content from the structure of the taxonomy (no corpus needed). + * IC(n) = 1 - log( hypo(n) + 1) / log(#cat) + * hypo(n) is the (recursive) number of hyponyms of a node n. Recursive means that the hyponyms of hyponyms are also taken into account + * #cat is the number of categories in the graph + * + * @param category The category node for which the intrinsic information content should be returned. + * @return The intrinsic information content for this category node. + * @throws WikiApiException Thrown if errors occurred. + */ + public double getIntrinsicInformationContent(Category category) throws WikiApiException { + int node = category.getPageId(); + + int hyponymCount = getHyponymCountMap().get(node); + int numberOfNodes = this.getNumberOfNodes(); + + if (hyponymCount > numberOfNodes) { + throw new WikiApiException("Something is wrong with the hyponymCountMap. " + hyponymCount + " hyponyms, but only " + numberOfNodes + " nodes."); } - /** - * Computes the paths from each category node to the root. - * Computing n paths will take some time. - * Thus, efficient computing is based on the assumption that all subpaths in the shortest path to the root, are also shortest paths for the corresponding nodes. - * Starting with the leaf nodes gives the longest initial paths with most subpaths. - * @throws WikiApiException Thrown if errors occurred. 
- */ - public void createRootPathMap() throws WikiApiException { - - // do only create rootPathMap, if it was not already computed - if (rootPathMap != null) { - return; - } - - File rootPathFile = new File(wiki.getWikipediaId() + "_" + this.rootPathMapFilename); - - // try to load rootPathMap from precomputed file - if (rootPathFile.exists()) { - logger.info("Loading saved rootPathMap ..."); - rootPathMap = (Map>) deserializeMap(rootPathFile); - logger.info("Done loading saved rootPathMap"); - return; - } + logger.debug(category.getTitle().getPlainTitle() + " has # hyponyms: " + hyponymCount); - logger.info("Computing rootPathMap"); - rootPathMap = new HashMap<>(); + double intrinsicIC = -1; + if (hyponymCount >= 0) { + intrinsicIC = (1 - (Math.log(hyponymCount + 1) / Math.log(numberOfNodes))); + } + return intrinsicIC; + } + + /** + * Computes the paths from each category node to the root. + * Computing n paths will take some time. + * Thus, efficient computing is based on the assumption that all subpaths in the shortest path to the root, are also shortest paths for the corresponding nodes. + * Starting with the leaf nodes gives the longest initial paths with most subpaths. + * + * @throws WikiApiException Thrown if errors occurred. 
+ */ + public void createRootPathMap() throws WikiApiException { + + // do only create rootPathMap, if it was not already computed + if (rootPathMap != null) { + return; + } - // a queue holding the nodes to process - List queue = new ArrayList<>(); + File rootPathFile = new File(wiki.getWikipediaId() + "_" + this.rootPathMapFilename); - // initialize the queue with all leaf nodes - Set leafNodes = this.__getLeafNodes(); - queue.addAll(leafNodes); + // try to load rootPathMap from precomputed file + if (rootPathFile.exists()) { + logger.info("Loading saved rootPathMap ..."); + rootPathMap = (Map>) deserializeMap(rootPathFile); + logger.info("Done loading saved rootPathMap"); + return; + } - logger.info(queue.size() + " leaf nodes."); - fillRootPathMap(queue); + logger.info("Computing rootPathMap"); + rootPathMap = new HashMap<>(); - queue.clear(); // queue should be empty now, but clear anyway + // a queue holding the nodes to process + List queue = new ArrayList<>(); - // add non-leaf nodes that have not been on a shortest, yet - for (Category cat : wiki.getCategories()) { - if (!rootPathMap.containsKey(cat.getPageId())) { - queue.add(cat.getPageId()); - } - } + // initialize the queue with all leaf nodes + Set leafNodes = this.__getLeafNodes(); + queue.addAll(leafNodes); - logger.info(queue.size() + " non leaf nodes not on a shortest leaf-node to root path."); - fillRootPathMap(queue); + logger.info(queue.size() + " leaf nodes."); + fillRootPathMap(queue); - for (Category cat : wiki.getCategories()) { - if (!rootPathMap.containsKey(cat.getPageId())) { - logger.info("no path for " + cat.getPageId()); - } - } - - // from the root path map, we can very easily get the depth - this.depth = getDepthFromRootPathMap(); + queue.clear(); // queue should be empty now, but clear anyway - logger.info("Setting depth of category graph: " + this.depth); - - logger.info("Serializing rootPathMap"); - this.serializeMap(rootPathMap, rootPathFile); + // add non-leaf nodes that have 
not been on a shortest, yet + for (Category cat : wiki.getCategories()) { + if (!rootPathMap.containsKey(cat.getPageId())) { + queue.add(cat.getPageId()); + } } - // TODO the method is only public, because the test deletes the file after creating it - I have no idea at the moment how to do it - /** - * Deleted the root path map file. - * @throws WikiApiException Thrown if errors occurred. - */ - public void deleteRootPathMap() throws WikiApiException { - File rootPathFile = new File(this.rootPathMapFilename + "_" + wiki.getLanguage() + "_" + wiki.getMetaData().getVersion()); - rootPathFile.delete(); - } + logger.info(queue.size() + " non leaf nodes not on a shortest leaf-node to root path."); + fillRootPathMap(queue); - private void fillRootPathMap(List queue) throws WikiApiException { - int root = wiki.getMetaData().getMainCategory().getPageId(); - - // while the queue is not empty - while (!queue.isEmpty()) { - // remove first element from queue - int currentNode = queue.get(0); - queue.remove(0); - - logger.debug("Queue size: " + queue.size()); - - // if we have already insert a path for this node => continue with the next - if (getRootPathMap().containsKey(currentNode)) { - continue; - } - - // compute path from current node to root - List nodesOnPath = getPathToRoot(root, currentNode); - - // if there is no path => skip - if (nodesOnPath == null) { - getRootPathMap().put(currentNode, new ArrayList<>()); - continue; - } - - // the first entry should be the current Node, the last entry should be the root - // check whether this assumption is valid - if (nodesOnPath.get(0) != currentNode || // the first node of the list should always be the current node - nodesOnPath.get(nodesOnPath.size()-1) != root) { // the last node of the list should always be the root node - logger.error("Something is wrong with the path to the root"); - logger.error(nodesOnPath.get(0) + " -- " + currentNode); - logger.error(nodesOnPath.get(nodesOnPath.size()-1) + " -- " + root); - 
logger.error("size = {}", nodesOnPath.size()); - System.exit(1); - } - - int i = 0; - for (int nodeOnPath : nodesOnPath) { - // if we have already insert a path for this node => continue with the next - if (getRootPathMap().containsKey(nodeOnPath)) { - continue; - } - // insert path - else { - getRootPathMap().put(nodeOnPath, new ArrayList<>(nodesOnPath.subList(i, nodesOnPath.size()))); - } - i++; - } - } // while queue not empty + for (Category cat : wiki.getCategories()) { + if (!rootPathMap.containsKey(cat.getPageId())) { + logger.info("no path for " + cat.getPageId()); + } } - /** - * @param pageID The pageID of the category. - * @return The indegree of the given category. - */ - protected int getInDegree(int pageID) { - return graph.inDegreeOf(pageID); + // from the root path map, we can very easily get the depth + this.depth = getDepthFromRootPathMap(); + + logger.info("Setting depth of category graph: " + this.depth); + + logger.info("Serializing rootPathMap"); + this.serializeMap(rootPathMap, rootPathFile); + } + + // TODO the method is only public, because the test deletes the file after creating it - I have no idea at the moment how to do it + + /** + * Deleted the root path map file. + * + * @throws WikiApiException Thrown if errors occurred. 
+ */ + public void deleteRootPathMap() throws WikiApiException { + File rootPathFile = new File(this.rootPathMapFilename + "_" + wiki.getLanguage() + "_" + wiki.getMetaData().getVersion()); + rootPathFile.delete(); + } + + private void fillRootPathMap(List queue) throws WikiApiException { + int root = wiki.getMetaData().getMainCategory().getPageId(); + + // while the queue is not empty + while (!queue.isEmpty()) { + // remove first element from queue + int currentNode = queue.get(0); + queue.remove(0); + + logger.debug("Queue size: " + queue.size()); + + // if we have already insert a path for this node => continue with the next + if (getRootPathMap().containsKey(currentNode)) { + continue; + } + + // compute path from current node to root + List nodesOnPath = getPathToRoot(root, currentNode); + + // if there is no path => skip + if (nodesOnPath == null) { + getRootPathMap().put(currentNode, new ArrayList<>()); + continue; + } + + // the first entry should be the current Node, the last entry should be the root + // check whether this assumption is valid + if (nodesOnPath.get(0) != currentNode || // the first node of the list should always be the current node + nodesOnPath.get(nodesOnPath.size() - 1) != root) { // the last node of the list should always be the root node + logger.error("Something is wrong with the path to the root"); + logger.error(nodesOnPath.get(0) + " -- " + currentNode); + logger.error(nodesOnPath.get(nodesOnPath.size() - 1) + " -- " + root); + logger.error("size = {}", nodesOnPath.size()); + System.exit(1); + } + + int i = 0; + for (int nodeOnPath : nodesOnPath) { + // if we have already insert a path for this node => continue with the next + if (getRootPathMap().containsKey(nodeOnPath)) { + continue; + } + // insert path + else { + getRootPathMap().put(nodeOnPath, new ArrayList<>(nodesOnPath.subList(i, nodesOnPath.size()))); + } + i++; + } + } // while queue not empty + } + + /** + * @param pageID The pageID of the category. 
+ * @return The indegree of the given category. + */ + protected int getInDegree(int pageID) { + return graph.inDegreeOf(pageID); + } + + /** + * @param pageID The pageID of the category. + * @return The outdegree of the given category. + */ + protected int getOutDegree(int pageID) { + return graph.outDegreeOf(pageID); + } + + /** + * @param pageID The pageID of the category. + * @return A set of child nodes of the given category. + */ + protected Set __getChildren(int pageID) { + Set outgoingEdges = graph.outgoingEdgesOf(pageID); + Set outLinks = new HashSet<>(); + for (DefaultEdge edge : outgoingEdges) { + outLinks.add(graph.getEdgeTarget(edge)); } - - /** - * @param pageID The pageID of the category. - * @return The outdegree of the given category. - */ - protected int getOutDegree(int pageID) { - return graph.outDegreeOf(pageID); + return outLinks; + } + + /** + * @param pageID The pageID of the category. + * @return A set of parent nodes of the given category. + */ + protected Set __getParents(int pageID) { + Set incomingEdges = graph.incomingEdgesOf(pageID); + Set inLinks = new HashSet<>(); + for (DefaultEdge edge : incomingEdges) { + inLinks.add(graph.getEdgeSource(edge)); } - - /** - * @param pageID The pageID of the category. - * @return A set of child nodes of the given category. - */ - protected Set __getChildren(int pageID) { - Set outgoingEdges = graph.outgoingEdgesOf(pageID); - Set outLinks = new HashSet<>(); - for (DefaultEdge edge : outgoingEdges) { - outLinks.add(graph.getEdgeTarget(edge)); - } - return outLinks; + return inLinks; + } + + /** + * @return Returns the largest connected component as a new graph. If the base graph already is connected, it simply returns the whole graph. 
+ */ + public CategoryGraph getLargestConnectedComponent() throws WikiApiException { + ConnectivityInspector connectInspect = new ConnectivityInspector<>(graph); + + // if the graph is connected, simply return the whole graph + if (connectInspect.isConnected()) { + return this; } - /** - * @param pageID The pageID of the category. - * @return A set of parent nodes of the given category. - */ - protected Set __getParents(int pageID) { - Set incomingEdges = graph.incomingEdgesOf(pageID); - Set inLinks = new HashSet<>(); - for (DefaultEdge edge : incomingEdges) { - inLinks.add(graph.getEdgeSource(edge)); - } - return inLinks; - } - - /** - * @return Returns the largest connected component as a new graph. If the base graph already is connected, it simply returns the whole graph. - */ - public CategoryGraph getLargestConnectedComponent() throws WikiApiException { - ConnectivityInspector connectInspect = new ConnectivityInspector<>(graph); - - // if the graph is connected, simply return the whole graph - if (connectInspect.isConnected()) { - return this; - } - - // else, get the largest connected component - List> connectedComponentList = connectInspect.connectedSets(); + // else, get the largest connected component + List> connectedComponentList = connectInspect.connectedSets(); - logger.info(connectedComponentList.size() + " connected components."); + logger.info(connectedComponentList.size() + " connected components."); - int i = 0; - int maxSize = 0; - Set largestComponent = new HashSet<>(); - for (Set connectedComponent : connectedComponentList) { - i++; - if (connectedComponent.size() > maxSize) { - maxSize = connectedComponent.size(); - largestComponent = connectedComponent; - } - } - - double largestComponentRatio = largestComponent.size() * 100 / this.getNumberOfNodes(); - logger.info ("Largest component contains " + largestComponentRatio + "% (" + largestComponent.size() + "/" + this.getNumberOfNodes() + ") of the nodes in the graph."); - - return 
CategoryGraphManager.getCategoryGraph(wiki, largestComponent); + int i = 0; + int maxSize = 0; + Set largestComponent = new HashSet<>(); + for (Set connectedComponent : connectedComponentList) { + i++; + if (connectedComponent.size() > maxSize) { + maxSize = connectedComponent.size(); + largestComponent = connectedComponent; + } } - /** - * Get the number of nodes in the graph. - * @return The number of nodes in the graph. - */ - public int getNumberOfNodes() { - return numberOfNodes; + double largestComponentRatio = largestComponent.size() * 100 / this.getNumberOfNodes(); + logger.info("Largest component contains " + largestComponentRatio + "% (" + largestComponent.size() + "/" + this.getNumberOfNodes() + ") of the nodes in the graph."); + + return CategoryGraphManager.getCategoryGraph(wiki, largestComponent); + } + + /** + * Get the number of nodes in the graph. + * + * @return The number of nodes in the graph. + */ + public int getNumberOfNodes() { + return numberOfNodes; + } + + /** + * Get the number of edges in the graph. + * + * @return The number of edges in the graph. + */ + public int getNumberOfEdges() { + return numberOfEdges; + } + + /** + * Computes the average of the path length between all pairs of nodes. + * The graph is treated as an undirected graph. + * Computing graph parameters requires touching all node pairs. + * Therefore, if one is called the others are computed as well and stored for later retrieval. + * + * @return The average of the shortest path lengths between all pairs of nodes. + */ + public double getAverageShortestPathLength() { + if (averageShortestPathLength < 0) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); } - - /** - * Get the number of edges in the graph. - * @return The number of edges in the graph. - */ - public int getNumberOfEdges() { - return numberOfEdges; - } - - /** - * Computes the average of the path length between all pairs of nodes. 
- * The graph is treated as an undirected graph. - * Computing graph parameters requires touching all node pairs. - * Therefore, if one is called the others are computed as well and stored for later retrieval. - * @return The average of the shortest path lengths between all pairs of nodes. - */ - public double getAverageShortestPathLength() { - if (averageShortestPathLength < 0) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return averageShortestPathLength; + return averageShortestPathLength; + } + + /** + * Computes the diameter of the graph (the maximum of the shortest path length between all pairs of nodes) + * The graph is treated as a undirected graph. + * Computing graph parameters requires touching all node pairs. + * Therefore, if one is called the others are computed as well and stored for later retrieval. + * + * @return The diameter of the graph. + */ + public double getDiameter() { + if (diameter < 0) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); } - - /** - * Computes the diameter of the graph (the maximum of the shortest path length between all pairs of nodes) - * The graph is treated as a undirected graph. - * Computing graph parameters requires touching all node pairs. - * Therefore, if one is called the others are computed as well and stored for later retrieval. - * @return The diameter of the graph. - */ - public double getDiameter() { - if (diameter < 0) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return diameter; + return diameter; + } + + /** + * Computes the average degree. The degree of a node is the number of edges edges that it is connected with. + * The graph is treated as an undirected graph. + * Computing graph parameters requires touching all node pairs. + * Therefore, if one is called the others are computed as well and stored for later retrieval. 
+ * + * @return The average degree of the graph. + */ + public double getAverageDegree() { + if (averageDegree < 0) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); } - - /** - * Computes the average degree. The degree of a node is the number of edges edges that it is connected with. - * The graph is treated as an undirected graph. - * Computing graph parameters requires touching all node pairs. - * Therefore, if one is called the others are computed as well and stored for later retrieval. - * @return The average degree of the graph. - */ - public double getAverageDegree() { - if (averageDegree < 0) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return averageDegree; + return averageDegree; + } + + /** + * Compute the cluster coefficient of the graph (after Watts and Strogatz 1998) + * Cluster coefficient C is defined as the average of C_v over all edges. + * C_v is the fraction of the connections that exist between the neighbor nodes (k_v) of a vertex v and all allowable connections between the neighbors (k_v(k_v -1)/2). + * C_v = 2 * number of connections between / k_v*(k_v -1) + * + * @return The cluster coefficient. + */ + public double getClusterCoefficient() { + if (clusterCoefficient < 0) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); } - - /** - * Compute the cluster coefficient of the graph (after Watts and Strogatz 1998) - * Cluster coefficient C is defined as the average of C_v over all edges. - * C_v is the fraction of the connections that exist between the neighbor nodes (k_v) of a vertex v and all allowable connections between the neighbors (k_v(k_v -1)/2). - * C_v = 2 * number of connections between / k_v*(k_v -1) - * @return The cluster coefficient. 
- */ - public double getClusterCoefficient() { - if (clusterCoefficient < 0) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return clusterCoefficient; + return clusterCoefficient; + } + + /** + * Computes the degree distribution. The degree of a node is the number of edges that it is connected with. + * The graph is treated as an undirected graph. + * Computing graph parameters requires touching all node pairs. + * Therefore, if one is called the others are computed as well and stored for later retrieval. + * + * @return A map with the degree distribution of the graph. + */ + public Map getDegreeDistribution() { + if (degreeDistribution == null) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); } - - /** - * Computes the degree distribution. The degree of a node is the number of edges that it is connected with. - * The graph is treated as an undirected graph. - * Computing graph parameters requires touching all node pairs. - * Therefore, if one is called the others are computed as well and stored for later retrieval. - * @return A map with the degree distribution of the graph. - */ - public Map getDegreeDistribution() { - if (degreeDistribution == null) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return degreeDistribution; + return degreeDistribution; + } + + + /** + * Get the number of connections that exist between the neighbors of a node. + * + * @param node The node under consideration. + * @return The number of connections that exist between the neighbors of node. 
+ */ + private int getNumberOfNeighborConnections(int node) { + int numberOfConnections = 0; + + // get the set of neighbors + Set neighbors = getNeighbors(node); + + if (neighbors.size() > 0) { + // for each pair of neighbors, test if there is a connection + Object[] nodeArray = neighbors.toArray(); + // sort the Array so we can use a simple iteration with two for loops to access all pairs + Arrays.sort(nodeArray); + + for (int i = 0; i < neighbors.size(); i++) { + int outerNode = (Integer) nodeArray[i]; + for (int j = i + 1; j < neighbors.size(); j++) { + int innerNode = (Integer) nodeArray[j]; + // in case of a connection - increade connection counter + // order of the nodes doesn't matter for undirected graphs + if (undirectedGraph.containsEdge(innerNode, outerNode)) { + numberOfConnections++; + } + } + } } - - /** - * Get the number of connections that exist between the neighbors of a node. - * @param node The node under consideration. - * @return The number of connections that exist between the neighbors of node. - */ - private int getNumberOfNeighborConnections(int node) { - int numberOfConnections = 0; - - // get the set of neighbors - Set neighbors = getNeighbors(node); - - if (neighbors.size() > 0) { - // for each pair of neighbors, test if there is a connection - Object[] nodeArray = neighbors.toArray(); - // sort the Array so we can use a simple iteration with two for loops to access all pairs - Arrays.sort(nodeArray); - - for (int i=0; i getNeighbors(int node) { - - Set neighbors = new HashSet<>(); - Set edges = undirectedGraph.edgesOf(node); - for (DefaultEdge edge : edges) { - if (undirectedGraph.getEdgeSource(edge) != node) { - neighbors.add(undirectedGraph.getEdgeSource(edge)); - } - if (undirectedGraph.getEdgeTarget(edge) != node) { - neighbors.add(undirectedGraph.getEdgeTarget(edge)); - } - } - return neighbors; + return numberOfConnections; + } + + /** + * Get the neighbors of a given node. 
+ * The category graph is treated as an undirected graph. + * + * @param node the reference node. + * @return The set of category nodes that are neighbors of this category. + */ + protected Set getNeighbors(int node) { + + Set neighbors = new HashSet<>(); + Set edges = undirectedGraph.edgesOf(node); + for (DefaultEdge edge : edges) { + if (undirectedGraph.getEdgeSource(edge) != node) { + neighbors.add(undirectedGraph.getEdgeSource(edge)); + } + if (undirectedGraph.getEdgeTarget(edge) != node) { + neighbors.add(undirectedGraph.getEdgeTarget(edge)); + } } + return neighbors; + } - private void updateDegreeDistribution(int nodeDegree) { - if (degreeDistribution.containsKey(nodeDegree)) { - degreeDistribution.put(nodeDegree, (degreeDistribution.get(nodeDegree) + 1)); - } - else { - degreeDistribution.put(nodeDegree, 1); - } + private void updateDegreeDistribution(int nodeDegree) { + if (degreeDistribution.containsKey(nodeDegree)) { + degreeDistribution.put(nodeDegree, (degreeDistribution.get(nodeDegree) + 1)); + } else { + degreeDistribution.put(nodeDegree, 1); } + } - /** - * Computes and sets the diameter, the average degree and the average shortest path length of the graph. - * Do not call this in the constructor. May run a while. - * It is called in the getters, if parameters are not yet initialized when retrieved. - */ - private void setGraphParameters() { + /** + * Computes and sets the diameter, the average degree and the average shortest path length of the graph. + * Do not call this in the constructor. May run a while. + * It is called in the getters, if parameters are not yet initialized when retrieved. 
+ */ + private void setGraphParameters() { - // Diameter is the maximum of all shortest path lengths - // Average shortest path length is (as the name says) the average of the shortest path length between all node pairs + // Diameter is the maximum of all shortest path lengths + // Average shortest path length is (as the name says) the average of the shortest path length between all node pairs - double maxPathLength = 0.0; - double shortestPathLengthSum = 0.0; - double degreeSum = 0.0; - double clusterCoefficientSum = 0.0; + double maxPathLength = 0.0; + double shortestPathLengthSum = 0.0; + double degreeSum = 0.0; + double clusterCoefficientSum = 0.0; - // iterate over all node pairs - Set nodes = undirectedGraph.vertexSet(); + // iterate over all node pairs + Set nodes = undirectedGraph.vertexSet(); - // a hashset of the nodes which have been the start node of the computation process - // for such nodes all path lengths have beeen already computed - Set wasSource = new HashSet<>(); + // a hashset of the nodes which have been the start node of the computation process + // for such nodes all path lengths have beeen already computed + Set wasSource = new HashSet<>(); - int progress = 0; - for (int node : nodes) { + int progress = 0; + for (int node : nodes) { - progress++; - ApiUtilities.printProgressInfo(progress, nodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "Getting graph parameters"); + progress++; + ApiUtilities.printProgressInfo(progress, nodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "Getting graph parameters"); - int nodeDegree = undirectedGraph.degreeOf(node); - degreeSum += nodeDegree; - updateDegreeDistribution(nodeDegree); + int nodeDegree = undirectedGraph.degreeOf(node); + degreeSum += nodeDegree; + updateDegreeDistribution(nodeDegree); - // cluster coefficient of a node is C_v is the fraction of the connections that exist between the neighbor nodes (k_v) of a this node and all allowable connections between the neighbors (k_v(k_v 
-1)/2) - // for degrees 0 or 1 there is no cluster coefficient, as there can be no connections between neighbors - if (undirectedGraph.degreeOf(node) > 1) { - double numberOfNeighborConnections = getNumberOfNeighborConnections(node); - clusterCoefficientSum += ( numberOfNeighborConnections / (nodeDegree * (nodeDegree - 1))); - } + // cluster coefficient of a node is C_v is the fraction of the connections that exist between the neighbor nodes (k_v) of a this node and all allowable connections between the neighbors (k_v(k_v -1)/2) + // for degrees 0 or 1 there is no cluster coefficient, as there can be no connections between neighbors + if (undirectedGraph.degreeOf(node) > 1) { + double numberOfNeighborConnections = getNumberOfNeighborConnections(node); + clusterCoefficientSum += (numberOfNeighborConnections / (nodeDegree * (nodeDegree - 1))); + } - // Returns the new shortestPathLengthSum and the new maxPathLength. - // They are returned as an double array for performance reasons. - // I do not want to create an object, as this function is called *very* often - double[] returnValues = computeShortestPathLenghts(node, shortestPathLengthSum, maxPathLength, wasSource); - shortestPathLengthSum = returnValues[0]; - maxPathLength = returnValues[1]; + // Returns the new shortestPathLengthSum and the new maxPathLength. + // They are returned as an double array for performance reasons. 
+ // I do not want to create an object, as this function is called *very* often + double[] returnValues = computeShortestPathLenghts(node, shortestPathLengthSum, maxPathLength, wasSource); + shortestPathLengthSum = returnValues[0]; + maxPathLength = returnValues[1]; - // save the info that the node was already used as the source of path computation - wasSource.add(node); - } + // save the info that the node was already used as the source of path computation + wasSource.add(node); + } - if (nodes.size() > 1) { - this.averageShortestPathLength = shortestPathLengthSum / ( nodes.size() * (nodes.size()-1) / 2 ); // sum of path lengths / (number of node pairs) - } - else { - this.averageShortestPathLength = 0; // there is only one node - } - this.diameter = maxPathLength; - this.averageDegree = degreeSum / nodes.size(); - this.clusterCoefficient = clusterCoefficientSum / nodes.size(); + if (nodes.size() > 1) { + this.averageShortestPathLength = shortestPathLengthSum / (nodes.size() * (nodes.size() - 1) / 2); // sum of path lengths / (number of node pairs) + } else { + this.averageShortestPathLength = 0; // there is only one node } + this.diameter = maxPathLength; + this.averageDegree = degreeSum / nodes.size(); + this.clusterCoefficient = clusterCoefficientSum / nodes.size(); + } // /** // * Computes and sets the diameter, the average degree and the average shortest path length of the graph. @@ -1448,229 +1460,232 @@ private void setGraphParameters() { // this.clusterCoefficient = clusterCoefficientSum / nodes.size(); // } - /** - * Computes the shortest path from node to all other nodes. - * Paths to nodes that have already been the source of the shortest path computation - * are omitted (the path was already added to the path sum). - * Updates the sum of shortest path lengths and the diameter of the graph. - * As the JGraphT BreadthFirstIterator does not provide information about - * the distance to the start node in each step, we will use our own BFS implementation. 
- * @param pStartNode The start node of the search. - * @param pShortestPathLengthSum The sum of the shortes path lengths. - * @param pMaxPathLength The maximum path length found so far. - * @param pWasSource A set of nodes which have been the start node of the computation process. For such nodes all path lengths have beeen already computed. - * @return An array of double values. - * The first value is the shortestPathLengthSum and the second value is the maxPathLength. - * They are returned as an double array for performance reasons. - * I do not want to create an object, as this function is called *very* often. - */ - private double[] computeShortestPathLenghts(int pStartNode, double pShortestPathLengthSum, double pMaxPathLength, Set pWasSource) { - - // a set of nodes that have already been expanded -> algorithm should expand nodes monotonically and not go back - Set alreadyExpanded = new HashSet<>(); - - // a queue holding the newly discovered nodes with their distance to the start node - List queue = new ArrayList<>(); - - // initialize queue with start node - int[] innerList = new int[2]; - innerList[0] = pStartNode; // the node - innerList[1] = 0; // the distance to the start node - queue.add(innerList); - - // while the queue is not empty - while (!queue.isEmpty()) { - // remove first element from queue - int[] queueElement = queue.get(0); - int currentNode = queueElement[0]; - int distance = queueElement[1]; - queue.remove(0); - - // if the node was not already expanded - if (!alreadyExpanded.contains(currentNode)) { - // the node gets expanded now - alreadyExpanded.add(currentNode); - - // if the node was a source node in a previous run, we already have added this path - if (!pWasSource.contains(currentNode)) { - // add the distance of this node to shortestPathLengthSum - // check if maxPathLength must be updated - pShortestPathLengthSum += distance; - if (distance > pMaxPathLength) { - pMaxPathLength = distance; - } - } - // even if the node was a source 
node in a previous run there can be a path to other nodes over this node, so go on - - // get the neighbors of the queue element - Set neighbors = getNeighbors(currentNode); - - // iterate over all neighbors - for (int neighbor : neighbors) { - // if the node was not already expanded - if (!alreadyExpanded.contains(neighbor)) { - // add the node to the queue, increase node distance by one - int[] tmpList = new int[2]; - tmpList[0] = neighbor; - tmpList[1] = (distance + 1); - queue.add(tmpList); - } - } - } - } - double[] returnArray = {pShortestPathLengthSum, pMaxPathLength}; - return returnArray; + /** + * Computes the shortest path from node to all other nodes. + * Paths to nodes that have already been the source of the shortest path computation + * are omitted (the path was already added to the path sum). + * Updates the sum of shortest path lengths and the diameter of the graph. + * As the JGraphT BreadthFirstIterator does not provide information about + * the distance to the start node in each step, we will use our own BFS implementation. + * + * @param pStartNode The start node of the search. + * @param pShortestPathLengthSum The sum of the shortes path lengths. + * @param pMaxPathLength The maximum path length found so far. + * @param pWasSource A set of nodes which have been the start node of the computation process. For such nodes all path lengths have beeen already computed. + * @return An array of double values. + * The first value is the shortestPathLengthSum and the second value is the maxPathLength. + * They are returned as an double array for performance reasons. + * I do not want to create an object, as this function is called *very* often. 
+ */ + private double[] computeShortestPathLenghts(int pStartNode, double pShortestPathLengthSum, double pMaxPathLength, Set pWasSource) { + + // a set of nodes that have already been expanded -> algorithm should expand nodes monotonically and not go back + Set alreadyExpanded = new HashSet<>(); + + // a queue holding the newly discovered nodes with their distance to the start node + List queue = new ArrayList<>(); + + // initialize queue with start node + int[] innerList = new int[2]; + innerList[0] = pStartNode; // the node + innerList[1] = 0; // the distance to the start node + queue.add(innerList); + + // while the queue is not empty + while (!queue.isEmpty()) { + // remove first element from queue + int[] queueElement = queue.get(0); + int currentNode = queueElement[0]; + int distance = queueElement[1]; + queue.remove(0); + + // if the node was not already expanded + if (!alreadyExpanded.contains(currentNode)) { + // the node gets expanded now + alreadyExpanded.add(currentNode); + + // if the node was a source node in a previous run, we already have added this path + if (!pWasSource.contains(currentNode)) { + // add the distance of this node to shortestPathLengthSum + // check if maxPathLength must be updated + pShortestPathLengthSum += distance; + if (distance > pMaxPathLength) { + pMaxPathLength = distance; + } + } + // even if the node was a source node in a previous run there can be a path to other nodes over this node, so go on + + // get the neighbors of the queue element + Set neighbors = getNeighbors(currentNode); + + // iterate over all neighbors + for (int neighbor : neighbors) { + // if the node was not already expanded + if (!alreadyExpanded.contains(neighbor)) { + // add the node to the queue, increase node distance by one + int[] tmpList = new int[2]; + tmpList[0] = neighbor; + tmpList[1] = (distance + 1); + queue.add(tmpList); + } + } + } } + double[] returnArray = {pShortestPathLengthSum, pMaxPathLength}; + return returnArray; + } + + /** + * 
This parameter is already set in the constructor as it is needed for computation of relatedness values. + * Therefore its computation does not trigger setGraphParameters (it is too slow), even if the depth is implicitly determined there, too. + * + * @return The depth of the category graph, i.e. the maximum path length starting with the root node. + * @throws WikiApiException Thrown if errors occurred. + */ + public double getDepth() throws WikiApiException { + if (depth < 0) { // has not been initialized + if (rootPathMap != null) { + this.depth = getDepthFromRootPathMap(); + logger.info("Getting depth from RootPathMap: " + this.depth); - /** - * This parameter is already set in the constructor as it is needed for computation of relatedness values. - * Therefore its computation does not trigger setGraphParameters (it is too slow), even if the depth is implicitly determined there, too. - * @return The depth of the category graph, i.e. the maximum path length starting with the root node. - * @throws WikiApiException Thrown if errors occurred. - */ - public double getDepth() throws WikiApiException { - if (depth < 0) { // has not been initialized - if (rootPathMap != null) { - this.depth = getDepthFromRootPathMap(); - logger.info("Getting depth from RootPathMap: " + this.depth); - - } - else { - depth = computeDepth(); - logger.info("Computing depth of the hierarchy: " + this.depth); - } - } - return depth; - } - - /** - * This parameter is already set in the constructor as it is needed for computation of relatedness values. - * Therefore its computation does not trigger setGraphParameters (it is too slow), even if the depth is implicitly determined there, too. - * @return The depth of the category graph, i.e. the maximum path length starting with the root node. - * @throws WikiApiException Thrown if errors occurred. 
- */ - private double getDepthFromRootPathMap() throws WikiApiException { - int max = 0; - for (List path : getRootPathMap().values()) { - if (path.size() > max) { - max = path.size(); - } - } - - max = max - 1; // depth is measured in nodes, not edges - - if (max < 0) { - return 0; - } - else { - return max; - } + } else { + depth = computeDepth(); + logger.info("Computing depth of the hierarchy: " + this.depth); + } } - - /** - * Computes the depth of the category graph, i.e. the maximum path length starting with the root node. - * @return The depth of the hierarchy. - * @throws WikiApiException Thrown if errors occurred. - */ - private double computeDepth() throws WikiApiException { - Category root = wiki.getMetaData().getMainCategory(); - if (root == null) { - logger.error("There is no root node for this wiki. Check the parameter that provides the name of the root node."); - return 0.0; - } - // test whether the root category is in this graph - if (!graph.containsVertex(root.getPageId())) { - logger.error("The root node is not part of this graph. Cannot compute depth of this graph. Setting depth to 0.0"); - return 0.0; - } - double maxPathLength = 0.0; - double[] returnValues = computeShortestPathLenghts(root.getPageId(), 0.0, maxPathLength, new HashSet<>()); - maxPathLength = returnValues[1]; - return maxPathLength; + return depth; + } + + /** + * This parameter is already set in the constructor as it is needed for computation of relatedness values. + * Therefore its computation does not trigger setGraphParameters (it is too slow), even if the depth is implicitly determined there, too. + * + * @return The depth of the category graph, i.e. the maximum path length starting with the root node. + * @throws WikiApiException Thrown if errors occurred. 
+ */ + private double getDepthFromRootPathMap() throws WikiApiException { + int max = 0; + for (List path : getRootPathMap().values()) { + if (path.size() > max) { + max = path.size(); + } } - public String getGraphInfo() { - StringBuffer sb = new StringBuffer(1000); - Map degreeDistribution = getDegreeDistribution(); - - sb.append("Number of Nodes: " + getNumberOfNodes() + LF); - sb.append("Number of Edges: " + getNumberOfEdges() + LF); - sb.append("Avg. path length: " + getAverageShortestPathLength() + LF); - sb.append("Diameter: " + getDiameter() + LF); - sb.append("Average degree: " + getAverageDegree() + LF); - sb.append("Cluster coefficient: " + getClusterCoefficient() + LF); - sb.append("Degree distribution: " + CommonUtilities.getMapContents(degreeDistribution) + LF); + max = max - 1; // depth is measured in nodes, not edges - return sb.toString(); + if (max < 0) { + return 0; + } else { + return max; } - - /** - * @return Returns the graph. - */ - public DefaultDirectedGraph getGraph() { - return graph; + } + + /** + * Computes the depth of the category graph, i.e. the maximum path length starting with the root node. + * + * @return The depth of the hierarchy. + * @throws WikiApiException Thrown if errors occurred. + */ + private double computeDepth() throws WikiApiException { + Category root = wiki.getMetaData().getMainCategory(); + if (root == null) { + logger.error("There is no root node for this wiki. Check the parameter that provides the name of the root node."); + return 0.0; } - - public AsUndirectedGraph getUndirectedGraph() - { - return undirectedGraph; - } - - public Map getHyponymCountMap() throws WikiApiException { - if (hyponymCountMap == null) { - createHyponymCountMap(); - } - return this.hyponymCountMap; + // test whether the root category is in this graph + if (!graph.containsVertex(root.getPageId())) { + logger.error("The root node is not part of this graph. Cannot compute depth of this graph. 
Setting depth to 0.0"); + return 0.0; } - - public Map> getRootPathMap() throws WikiApiException { - if (rootPathMap == null) { - createRootPathMap(); - } - return this.rootPathMap; + double maxPathLength = 0.0; + double[] returnValues = computeShortestPathLenghts(root.getPageId(), 0.0, maxPathLength, new HashSet<>()); + maxPathLength = returnValues[1]; + return maxPathLength; + } + + public String getGraphInfo() { + StringBuffer sb = new StringBuffer(1000); + Map degreeDistribution = getDegreeDistribution(); + + sb.append("Number of Nodes: " + getNumberOfNodes() + LF); + sb.append("Number of Edges: " + getNumberOfEdges() + LF); + sb.append("Avg. path length: " + getAverageShortestPathLength() + LF); + sb.append("Diameter: " + getDiameter() + LF); + sb.append("Average degree: " + getAverageDegree() + LF); + sb.append("Cluster coefficient: " + getClusterCoefficient() + LF); + sb.append("Degree distribution: " + CommonUtilities.getMapContents(degreeDistribution) + LF); + + return sb.toString(); + } + + /** + * @return Returns the graph. + */ + public DefaultDirectedGraph getGraph() { + return graph; + } + + public AsUndirectedGraph getUndirectedGraph() { + return undirectedGraph; + } + + public Map getHyponymCountMap() throws WikiApiException { + if (hyponymCountMap == null) { + createHyponymCountMap(); } + return this.hyponymCountMap; + } - /** - * Serialize a Map. - * - * @param map The map to serialize. - * @param file The file for saving the map. - */ - private void serializeMap(Map map, File file) { - try(ObjectOutputStream os = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(file)))){ - os.writeObject(map); - } catch (Exception e) { - logger.error(e.getLocalizedMessage(), e); - } + public Map> getRootPathMap() throws WikiApiException { + if (rootPathMap == null) { + createRootPathMap(); } - - /** - * Deserialize a map - * @param file The file with the map. 
- */ - private Map deserializeMap(File file) { - Map map; - try(ObjectInputStream is = new ObjectInputStream(new BufferedInputStream(new FileInputStream(file)))) { - map = (Map) is.readObject(); - } catch (Exception e) { - logger.error(e.getLocalizedMessage(), e); - return null; - } - return map; + return this.rootPathMap; + } + + /** + * Serialize a Map. + * + * @param map The map to serialize. + * @param file The file for saving the map. + */ + private void serializeMap(Map map, File file) { + try (ObjectOutputStream os = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(file)))) { + os.writeObject(map); + } catch (Exception e) { + logger.error(e.getLocalizedMessage(), e); } - - /** - * Serializes the graph to the given destination. - * @param destination The destination to which should be saved. - * @throws WikiApiException Thrown if errors occurred. - */ - // TODO should be refactored a bit. - public void saveGraph(String destination) throws WikiApiException { - try { - GraphSerialization.saveGraph(graph, destination); - } catch (IOException e) { - throw new WikiApiException(e); - } + } + + /** + * Deserialize a map + * + * @param file The file with the map. + */ + private Map deserializeMap(File file) { + Map map; + try (ObjectInputStream is = new ObjectInputStream(new BufferedInputStream(new FileInputStream(file)))) { + map = (Map) is.readObject(); + } catch (Exception e) { + logger.error(e.getLocalizedMessage(), e); + return null; + } + return map; + } + + /** + * Serializes the graph to the given destination. + * + * @param destination The destination to which should be saved. + * @throws WikiApiException Thrown if errors occurred. + */ + // TODO should be refactored a bit. 
+ public void saveGraph(String destination) throws WikiApiException { + try { + GraphSerialization.saveGraph(graph, destination); + } catch (IOException e) { + throw new WikiApiException(e); } + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraphManager.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraphManager.java index f54a3938..81e04c34 100755 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraphManager.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraphManager.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,94 +34,92 @@ // There should be no way to construct a category graph that circumvents the manager. 
public class CategoryGraphManager { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private static Map catGraphMap; + private static Map catGraphMap; - private final static String catGraphSerializationFilename = "catGraphSer"; + private final static String catGraphSerializationFilename = "catGraphSer"; - public static CategoryGraph getCategoryGraph(Wikipedia wiki) throws WikiApiException { - return getCategoryGraph(wiki, null, true); - } + public static CategoryGraph getCategoryGraph(Wikipedia wiki) throws WikiApiException { + return getCategoryGraph(wiki, null, true); + } + + public static CategoryGraph getCategoryGraph(Wikipedia wiki, boolean serialize) throws WikiApiException { + return getCategoryGraph(wiki, null, serialize); + } - public static CategoryGraph getCategoryGraph(Wikipedia wiki, boolean serialize) throws WikiApiException { - return getCategoryGraph(wiki, null, serialize); + public static CategoryGraph getCategoryGraph(Wikipedia wiki, Set pageIds) throws WikiApiException { + return getCategoryGraph(wiki, pageIds, true); + } + + public static CategoryGraph getCategoryGraph(Wikipedia wiki, Set pageIds, boolean serialize) throws WikiApiException { + if (catGraphMap == null) { + catGraphMap = new HashMap<>(); } - public static CategoryGraph getCategoryGraph(Wikipedia wiki, Set pageIds) throws WikiApiException { - return getCategoryGraph(wiki, pageIds, true); + String wikiID = wiki.getWikipediaId(); + if (catGraphMap.containsKey(wikiID)) { + return catGraphMap.get(wikiID); } - public static CategoryGraph getCategoryGraph(Wikipedia wiki, Set pageIds, boolean serialize) throws WikiApiException { - if (catGraphMap == null) { - catGraphMap = new HashMap<>(); - } - - String wikiID = wiki.getWikipediaId(); - if (catGraphMap.containsKey(wikiID)) { - return catGraphMap.get(wikiID); - } - - String size = ""; 
- if (pageIds != null) { - size = Integer.valueOf(pageIds.size()).toString(); - } - - CategoryGraph catGraph; - if (serialize) { - catGraph = tryToLoadCategoryGraph(wiki, wikiID, size); - if (catGraph != null) { - catGraphMap.put(wikiID, catGraph); - return catGraph; - } - } - - - // could not be loaded (= no serialized category graph was written so far) => create it - if (pageIds != null) { - catGraph = new CategoryGraph(wiki, pageIds); - } - else { - catGraph = new CategoryGraph(wiki); - } + String size = ""; + if (pageIds != null) { + size = Integer.valueOf(pageIds.size()).toString(); + } + CategoryGraph catGraph; + if (serialize) { + catGraph = tryToLoadCategoryGraph(wiki, wikiID, size); + if (catGraph != null) { catGraphMap.put(wikiID, catGraph); + return catGraph; + } + } - if (serialize) { - saveCategoryGraph(catGraph, wikiID, size); - } - return catGraph; + // could not be loaded (= no serialized category graph was written so far) => create it + if (pageIds != null) { + catGraph = new CategoryGraph(wiki, pageIds); + } else { + catGraph = new CategoryGraph(wiki); } - private static CategoryGraph tryToLoadCategoryGraph(Wikipedia wiki, String wikiId, String size) throws WikiApiException { - - String defaultSerializedGraphLocation = getCategoryGraphSerializationFileName(wikiId, size); - File defaulSerializedGraphFile = new File(defaultSerializedGraphLocation); - if (defaulSerializedGraphFile.exists()) { - try { - logger.info("Loading category graph from " + defaultSerializedGraphLocation); - return new CategoryGraph(wiki, GraphSerialization.loadGraph(defaultSerializedGraphLocation)); - } catch (IOException | ClassNotFoundException e) { - throw new WikiApiException(e); - } - } - else { - return null; - } - } - - private static void saveCategoryGraph(CategoryGraph catGraph, String wikiId, String size) throws WikiApiException { - String defaultSerializedGraphLocation = getCategoryGraphSerializationFileName(wikiId, size); - try { - logger.info("Saving category 
graph to " + defaultSerializedGraphLocation); - GraphSerialization.saveGraph(catGraph.getGraph(), defaultSerializedGraphLocation); - } catch (IOException e) { - throw new WikiApiException(e); - } - } + catGraphMap.put(wikiID, catGraph); + + if (serialize) { + saveCategoryGraph(catGraph, wikiID, size); + } + + return catGraph; + } - private static String getCategoryGraphSerializationFileName(String wikiId, String size) { - return catGraphSerializationFilename + "_" + wikiId + size; + private static CategoryGraph tryToLoadCategoryGraph(Wikipedia wiki, String wikiId, String size) throws WikiApiException { + + String defaultSerializedGraphLocation = getCategoryGraphSerializationFileName(wikiId, size); + File defaulSerializedGraphFile = new File(defaultSerializedGraphLocation); + if (defaulSerializedGraphFile.exists()) { + try { + logger.info("Loading category graph from " + defaultSerializedGraphLocation); + return new CategoryGraph(wiki, GraphSerialization.loadGraph(defaultSerializedGraphLocation)); + } catch (IOException | ClassNotFoundException e) { + throw new WikiApiException(e); } + } else { + return null; + } + } + + private static void saveCategoryGraph(CategoryGraph catGraph, String wikiId, String size) throws WikiApiException { + String defaultSerializedGraphLocation = getCategoryGraphSerializationFileName(wikiId, size); + try { + logger.info("Saving category graph to " + defaultSerializedGraphLocation); + GraphSerialization.saveGraph(catGraph.getGraph(), defaultSerializedGraphLocation); + } catch (IOException e) { + throw new WikiApiException(e); + } + } + + private static String getCategoryGraphSerializationFileName(String wikiId, String size) { + return catGraphSerializationFilename + "_" + wikiId + size; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterable.java index 6f5e5568..220d618c 100644 --- 
a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterable.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,35 +19,34 @@ import java.util.Iterator; - /** - * An iterable over category objects. - * + * An {@link Iterable} over {@link Category} objects. */ public class CategoryIterable implements Iterable { - private final Wikipedia wiki; - - /** - * The size of the page buffer. - * With bufferSize = 1, a database connection is needed for retrieving a single article. - * Higher bufferSize gives better performance, but needs memory. - * Initialize it with 500. - */ - private int bufferSize = 500; - - public CategoryIterable(Wikipedia wiki) { - this.wiki = wiki; - } - - public CategoryIterable(Wikipedia wiki, int bufferSize) { - this.wiki = wiki; - this.bufferSize = bufferSize; - } - - public Iterator iterator() { - return new CategoryIterator(wiki, bufferSize); - } + private final Wikipedia wiki; + + /* + * The size of the page buffer. + * With bufferSize = 1, a database connection is needed for retrieving a single article. + * Higher bufferSize gives better performance, but needs memory. + * Initialize it with 500. 
+ */ + private int bufferSize = 500; + + public CategoryIterable(Wikipedia wiki) { + this.wiki = wiki; + } + + public CategoryIterable(Wikipedia wiki, int bufferSize) { + this.wiki = wiki; + this.bufferSize = bufferSize; + } + + @Override + public Iterator iterator() { + return new CategoryIterator(wiki, bufferSize); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterator.java index 6595b79c..c4c92fb6 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,204 +22,139 @@ import java.util.Iterator; import java.util.List; -import org.hibernate.Session; - import org.dkpro.jwpl.api.exception.WikiApiException; +import org.hibernate.Session; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * An iterator over category objects. - * + * An {@link Iterator} over {@link Category} objects. 
*/ public class CategoryIterator implements Iterator { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final CategoryBuffer buffer; - - public CategoryIterator(Wikipedia wiki, int bufferSize) { - buffer = new CategoryBuffer(bufferSize, wiki); - } - - public boolean hasNext(){ - return buffer.hasNext(); - } - - public Category next(){ - return buffer.next(); - } - - public void remove() { - throw new UnsupportedOperationException(); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final CategoryBuffer buffer; + + public CategoryIterator(Wikipedia wiki, int bufferSize) { + buffer = new CategoryBuffer(bufferSize, wiki); + } + + @Override + public boolean hasNext() { + return buffer.hasNext(); + } + + @Override + public Category next() { + return buffer.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * Buffers categories in a list. + */ + static class CategoryBuffer { + + private final Wikipedia wiki; + + private final List buffer; + private final int maxBufferSize; // the number of pages to be buffered after a query to the database. + private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements + private int bufferOffset; // the offset in the buffer + private int dataOffset; // the overall offset in the data + + public CategoryBuffer(int bufferSize, Wikipedia wiki) { + this.maxBufferSize = bufferSize; + this.wiki = wiki; + this.buffer = new ArrayList<>(); + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.dataOffset = 0; + //TODO test whether this works when zero pages are retrieved } /** - * Buffers categories in a list. - * + * If there are elements in the buffer left, then return true. + * If the end of the filled buffer is reached, then try to load new buffer. * + * @return True, if there are pages left. False otherwise. 
*/ - class CategoryBuffer{ - - private final Wikipedia wiki; - - private final List buffer; - private final int maxBufferSize; // the number of pages to be buffered after a query to the database. - private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements - private int bufferOffset; // the offset in the buffer - private int dataOffset; // the overall offset in the data - - public CategoryBuffer(int bufferSize, Wikipedia wiki){ - this.maxBufferSize = bufferSize; - this.wiki = wiki; - this.buffer = new ArrayList<>(); - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.dataOffset = 0; - //TODO test whether this works when zero pages are retrieved - } - - /** - * If there are elements in the buffer left, then return true. - * If the end of the filled buffer is reached, then try to load new buffer. - * @return True, if there are pages left. False otherwise. - */ - public boolean hasNext(){ - if (bufferOffset < bufferFillSize) { - return true; - } - else { - return this.fillBuffer(); - } - } + public boolean hasNext() { + if (bufferOffset < bufferFillSize) { + return true; + } else { + return this.fillBuffer(); + } + } - /** - * - * @return The next Category or null if no more categories are available. - */ - public Category next(){ - // if there are still elements in the buffer, just retrieve the next one - if (bufferOffset < bufferFillSize) { - return this.getBufferElement(); - } - // if there are no more elements => try to fill a new buffer - else if (this.fillBuffer()) { - return this.getBufferElement(); - } - else { - // if it cannot be filled => return null - return null; - } - } + /** + * @return The next Category or null if no more categories are available. 
+ */ + public Category next() { + // if there are still elements in the buffer, just retrieve the next one + if (bufferOffset < bufferFillSize) { + return this.getBufferElement(); + } + // if there are no more elements => try to fill a new buffer + else if (this.fillBuffer()) { + return this.getBufferElement(); + } else { + // if it cannot be filled => return null + return null; + } + } - private Category getBufferElement() { - Category cat = buffer.get(bufferOffset); - bufferOffset++; - dataOffset++; - return cat; - } + private Category getBufferElement() { + Category cat = buffer.get(bufferOffset); + bufferOffset++; + dataOffset++; + return cat; + } - private boolean fillBuffer() { - - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - final String sql = "SELECT c FROM Category c"; - List returnValues = - session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Category.class) - .setFirstResult(dataOffset) - .setMaxResults(maxBufferSize) - .setFetchSize(maxBufferSize) - .list(); - session.getTransaction().commit(); - - // clear the old buffer and all variables regarding the state of the buffer - buffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; - - Category apiCategory; - for(org.dkpro.jwpl.api.hibernate.Category o : returnValues){ - if(o==null) { - return false; - } else { - long id = o.getId(); - try { - apiCategory= new Category(this.wiki, id); - buffer.add(apiCategory); - } catch (WikiApiException e) { - logger.error("Page with hibernateID {} not found.", id, e); - } - } - } - if (buffer.size() > 0) { - bufferFillSize = buffer.size(); - return true; - } - else { - return false; - } + private boolean fillBuffer() { + + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + final String sql = "SELECT c FROM Category c"; + List returnValues = + session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Category.class) + .setFirstResult(dataOffset) + .setMaxResults(maxBufferSize) + 
.setFetchSize(maxBufferSize) + .list(); + session.getTransaction().commit(); + + // clear the old buffer and all variables regarding the state of the buffer + buffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; + + Category apiCategory; + for (org.dkpro.jwpl.api.hibernate.Category o : returnValues) { + if (o == null) { + return false; + } else { + long id = o.getId(); + try { + apiCategory = new Category(this.wiki, id); + buffer.add(apiCategory); + } catch (WikiApiException e) { + logger.error("Page with hibernateID {} not found.", id, e); + } } - + } + if (buffer.size() > 0) { + bufferFillSize = buffer.size(); + return true; + } else { + return false; + } } - -// private Wikipedia wiki; -// private int iterPosition; -// -// public CategoryIterator(Wikipedia wiki) { -// this.wiki = wiki; -// this.iterPosition = 0; -// } -// -// public boolean hasNext() { -// Session session = this.wiki.__getHibernateSession(); -// session.beginTransaction(); -// Object returnValue = session.createCriteria(org.tud.ukp.wikipedia.api.hibernate.Category.class) -// .setFirstResult(iterPosition) -// .setMaxResults(1) -// .uniqueResult(); -// session.getTransaction().commit(); -// -// if (returnValue == null) { -// return false; -// } -// else { -// return true; -// } -// -// } -// -// public Category next() { -// Session session = this.wiki.__getHibernateSession(); -// session.beginTransaction(); -// Object returnValue = session.createCriteria(org.tud.ukp.wikipedia.api.hibernate.Category.class) -// .setFirstResult(iterPosition) -// .setMaxResults(1) -// .uniqueResult(); -// session.getTransaction().commit(); -// -// Category apiCategory; -// -// if (returnValue == null) { -// return null; -// } -// else { -// org.tud.ukp.wikipedia.api.hibernate.Category hibernateCategory = (org.tud.ukp.wikipedia.api.hibernate.Category) returnValue; -// long id = hibernateCategory.getId(); -// try { -// apiCategory = new org.tud.ukp.wikipedia.api.Category(this.wiki, id); -// iterPosition++; -// 
return apiCategory; -// } catch (WikiPageNotFoundException e) { -// logger.error("Category with hibernateID " + id + " not found."); -// e.printStackTrace(); -// } -// } -// return null; -// -// } -// -// public void remove() { -// throw new UnsupportedOperationException(); -// } + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryTitleComparator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryTitleComparator.java index 3bcea612..6f2adf37 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryTitleComparator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryTitleComparator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,19 +22,19 @@ import org.dkpro.jwpl.api.exception.WikiTitleParsingException; /** - * Compares two categories based on the lexicographic ordering of their titles. - * + * Compares two {@link Category categories} based on the lexicographic ordering of their titles. 
*/ public class CategoryTitleComparator implements Comparator { - public int compare(Category o1, Category o2) { + @Override + public int compare(Category o1, Category o2) { - int retVal = 0; - try { - retVal = o1.getTitle().getPlainTitle().compareTo(o2.getTitle().getPlainTitle()); - } catch (WikiTitleParsingException e) { - e.printStackTrace(); - } - return retVal; + int retVal = 0; + try { + retVal = o1.getTitle().getPlainTitle().compareTo(o2.getTitle().getPlainTitle()); + } catch (WikiTitleParsingException e) { + e.printStackTrace(); } + return retVal; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CycleHandler.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CycleHandler.java index 3785ca53..a60eff34 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CycleHandler.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CycleHandler.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,282 +22,106 @@ import java.util.Map; import java.util.Set; -import org.jgrapht.graph.DefaultEdge; - import org.dkpro.jwpl.api.exception.WikiApiException; +import org.jgrapht.graph.DefaultEdge; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Methods for handling cycles in the category graph. 
- * */ public class CycleHandler { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - final Wikipedia wiki; - final CategoryGraph categoryGraph; + final Wikipedia wiki; + final CategoryGraph categoryGraph; - private enum Color {white, grey, black} + private enum Color {white, grey, black} private Map colorMap; - /** - * Creates a cycle handler object. - * @param wiki The {@link Wikipedia} object to use. - * @param categoryGraph The category graph in which cycles should be handled. - */ - public CycleHandler(Wikipedia wiki, CategoryGraph categoryGraph) { - this.wiki = wiki; - this.categoryGraph = categoryGraph; + /** + * Creates a cycle handler object. + * + * @param wiki The {@link Wikipedia} object to use. + * @param categoryGraph The category graph in which cycles should be handled. + */ + public CycleHandler(Wikipedia wiki, CategoryGraph categoryGraph) { + this.wiki = wiki; + this.categoryGraph = categoryGraph; + } + + /** + * The JGraphT cycle detection seems not to find all cycles. Thus, I wrote my own cycle detection. + * It is a colored DFS and should find all (viscious :) cycles. + * + * @return True, if the graph contains a cycle. + * @throws WikiApiException Thrown if errors occurred. + */ + public boolean containsCycle() throws WikiApiException { + DefaultEdge edge = findCycle(); + if (edge != null) { + Category sourceCat = wiki.getCategory(categoryGraph.getGraph().getEdgeSource(edge)); + Category targetCat = wiki.getCategory(categoryGraph.getGraph().getEdgeTarget(edge)); + + logger.info("Cycle: " + sourceCat.getTitle() + " - " + targetCat.getTitle()); + return true; + } else { + return false; } - - -// /** -// * If there are cylces in the graph, they are resolved. -// * The JGraphT cycle detector seems not to detect direct cycles (n1 -> n2 -> n1) - so we also call breakDirectCycles. 
-// * For each node in a cycle, we determine the minimum path length to the root. -// * We remove the edge that runs from this deepest node to any other node in the cycle. -// * If all nodes are of equal depth, we choose an arbitrary node an remove all outgoing edges to nodes in the cycle. -// * @throws WikiApiException -// */ -// @Deprecated -// private void breakCycles() throws WikiApiException { -// logger.info("Breaking cycles."); -// -// // get root node -// Category root = wiki.getMetaData().getMainCategory(); -// int rootID = root.getPageId(); -// -// CycleDetector cycleDetector = new CycleDetector(categoryGraph.getGraph()); -// -// Map pathLengthToRoot = new HashMap(); -// -// while (hasCycles()) { -// Set cycleNodes = cycleDetector.findCycles(); -// logger.info("Number of nodes in cycles: " + cycleNodes.size()); -// Iterator it = cycleNodes.iterator(); -// int currentNode = it.next(); -// Set currentCycleNodes = cycleDetector.findCyclesContainingVertex(currentNode); -// // find the node with the highest minimum path length to the root ( this is the deepest node of the cycle) -// int maxLength = -1; -// int maxNode = -1; -// for (int cycleNode : currentCycleNodes) { -// // get the path length -// Category cat = wiki.__getCategory(cycleNode); -// int pathLength = -1; -// if (!pathLengthToRoot.containsKey(cycleNode)) { -//// pathLength = categoryGraph.getPathLengthInNodes(rootID, cat.getPageId()); -// pathLengthToRoot.put(cycleNode, pathLength); -// } -// else { -// pathLength = pathLengthToRoot.get(cycleNode); -// } -// -// // set the maximum -// if (pathLength >= maxLength) { -// maxNode = cycleNode; -// maxLength = pathLength; -// } -// } -// -// // maxCat is the deepest category of a cycle -// // get all outlinks and remove all edges from the graph that point to nodes in the cycle -// Set outgoingEdges = categoryGraph.getGraph().outgoingEdgesOf(maxNode); -// Set edgesToRemove = new HashSet(); -// for (DefaultEdge edge : outgoingEdges) { -// if 
((categoryGraph.getGraph().getEdgeSource(edge) == maxNode) && (currentCycleNodes.contains(categoryGraph.getGraph().getEdgeTarget(edge)))) { -// edgesToRemove.add(edge); -// -////// I removed this, because it is must always be possible to reconstruct the real data from the category and article object. -////// A category graph is an abstraction from that. It may contain fewer nodes. It may have cycles broken etc. -////// TODO Some algorithms work on the category structure itself instead of the graph. Maybe that should be changed. -//// Category cat = wiki.getCategory(maxNode); -//// Set outlinks = cat.getOutLinks(); -//// outlinks.remove(categoryGraph.getGraph().getEdgeTarget(edge)); -//// cat.setOutLinks(outlinks); -// } -// } -// categoryGraph.getGraph().removeAllEdges(edgesToRemove); -// } -// breakDirectCycles(); -// } - -// /** -// * The JGraphT cycle detector seems not to detect direct cycles (n1 -> n2 -> n1). -// * Break such direct cycles. -// */ -// private void breakDirectCycles() { -// Set nodes = categoryGraph.getGraph().vertexSet(); -// Set edgesToRemove = new HashSet(); -// for (int node : nodes) { -// Set outgoingEdges = categoryGraph.getGraph().outgoingEdgesOf(node); -// for (DefaultEdge edge : outgoingEdges) { -// int outNode = categoryGraph.getGraph().getEdgeTarget(edge); -// if (outNode == node) { -// logger.error("Graph contains self edge."); -// } -// Set outgoingEdges2 = categoryGraph.getGraph().outgoingEdgesOf(outNode); -// for (DefaultEdge edge2 : outgoingEdges2) { -// if (categoryGraph.getGraph().getEdgeTarget(edge2) == node) { -// logger.error("Direct cycle found."); -// edgesToRemove.add(edge2); -// } -// } -// } -// } -// categoryGraph.getGraph().removeAllEdges(edgesToRemove); -// } - - -// /** -// * Detects whether there is a cyle in the graph. 
-// */ -// @Deprecated -// protected boolean hasCycles() { -// CycleDetector cycleDetector = new CycleDetector(categoryGraph.getGraph()); -// return cycleDetector.detectCycles(); -// } - - -//// accessed the JGraphT cycle detector, that I have found to be buggy. -// private void showCycles() throws WikiApiException { -// CycleDetector cycleDetector = new CycleDetector(categoryGraph.getGraph()); -// Set cycleNodes = cycleDetector.findCycles(); -// -// while (!cycleNodes.isEmpty()) { -// Iterator it = cycleNodes.iterator(); -// int currentNode = it.next(); -// Set currentCycleNodes = cycleDetector.findCyclesContainingVertex(currentNode); -// -// for (int cycleNode : currentCycleNodes) { -// Category cat = wiki.__getCategory(cycleNode); -// logger.info(cat.getTitle()); -// } -// logger.info(""); -// -// // remove these nodes from the set of cycle nodes -// cycleNodes.removeAll(currentCycleNodes); -// } -// } - - -// /** -// * The JGraphT cycle detection seems not to find all cycles. Thus, I wrote my own cycle detection. -// * It is a colored DFS and should find all (viscious :) cycles. 
-// * @return -// */ -// public boolean containsCycle() { -// colorMap = new HashMap(); -// // initialize all nodes with white -// for (int node : categoryGraph.getGraph().vertexSet()) { -// colorMap.put(node, Color.white); -// } -// -// for (int node : categoryGraph.getGraph().vertexSet()) { -// if (colorMap.get(node).equals(Color.white)) { -// if (visit(node)) { -// return true; -// } -// } -// } -// return false; -// } -// -// private boolean visit(int node) { -// colorMap.put(node, Color.grey); -// Set outgoingEdges = categoryGraph.getGraph().outgoingEdgesOf(node); -// for (DefaultEdge edge : outgoingEdges) { -// int outNode = categoryGraph.getGraph().getEdgeTarget(edge); -// if (colorMap.get(outNode).equals(Color.grey)) { -//// Category sourceCat = wiki.getCategory(node); -//// Category targetCat = wiki.getCategory(outNode); -//// -//// logger.info(sourceCat.getName() + " - " + targetCat.getName()); -// -// return true; -// } -// else if (colorMap.get(outNode).equals(Color.white)) { -// if (visit(outNode)) { -// return true; -// } -// } -// } -// colorMap.put(node, Color.black); -// return false; -// } - - /** - * The JGraphT cycle detection seems not to find all cycles. Thus, I wrote my own cycle detection. - * It is a colored DFS and should find all (viscious :) cycles. - * @return True, if the graph contains a cycle. - * @throws WikiApiException Thrown if errors occurred. - */ - public boolean containsCycle() throws WikiApiException { - DefaultEdge edge = findCycle(); - if (edge != null) { - Category sourceCat = wiki.getCategory(categoryGraph.getGraph().getEdgeSource(edge)); - Category targetCat = wiki.getCategory(categoryGraph.getGraph().getEdgeTarget(edge)); - - logger.info("Cycle: " + sourceCat.getTitle() + " - " + targetCat.getTitle()); - return true; - } - else { - return false; - } + } + + /** + * Removes cycles from the graph that was used to construct the cycle handler. + * + * @throws WikiApiException Thrown if errors occurred. 
+ */ + public void removeCycles() throws WikiApiException { + DefaultEdge edge; + while ((edge = findCycle()) != null) { + Category sourceCat = wiki.getCategory(categoryGraph.getGraph().getEdgeSource(edge)); + Category targetCat = wiki.getCategory(categoryGraph.getGraph().getEdgeTarget(edge)); + + logger.info("Removing cycle: " + sourceCat.getTitle() + " - " + targetCat.getTitle()); + + categoryGraph.getGraph().removeEdge(edge); } + } - /** - * Removes cycles from the graph that was used to construct the cycle handler. - * @throws WikiApiException Thrown if errors occurred. - */ - public void removeCycles() throws WikiApiException { - DefaultEdge edge; - while ((edge = findCycle()) != null) { - Category sourceCat = wiki.getCategory(categoryGraph.getGraph().getEdgeSource(edge)); - Category targetCat = wiki.getCategory(categoryGraph.getGraph().getEdgeTarget(edge)); - - logger.info("Removing cycle: " + sourceCat.getTitle() + " - " + targetCat.getTitle()); - - categoryGraph.getGraph().removeEdge(edge); - } + private DefaultEdge findCycle() { + colorMap = new HashMap<>(); + // initialize all nodes with white + for (int node : categoryGraph.getGraph().vertexSet()) { + colorMap.put(node, Color.white); } - private DefaultEdge findCycle() { - colorMap = new HashMap<>(); - // initialize all nodes with white - for (int node : categoryGraph.getGraph().vertexSet()) { - colorMap.put(node, Color.white); + for (int node : categoryGraph.getGraph().vertexSet()) { + if (colorMap.get(node).equals(Color.white)) { + DefaultEdge e = visit(node); + if (e != null) { + return e; } - - for (int node : categoryGraph.getGraph().vertexSet()) { - if (colorMap.get(node).equals(Color.white)) { - DefaultEdge e = visit(node); - if (e != null) { - return e; - } - } - } - return null; + } } - - private DefaultEdge visit(int node) { - colorMap.put(node, Color.grey); - Set outgoingEdges = categoryGraph.getGraph().outgoingEdgesOf(node); - for (DefaultEdge edge : outgoingEdges) { - int outNode = 
categoryGraph.getGraph().getEdgeTarget(edge); - if (colorMap.get(outNode).equals(Color.grey)) { - return edge; - } - else if (colorMap.get(outNode).equals(Color.white)) { - DefaultEdge e = visit(outNode); - if (e != null) { - return e; - } - } + return null; + } + + private DefaultEdge visit(int node) { + colorMap.put(node, Color.grey); + Set outgoingEdges = categoryGraph.getGraph().outgoingEdgesOf(node); + for (DefaultEdge edge : outgoingEdges) { + int outNode = categoryGraph.getGraph().getEdgeTarget(edge); + if (colorMap.get(outNode).equals(Color.grey)) { + return edge; + } else if (colorMap.get(outNode).equals(Color.white)) { + DefaultEdge e = visit(outNode); + if (e != null) { + return e; } - colorMap.put(node, Color.black); - return null; + } } + colorMap.put(node, Color.black); + return null; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/DatabaseConfiguration.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/DatabaseConfiguration.java index 9c73f63e..2e7d01f8 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/DatabaseConfiguration.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/DatabaseConfiguration.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -18,152 +18,166 @@ package org.dkpro.jwpl.api; /** - * An instance of {@link DatabaseConfiguration} is used to establish a database connection and set various parameters. + * A {@link DatabaseConfiguration} is used to establish a database connection and set various parameters. */ public class DatabaseConfiguration { - private String host; - private String database; - private String user; - private String password; - private WikiConstants.Language language; - private String jdbcURL; - private String databaseDriver; - - public DatabaseConfiguration() {} - - /** - * A constructor for MySQL backends, i.e. the default production setting. - * - * @param host The hostname the machine the database is hosted on. - * @param database The name of the database to connect to. - * @param user The username as part of the credentials used for authentication. - * @param password The password as part of the credentials used for authentication. - * @param language The {@link WikiConstants.Language} used for the underlying connection. - */ - public DatabaseConfiguration(String host, String database, String user, String password, WikiConstants.Language language) { - - this("com.mysql.jdbc.Driver", "jdbc:mysql://" + host + "/" + database, - host, database, user, password, language); - } + private String host; + private String database; + private String user; + private String password; + private WikiConstants.Language language; + private String jdbcURL; + private String databaseDriver; - /** - * A constructor for an explicit DBMS specific configuration. - * - * @param databaseDriver The fully qualified name of the JDBC driver. - * @param jdbcURL A valid JDBC url used to open connections. - * @param host The hostname the machine the database is hosted on. - * @param database The name of the database to connect to. - * @param user The username as part of the credentials used for authentication. - * @param password The password as part of the credentials used for authentication. 
- * @param language The {@link WikiConstants.Language} used for the underlying connection. - */ - public DatabaseConfiguration(String databaseDriver, String jdbcURL, String host, String database, String user, - String password, WikiConstants.Language language) { - this.host = host; - this.database = database; - this.user = user; - this.password = password; - this.language = language; - - this.setDatabaseDriver(databaseDriver); - this.setJdbcURL(jdbcURL); - } + public DatabaseConfiguration() { + } - /** - * @return {@code True} if collation is supported by the database backend, else {@code false}. - */ - boolean supportsCollation() { - if(databaseDriver!=null) { - return databaseDriver.contains("mysql") || databaseDriver.contains("mariadb"); - } else { - return false; - } - } + /** + * A constructor for MySQL backends, i.e. the default production setting. + * + * @param host The hostname the machine the database is hosted on. + * @param database The name of the database to connect to. + * @param user The username as part of the credentials used for authentication. + * @param password The password as part of the credentials used for authentication. + * @param language The {@link WikiConstants.Language} used for the underlying connection. + */ + public DatabaseConfiguration(String host, String database, String user, String password, WikiConstants.Language language) { - /** - * @param database The name of the database. - */ - public void setDatabase(String database) { - this.database = database; - } - /** - * @param host The host where the database is running. Set to "localhost", if the database is running locally. - */ - public void setHost(String host) { - this.host = host; - } - /** - * @param password The password to access the database. - */ - public void setPassword(String password) { - this.password = password; - } - /** - * @param user The database user. 
- */ - public void setUser(String user) { - this.user = user; - } - /** - * @param language The language of the Wikipedia data. - */ - public void setLanguage(WikiConstants.Language language) { - this.language = language; - } - /** - * @return The name of the database. - */ - public String getDatabase() { - return database; - } - /** - * @return The host where the database is running. - */ - public String getHost() { - return host; - } - /** - * @return The password to access the database. - */ - public String getPassword() { - return password; - } - /** - * @return The database user. - */ - public String getUser() { - return user; - } - /** - * @return The language of the Wikipedia data. - */ - public WikiConstants.Language getLanguage() { - return language; + this("com.mysql.jdbc.Driver", "jdbc:mysql://" + host + "/" + database, + host, database, user, password, language); + } + + /** + * A constructor for an explicit DBMS specific configuration. + * + * @param databaseDriver The fully qualified name of the JDBC driver. + * @param jdbcURL A valid JDBC url used to open connections. + * @param host The hostname the machine the database is hosted on. + * @param database The name of the database to connect to. + * @param user The username as part of the credentials used for authentication. + * @param password The password as part of the credentials used for authentication. + * @param language The {@link WikiConstants.Language} used for the underlying connection. + */ + public DatabaseConfiguration(String databaseDriver, String jdbcURL, String host, String database, String user, + String password, WikiConstants.Language language) { + this.host = host; + this.database = database; + this.user = user; + this.password = password; + this.language = language; + + this.setDatabaseDriver(databaseDriver); + this.setJdbcURL(jdbcURL); + } + + /** + * @return {@code True} if collation is supported by the database backend, else {@code false}. 
+ */ + boolean supportsCollation() { + if (databaseDriver != null) { + return databaseDriver.contains("mysql") || databaseDriver.contains("mariadb"); + } else { + return false; } - /** - * @param databaseDriver the databaseDriver to set - */ - public void setDatabaseDriver(String databaseDriver) { - this.databaseDriver = databaseDriver; - } - /** - * @return the databaseDriver - */ - public String getDatabaseDriver() { - return databaseDriver; - } - /** - * @param jdbcURL the jdbcURL to set - */ - public void setJdbcURL(String jdbcURL) { - this.jdbcURL = jdbcURL; - } - /** - * @return the jdbcURL - */ - public String getJdbcURL() { - return jdbcURL; - } + } + + /** + * @param database The name of the database. + */ + public void setDatabase(String database) { + this.database = database; + } + + /** + * @param host The host where the database is running. Set to "localhost", if the database is running locally. + */ + public void setHost(String host) { + this.host = host; + } + + /** + * @param password The password to access the database. + */ + public void setPassword(String password) { + this.password = password; + } + + /** + * @param user The database user. + */ + public void setUser(String user) { + this.user = user; + } + + /** + * @param language The language of the Wikipedia data. + */ + public void setLanguage(WikiConstants.Language language) { + this.language = language; + } + + /** + * @return The name of the database. + */ + public String getDatabase() { + return database; + } + + /** + * @return The host where the database is running. + */ + public String getHost() { + return host; + } + + /** + * @return The password to access the database. + */ + public String getPassword() { + return password; + } + + /** + * @return The database user. + */ + public String getUser() { + return user; + } + + /** + * @return The language of the Wikipedia data. 
+ */ + public WikiConstants.Language getLanguage() { + return language; + } + + /** + * @param databaseDriver the databaseDriver to set + */ + public void setDatabaseDriver(String databaseDriver) { + this.databaseDriver = databaseDriver; + } + + /** + * @return the databaseDriver + */ + public String getDatabaseDriver() { + return databaseDriver; + } + + /** + * @param jdbcURL the jdbcURL to set + */ + public void setJdbcURL(String jdbcURL) { + this.jdbcURL = jdbcURL; + } + + /** + * @return the jdbcURL + */ + public String getJdbcURL() { + return jdbcURL; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/MetaData.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/MetaData.java index 6610f9f6..0d4429a5 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/MetaData.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/MetaData.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,149 +17,137 @@ */ package org.dkpro.jwpl.api; +import org.dkpro.jwpl.api.exception.WikiApiException; import org.hibernate.LockMode; import org.hibernate.Session; -import org.dkpro.jwpl.api.exception.WikiApiException; - /** - * Provides access to meta data about a certain instance of Wikipedia. 
+ * Provides access to meta-data about a certain {@link Wikipedia} instance. */ public class MetaData implements WikiConstants { - // private MetaDataDAO metaDAO; - private final org.dkpro.jwpl.api.hibernate.MetaData hibernateMetaData; - private final Wikipedia wiki; - /** - * Creates a meta data object. - */ - protected MetaData(Wikipedia wiki) - { - this.wiki = wiki; - // this.metaDAO = new MetaDataDAO(wiki); - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - hibernateMetaData = session.createQuery("from MetaData", - org.dkpro.jwpl.api.hibernate.MetaData.class).uniqueResult(); - session.getTransaction().commit(); - } + private final Wikipedia wiki; + private final org.dkpro.jwpl.api.hibernate.MetaData hibernateMetaData; /** - * @return The id of the {@link MetaData} object. - */ - /* - * Note well: - * Access is limited to package-private here intentionally, as the database ID is considered framework-internal use. - */ - long getId() - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long id = hibernateMetaData.getId(); - session.getTransaction().commit(); - return id; - } + * Creates a meta data object. + */ + protected MetaData(Wikipedia wiki) { + this.wiki = wiki; + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + hibernateMetaData = session.createQuery("from MetaData", + org.dkpro.jwpl.api.hibernate.MetaData.class).uniqueResult(); + session.getTransaction().commit(); + } - /** - * @return The number of categories in the current Wikipedia. - */ - public long getNumberOfCategories() - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long nrofCategories = hibernateMetaData.getNrofCategories(); - session.getTransaction().commit(); - return nrofCategories; - } + /** + * @return The id of the {@link MetaData} object. 
+ */ + /* + * Note well: + * Access is limited to package-private here intentionally, as the database ID is considered framework-internal use. + */ + long getId() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long id = hibernateMetaData.getId(); + session.getTransaction().commit(); + return id; + } - /** - * @return The number of pages in the current Wikipedia. - */ - public long getNumberOfPages() - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long nrofPages = hibernateMetaData.getNrofPages(); - session.getTransaction().commit(); - return nrofPages; - } + /** + * @return The number of categories in the current Wikipedia. + */ + public long getNumberOfCategories() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long nrofCategories = hibernateMetaData.getNrofCategories(); + session.getTransaction().commit(); + return nrofCategories; + } - /** - * @return The number of disambiguation pages in the current Wikipedia. - */ - public long getNumberOfDisambiguationPages() - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long nrofDisambPages = hibernateMetaData.getNrofDisambiguationPages(); - session.getTransaction().commit(); - return nrofDisambPages; - } + /** + * @return The number of pages in the current Wikipedia. + */ + public long getNumberOfPages() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long nrofPages = hibernateMetaData.getNrofPages(); + session.getTransaction().commit(); + return nrofPages; + } - /** - * @return The number of redirects in the current Wikipedia. 
- */ - public long getNumberOfRedirectPages() - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long nrofRedirects = hibernateMetaData.getNrofRedirects(); - session.getTransaction().commit(); - return nrofRedirects; - } + /** + * @return The number of disambiguation pages in the current Wikipedia. + */ + public long getNumberOfDisambiguationPages() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long nrofDisambPages = hibernateMetaData.getNrofDisambiguationPages(); + session.getTransaction().commit(); + return nrofDisambPages; + } - /** - * @return The disambiguation {@link Category}. - * @throws WikiApiException Thrown if errors occurred fetching the information. - */ - public Category getDisambiguationCategory() throws WikiApiException - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - String disambCategoryTitle = hibernateMetaData.getDisambiguationCategory(); - session.getTransaction().commit(); - return wiki.getCategory(disambCategoryTitle); - } + /** + * @return The number of redirects in the current Wikipedia. + */ + public long getNumberOfRedirectPages() { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long nrofRedirects = hibernateMetaData.getNrofRedirects(); + session.getTransaction().commit(); + return nrofRedirects; + } + + /** + * @return The disambiguation {@link Category}. + * @throws WikiApiException Thrown if errors occurred fetching the information. 
+ */ + public Category getDisambiguationCategory() throws WikiApiException { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + String disambCategoryTitle = hibernateMetaData.getDisambiguationCategory(); + session.getTransaction().commit(); + return wiki.getCategory(disambCategoryTitle); + } - /** - * @return The name of the main/root {@link Category}. - * @throws WikiApiException Thrown if errors occurred fetching the information. - */ - public Category getMainCategory() throws WikiApiException - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - String mainCategoryTitle = hibernateMetaData.getMainCategory(); - session.getTransaction().commit(); - return wiki.getCategory(mainCategoryTitle); - } + /** + * @return The name of the main/root {@link Category}. + * @throws WikiApiException Thrown if errors occurred fetching the information. + */ + public Category getMainCategory() throws WikiApiException { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + String mainCategoryTitle = hibernateMetaData.getMainCategory(); + session.getTransaction().commit(); + return wiki.getCategory(mainCategoryTitle); + } - /** - * @return The version of the wikipedia data. - * @throws WikiApiException Thrown if errors occurred fetching the information. - */ - public String getVersion() throws WikiApiException - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - String version = hibernateMetaData.getVersion(); - session.getTransaction().commit(); - return version; - } + /** + * @return The version of the wikipedia data. + * @throws WikiApiException Thrown if errors occurred fetching the information. 
+ */ + public String getVersion() throws WikiApiException { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + String version = hibernateMetaData.getVersion(); + session.getTransaction().commit(); + return version; + } - /** - * @return The language of this wikipedia. - */ - public Language getLanguage() - { - return wiki.getLanguage(); - } + /** + * @return The language of this wikipedia. + */ + public Language getLanguage() { + return wiki.getLanguage(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java index 88b142c7..c46be17c 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -38,12 +38,10 @@ /** * Represents a Wikipedia article page. - * - * */ // Adapter class for hiding hibernate session management from the user. 
-public class Page implements WikiConstants -{ +public class Page implements WikiConstants { + private final Wikipedia wiki; private final PageDAO pageDAO; @@ -57,7 +55,6 @@ public class Page implements WikiConstants // Note: The page itself is _not_ a redirect, it is just a page. private boolean isRedirect = false; - /** * Creates a page object. * @@ -67,9 +64,7 @@ public class Page implements WikiConstants * The hibernate id of the page. * @throws WikiApiException Thrown if errors occurred. */ - protected Page(Wikipedia wiki, long id) - throws WikiApiException - { + protected Page(Wikipedia wiki, long id) throws WikiApiException { this.wiki = wiki; this.pageDAO = new PageDAO(wiki); fetchByHibernateId(id); @@ -84,9 +79,7 @@ protected Page(Wikipedia wiki, long id) * The pageID of the page. * @throws WikiApiException Thrown if errors occurred. */ - protected Page(Wikipedia wiki, int pageID) - throws WikiApiException - { + protected Page(Wikipedia wiki, int pageID) throws WikiApiException { this.wiki = wiki; this.pageDAO = new PageDAO(wiki); fetchByPageId(pageID); @@ -101,34 +94,30 @@ protected Page(Wikipedia wiki, int pageID) * The name of the page. * @throws WikiApiException Thrown if errors occurred. */ - public Page(Wikipedia wiki, String pName) - throws WikiApiException - { + public Page(Wikipedia wiki, String pName) throws WikiApiException { this(wiki, pName, false); } - + /** - * Creates a page object. - * - * @param wiki - * The wikipedia object. - * @param pName - * The name of the page. - * @param useExactTitle - * Whether to use the exact title or try to guess the correct wiki-style title. - * @throws WikiApiException Thrown if errors occurred. 
- */ - public Page(Wikipedia wiki, String pName, boolean useExactTitle) - throws WikiApiException - { - if (pName == null || pName.length() == 0) { - throw new WikiPageNotFoundException(); - } - this.wiki = wiki; - this.pageDAO = new PageDAO(wiki); - Title pageTitle = new Title(pName); - fetchByTitle(pageTitle, useExactTitle); - } + * Creates a page object. + * + * @param wiki + * The wikipedia object. + * @param pName + * The name of the page. + * @param useExactTitle + * Whether to use the exact title or try to guess the correct wiki-style title. + * @throws WikiApiException Thrown if errors occurred. + */ + public Page(Wikipedia wiki, String pName, boolean useExactTitle) throws WikiApiException { + if (pName == null || pName.length() == 0) { + throw new WikiPageNotFoundException(); + } + this.wiki = wiki; + this.pageDAO = new PageDAO(wiki); + Title pageTitle = new Title(pName); + fetchByTitle(pageTitle, useExactTitle); + } /** * Creates a Page object from an already retrieved hibernate Page @@ -138,13 +127,10 @@ public Page(Wikipedia wiki, String pName, boolean useExactTitle) * @param id * The hibernate id of the page. * @param hibernatePage - * The {@code api.hibernatePage} that has already been retrieved + * The {@code api.hibernate.Page} that has already been retrieved * @throws WikiApiException Thrown if errors occurred. */ - protected Page(Wikipedia wiki, long id, - org.dkpro.jwpl.api.hibernate.Page hibernatePage) - throws WikiApiException - { + protected Page(Wikipedia wiki, long id, org.dkpro.jwpl.api.hibernate.Page hibernatePage) throws WikiApiException { this.wiki = wiki; this.pageDAO = new PageDAO(wiki); this.hibernatePage = hibernatePage; @@ -154,31 +140,27 @@ protected Page(Wikipedia wiki, long id, * @throws WikiApiException Thrown if errors occurred. 
* @see Page */ - private void fetchByHibernateId(long id) - throws WikiApiException - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - hibernatePage = pageDAO.findById(id); - session.getTransaction().commit(); - - if (hibernatePage == null) { - throw new WikiPageNotFoundException("No page with id " + id + " was found."); - } + private void fetchByHibernateId(long id) throws WikiApiException { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + hibernatePage = pageDAO.findById(id); + session.getTransaction().commit(); + + if (hibernatePage == null) { + throw new WikiPageNotFoundException("No page with id " + id + " was found."); + } } - private void fetchByPageId(int pageID) - throws WikiApiException - { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - hibernatePage = session.createQuery("from Page where pageId = :id", org.dkpro.jwpl.api.hibernate.Page.class) + private void fetchByPageId(int pageID) throws WikiApiException { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + hibernatePage = session.createQuery("from Page where pageId = :id", org.dkpro.jwpl.api.hibernate.Page.class) .setParameter("id", pageID, StandardBasicTypes.INTEGER).uniqueResult(); - session.getTransaction().commit(); + session.getTransaction().commit(); - if (hibernatePage == null) { - throw new WikiPageNotFoundException("No page with page id " + pageID + " was found."); - } + if (hibernatePage == null) { + throw new WikiPageNotFoundException("No page with page id " + pageID + " was found."); + } } /** @@ -187,54 +169,52 @@ private void fetchByPageId(int pageID) * @param pTitle * @throws WikiApiException Thrown if errors occurred. 
*/ - private void fetchByTitle(Title pTitle, boolean useExactTitle) - throws WikiApiException - { + private void fetchByTitle(Title pTitle, boolean useExactTitle) throws WikiApiException { String searchString = pTitle.getPlainTitle(); if (!useExactTitle) { - searchString = pTitle.getWikiStyleTitle(); + searchString = pTitle.getWikiStyleTitle(); } Session session; session = this.wiki.__getHibernateSession(); session.beginTransaction(); - Integer pageId = session.createNativeQuery( - "select pml.pageID from PageMapLine as pml where pml.name = :pagetitle LIMIT 1", Integer.class) - .setParameter("pagetitle", searchString, StandardBasicTypes.STRING).uniqueResult(); + String sql = "select pml.pageID from PageMapLine as pml where pml.name = :pagetitle LIMIT 1"; + Integer pageId = session.createNativeQuery(sql, Integer.class) + .setParameter("pagetitle", searchString, StandardBasicTypes.STRING).uniqueResult(); session.getTransaction().commit(); - if (pageId == null) { + if (pageId == null) { throw new WikiPageNotFoundException("No page with name " + searchString + " was found."); } fetchByPageId(pageId); - if (!this.isRedirect&&searchString != null&&!searchString.equals(getTitle().getRawTitleText())) { - if(this.isRedirect){ - //in case we already tried to re-retrieve the discussion page unsuccessfully, - //we have to give up here or we end up in an infinite loop. - - //reasons for this happening might be several entries in PageMapLine with the same name but different upper/lower case variants - //if the database does not allow case sensitive queries, then the API will always retrieve only the first result and if this is a redirect to a different writing variant, we are stuck in a loop. - //To fix this, either a case sensitive collation should be used or the API should be able to deal with set valued results and pick the correct one from the set. - //For now, we gracefully return without retrieving the Talk page for this article and throw an appropriate excption. 
- throw new WikiPageNotFoundException("No discussion page with name " + searchString + " could be retrieved. This is most likely due to multiple writing variants of the same page in the database"); - }else{ - this.isRedirect = true; - /* - * WORKAROUND - * in our page is a redirect to a discussion page, we might not retrieve the target discussion page as expected but rather the article associated with the target discussion page - * we check this here and re-retrieve the correct page. - * this error should be avoided by keeping the namespace information in the database - * This fix has been provided by Shiri Dori-Hacohen and is discussed in the Google Group under https://groups.google.com/forum/#!topic/jwpl/2nlr55yp87I/discussion - */ - if (searchString.startsWith(DISCUSSION_PREFIX) && !getTitle().getRawTitleText().startsWith(DISCUSSION_PREFIX)) { - try { - fetchByTitle(new Title(DISCUSSION_PREFIX + getTitle().getRawTitleText()), useExactTitle); - } catch (WikiPageNotFoundException e) { - throw new WikiPageNotFoundException("No page with name " + DISCUSSION_PREFIX + getTitle().getRawTitleText() + " was found."); - } - } - } - } + if (!this.isRedirect&&searchString != null&&!searchString.equals(getTitle().getRawTitleText())) { + if(this.isRedirect) { + //in case we already tried to re-retrieve the discussion page unsuccessfully, + //we have to give up here or we end up in an infinite loop. + + //reasons for this happening might be several entries in PageMapLine with the same name but different upper/lower case variants + //if the database does not allow case sensitive queries, then the API will always retrieve only the first result and if this is a redirect to a different writing variant, we are stuck in a loop. + //To fix this, either a case sensitive collation should be used or the API should be able to deal with set valued results and pick the correct one from the set. 
+ //For now, we gracefully return without retrieving the Talk page for this article and throw an appropriate exception. + throw new WikiPageNotFoundException("No discussion page with name " + searchString + " could be retrieved. This is most likely due to multiple writing variants of the same page in the database"); + } else { + this.isRedirect = true; + /* + * WORKAROUND + * if our page is a redirect to a discussion page, we might not retrieve the target discussion page as expected but rather the article associated with the target discussion page + * we check this here and re-retrieve the correct page. + * this error should be avoided by keeping the namespace information in the database + * This fix has been provided by Shiri Dori-Hacohen and is discussed in the Google Group under https://groups.google.com/forum/#!topic/jwpl/2nlr55yp87I/discussion + */ + if (searchString.startsWith(DISCUSSION_PREFIX) && !getTitle().getRawTitleText().startsWith(DISCUSSION_PREFIX)) { + try { + fetchByTitle(new Title(DISCUSSION_PREFIX + getTitle().getRawTitleText()), useExactTitle); + } catch (WikiPageNotFoundException e) { + throw new WikiPageNotFoundException("No page with name " + DISCUSSION_PREFIX + getTitle().getRawTitleText() + " was found."); + } + } + } + } } /** @@ -244,24 +224,21 @@ private void fetchByTitle(Title pTitle, boolean useExactTitle) * Note well: * Access is limited to package-private here intentionally, as the database ID is considered framework-internal use. */ - long __getId() - { + long __getId() { return hibernatePage.getId(); } /** * @return Returns a unique page id. */ - public int getPageId() - { + public int getPageId() { return hibernatePage.getPageId(); } /** - * @return The a set of categories that this page belongs to. + * @return A set of categories that this page belongs to. 
*/ - public Set getCategories() - { + public Set getCategories() { Session session = this.wiki.__getHibernateSession(); session.beginTransaction(); session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); @@ -282,17 +259,15 @@ public Set getCategories() * * @return The number of categories. */ - public int getNumberOfCategories() - { + public int getNumberOfCategories() { int nrOfCategories = 0; long id = __getId(); Session session = wiki.__getHibernateSession(); session.beginTransaction(); String sql = "select count(pages) from page_categories where id = :pageid"; - Long returnValue = session - .createNativeQuery(sql, Long.class) - .setParameter("pageid", id, StandardBasicTypes.LONG).uniqueResult(); + Long returnValue = session.createNativeQuery(sql, Long.class) + .setParameter("pageid", id, StandardBasicTypes.LONG).uniqueResult(); session.getTransaction().commit(); if (returnValue != null) { @@ -302,14 +277,15 @@ public int getNumberOfCategories() } /** - * Returns the set of pages that have a link pointing to this page. Warning: Do not use + * Returns the set of pages that have a link pointing to this page. + *

+ * Warning: Do not use * this for getting the number of inlinks with {@link Page#getInlinks()}.size(). This is too slow. Use * {@link Page#getNumberOfInlinks()} instead. * * @return The set of pages that have a link pointing to this page. */ - public Set getInlinks() - { + public Set getInlinks() { Session session = wiki.__getHibernateSession(); session.beginTransaction(); session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); @@ -325,8 +301,7 @@ public Set getInlinks() catch (WikiApiException e) { // Silently ignore if a page could not be found // There may be inlinks that do not come from an existing page. - continue; - } + } } return pages; @@ -338,8 +313,7 @@ public Set getInlinks() * * @return The number of inlinks. */ - public int getNumberOfInlinks() - { + public int getNumberOfInlinks() { int nrOfInlinks = 0; long id = __getId(); @@ -347,7 +321,7 @@ public int getNumberOfInlinks() session.beginTransaction(); String sql = "select count(pi.inLinks) from page_inlinks as pi where pi.id = :piid"; Long returnValue = session.createNativeQuery(sql, Long.class) - .setParameter("piid", id, StandardBasicTypes.LONG).uniqueResult(); + .setParameter("piid", id, StandardBasicTypes.LONG).uniqueResult(); session.getTransaction().commit(); if (returnValue != null) { @@ -362,18 +336,14 @@ public int getNumberOfInlinks() * * @return Returns the IDs of the inLinks of this page. */ - public Set getInlinkIDs() - { - Set tmpSet = new HashSet<>(); - + public Set getInlinkIDs() { Session session = wiki.__getHibernateSession(); session.beginTransaction(); session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); - tmpSet.addAll(hibernatePage.getInLinks()); + Set tmpSet = new HashSet<>(hibernatePage.getInLinks()); session.getTransaction().commit(); - return tmpSet; } @@ -385,11 +355,10 @@ public Set getInlinkIDs() * * @return The set of pages that are linked from this page. 
*/ - public Set getOutlinks() - { + public Set getOutlinks() { Session session = wiki.__getHibernateSession(); session.beginTransaction(); -// session.lock(hibernatePage, LockMode.NONE); + // session.lock(hibernatePage, LockMode.NONE); session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); // Have to copy links here since getPage later will close the session. Set tmpSet = new UnmodifiableArraySet<>(hibernatePage.getOutLinks()); @@ -414,8 +383,7 @@ public Set getOutlinks() * * @return The number of outlinks. */ - public int getNumberOfOutlinks() - { + public int getNumberOfOutlinks() { int nrOfOutlinks = 0; long id = __getId(); @@ -423,7 +391,7 @@ public int getNumberOfOutlinks() session.beginTransaction(); String sql = "select count(outLinks) from page_outlinks where id = :id"; Long returnValue = session.createNativeQuery(sql, Long.class) - .setParameter("id", id, StandardBasicTypes.LONG).uniqueResult(); + .setParameter("id", id, StandardBasicTypes.LONG).uniqueResult(); session.getTransaction().commit(); if (returnValue != null) { @@ -438,15 +406,13 @@ public int getNumberOfOutlinks() * * @return Returns the IDs of the outLinks of this page. */ - public Set getOutlinkIDs() - { - Set tmpSet = new HashSet<>(); + public Set getOutlinkIDs() { - Session session = wiki.__getHibernateSession(); + Session session = wiki.__getHibernateSession(); session.beginTransaction(); session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); - tmpSet.addAll(hibernatePage.getOutLinks()); + Set tmpSet = new HashSet<>(hibernatePage.getOutLinks()); session.getTransaction().commit(); return tmpSet; @@ -456,22 +422,18 @@ public Set getOutlinkIDs() * @return The title of the page. * @throws WikiTitleParsingException Thrown if errors occurred while parsing. 
*/ - public Title getTitle() - throws WikiTitleParsingException - { + public Title getTitle() throws WikiTitleParsingException { Session session = wiki.__getHibernateSession(); session.beginTransaction(); String name = hibernatePage.getName(); session.getTransaction().commit(); - Title title = new Title(name); - return title; + return new Title(name); } /** * @return The set of strings that are redirects to this page. */ - public Set getRedirects() - { + public Set getRedirects() { Session session = wiki.__getHibernateSession(); session.beginTransaction(); session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); @@ -483,8 +445,7 @@ public Set getRedirects() /** * @return The text of the page with media wiki markup. */ - public String getText() - { + public String getText() { Session session = wiki.__getHibernateSession(); session.beginTransaction(); String text = hibernatePage.getText(); @@ -527,8 +488,7 @@ public String getText() /** * @return {@code True}, if the page is a disambiguation page, {@code false} otherwise. */ - public boolean isDisambiguation() - { + public boolean isDisambiguation() { Session session = wiki.__getHibernateSession(); session.beginTransaction(); boolean isDisambiguation = hibernatePage.getIsDisambiguation(); @@ -539,41 +499,37 @@ public boolean isDisambiguation() /** * @return {@code True}, if the page was returned by querying a redirect string, {@code false} otherwise. */ - public boolean isRedirect() - { + public boolean isRedirect() { return isRedirect; } - /** - * @return {@code True}, if the page is a discussion page. - * @throws WikiTitleParsingException - */ - public boolean isDiscussion() throws WikiTitleParsingException - { - return getTitle().getRawTitleText().startsWith(DISCUSSION_PREFIX); - } - - /** - *

Returns the Wikipedia article as plain text using the SwebleParser with - * a SimpleWikiConfiguration and the PlainTextConverter.
- * If you have different needs regarding the plain text, you can use - * getParsedPage(Visitor v) and provide your own Sweble-Visitor. Examples - * are in the {@code org.dkpro.jwpl.api.sweble} package - * or on http://www.sweble.org - *

- * - *

Alternatively, use {@link Page#getText()} to return the Wikipedia article - * with all Wiki markup. You can then use the old JWPL MediaWiki parser for - * creating a plain text version. The JWPL parser is now located in a - * separate project {@code org.dkpro.jwpl.api.parser}. - * Please refer to the JWPL Google Code project page for further reference.

- * - * @return The plain text of a Wikipedia article - * @throws WikiApiException Thrown if errors occurred. - */ - public String getPlainText() - throws WikiApiException - { + /** + * @return {@code True}, if the page is a discussion page, {@code false} otherwise. + * @throws WikiTitleParsingException Thrown if errors occurred. + */ + public boolean isDiscussion() throws WikiTitleParsingException { + return getTitle().getRawTitleText().startsWith(DISCUSSION_PREFIX); + } + + /** + *

Returns the Wikipedia article as plain text using the SwebleParser with + * a SimpleWikiConfiguration and the PlainTextConverter.
+ * If you have different needs regarding the plain text, you can use + * getParsedPage(Visitor v) and provide your own Sweble-Visitor. Examples + * are in the {@code org.dkpro.jwpl.api.sweble} package + * or on http://www.sweble.org + *

+ * + *

Alternatively, use {@link Page#getText()} to return the Wikipedia article + * with all Wiki markup. You can then use the old JWPL MediaWiki parser for + * creating a plain text version. The JWPL parser is now located in a + * separate project {@code org.dkpro.jwpl.api.parser}. + * Please refer to the JWPL Google Code project page for further reference.

+ * + * @return The plain text of a Wikipedia article + * @throws WikiApiException Thrown if errors occurred. + */ + public String getPlainText() throws WikiApiException { //Configure the PlainTextConverter for plain text parsing return (String) parsePage(new PlainTextConverter(this.wiki.getWikConfig(), false, Integer.MAX_VALUE)); } @@ -591,8 +547,7 @@ public String getPlainText() * type of the go() method of your visitor. * @throws WikiApiException Thrown if errors occurred. */ - private Object parsePage(AstVisitor v) throws WikiApiException - { + private Object parsePage(AstVisitor v) throws WikiApiException { // Use the provided visitor to parse the page return v.go(getCompiledPage().getPage()); } @@ -603,8 +558,7 @@ private Object parsePage(AstVisitor v) throws WikiApiException * @return the parsed page * @throws WikiApiException Thrown if errors occurred. */ - private EngProcessedPage getCompiledPage() throws WikiApiException - { + private EngProcessedPage getCompiledPage() throws WikiApiException { EngProcessedPage cp; try{ WtEngineImpl engine = new WtEngineImpl(this.wiki.getWikConfig()); @@ -634,9 +588,7 @@ private EngProcessedPage getCompiledPage() throws WikiApiException * @return A string with infos about this page object. * @throws WikiApiException Thrown if errors occurred. */ - protected String getPageInfo() - throws WikiApiException - { + protected String getPageInfo() throws WikiApiException { StringBuilder sb = new StringBuilder(1000); sb.append("ID : ").append(__getId()).append(LF); diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterable.java index ca3477ef..f0da621d 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterable.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,41 +19,41 @@ import java.util.Iterator; - /** - * An iterable of page objects. - * + * An {@link Iterable} of {@link Page} objects. */ public class PageIterable implements Iterable { - /** The Wikipedia object */ - private final Wikipedia wiki; - - /** Whether only articles are retrieved (or also disambiguation pages) */ - private final boolean onlyArticles; - - /** - * The size of the page buffer. - * With bufferSize = 1, a database connection is needed for retrieving a single article. - * Higher bufferSize gives better performance, but needs memory. - * Initialize it with 500. - */ - private int bufferSize = 500; - - public PageIterable(Wikipedia wiki, boolean onlyArticles) { - this.wiki = wiki; - this.onlyArticles = onlyArticles; - } - - protected PageIterable(Wikipedia wiki, boolean onlyArticles, int bufferSize) { - this.wiki = wiki; - this.onlyArticles = onlyArticles; - this.bufferSize = bufferSize; - } - - public Iterator iterator() { - return new PageIterator(wiki, onlyArticles, bufferSize); - } + private final Wikipedia wiki; + + /* + * Whether only articles are retrieved (or also disambiguation pages) + */ + private final boolean onlyArticles; + + /* + * The size of the page buffer. + * With bufferSize = 1, a database connection is needed for retrieving a single article. 
+ * Higher bufferSize gives better performance, but needs memory. + * Initialize it with 500. + */ + private int bufferSize = 500; + + public PageIterable(Wikipedia wiki, boolean onlyArticles) { + this.wiki = wiki; + this.onlyArticles = onlyArticles; + } + + protected PageIterable(Wikipedia wiki, boolean onlyArticles, int bufferSize) { + this.wiki = wiki; + this.onlyArticles = onlyArticles; + this.bufferSize = bufferSize; + } + + @Override + public Iterator iterator() { + return new PageIterator(wiki, onlyArticles, bufferSize); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java index 0ef89fd8..1f985978 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,237 +25,221 @@ import java.util.Set; import jakarta.persistence.TypedQuery; -import org.hibernate.Session; import org.dkpro.jwpl.api.exception.WikiApiException; +import org.hibernate.Session; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * An iterator over {@link Page} objects. + * An {@link Iterator} over {@link Page} objects. 
*/ public class PageIterator implements Iterator { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final PageBuffer buffer; - - public PageIterator(Wikipedia wiki, Set ids, Set titles, int bufferSize) { - buffer = new PageBuffer(bufferSize, wiki, ids, titles); - } - - public PageIterator(Wikipedia wiki, boolean onlyArticles, int bufferSize) { - buffer = new PageBuffer(bufferSize, wiki, onlyArticles); - } - - @Override - public boolean hasNext(){ - return buffer.hasNext(); - } - - @Override - public Page next(){ - return buffer.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - /** - * Buffers {@link Page pages} in a list. - */ - class PageBuffer{ - - private final Wikipedia wiki; - private final boolean onlyArticles; - - private final List buffer; - private final int maxBufferSize; // the number of pages to be buffered after a query to the database. - private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements - private int bufferOffset; // the offset in the buffer - private long lastPage;// the overall offset in the data - - private List pageIds = new LinkedList<>(); // a set of ids, if a specific list of articles is supposed to be read - private List pageTitles = new LinkedList<>(); // a set of titles, if a specific list of articles is supposed to be read - final boolean loadFromList; - - public PageBuffer(int bufferSize, Wikipedia wiki, boolean onlyArticles){ - this.maxBufferSize = bufferSize; - this.wiki = wiki; - this.onlyArticles = onlyArticles; - this.buffer = new ArrayList<>(); - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.lastPage = 0; - this.loadFromList=false; - //TODO test whether this works when zero pages are retrieved - } - - public PageBuffer(int bufferSize, Wikipedia wiki, Set ids, Set titles){ - this.maxBufferSize = bufferSize; - this.wiki = wiki; - this.buffer = new ArrayList<>(); - 
this.onlyArticles = false; - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.lastPage = 0; - this.pageIds= new LinkedList<>(ids); - this.pageTitles= new LinkedList<>(titles); - this.loadFromList=true; - } - - /** - * If there are elements in the buffer left, then return true. - * If the end of the filled buffer is reached, then try to load new buffer. - * @return True, if there are pages left. False otherwise. - */ - public boolean hasNext(){ - if (bufferOffset < bufferFillSize) { - return true; - } - else { - return this.fillBuffer(); - } - } - - /** - * - * @return The next {@link Page} or {@code null} if no more pages are available. - */ - public Page next(){ - // if there are still elements in the buffer, just retrieve the next one - if (bufferOffset < bufferFillSize) { - return this.getBufferElement(); - } - // if there are no more elements => try to fill a new buffer - else if (this.fillBuffer()) { - return this.getBufferElement(); - } - else { - // if it cannot be filled => return null - return null; - } - } - - private Page getBufferElement() { - Page page = buffer.get(bufferOffset); - bufferOffset++; - return page; - } - -// private void showBuffer() { -// for (Page p : buffer) { -// try { -// logger.info(p.getTitle().getPlainTitle()); -// } catch (WikiTitleParsingException e) { -// e.printStackTrace(); -// } -// } -// } - - private boolean fillBuffer() { - - //decide whether to load from list or retrieve all available articles - if (loadFromList){ - // clear the old buffer and all variables regarding the state of the buffer - buffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; - - //load pages - if(pageIds.isEmpty()&&pageTitles.isEmpty()){ - return false; - } - - while(bufferFillSize<=maxBufferSize&&!pageIds.isEmpty()){ - String id = pageIds.remove(0); - if(id!=null&&!id.isEmpty()){ - try { - buffer.add(wiki.getPage(Integer.parseInt(id))); - bufferFillSize++; - } catch (WikiApiException e){ - logger.warn("Missing article with id "+id); - } 
- } - } - while(bufferFillSize<=maxBufferSize&&!pageTitles.isEmpty()){ - String title = pageTitles.remove(0); - if(title!=null&&!title.isEmpty()){ - try{ - buffer.add(wiki.getPage(title)); - bufferFillSize++; - }catch(WikiApiException e){ - logger.warn("Missing article with title \""+title+"\""); - } - } - } - - if (buffer.size() > 0) { - bufferFillSize = buffer.size(); - return true; - } - else { - return false; - } - } else{ - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - List returnValues; - TypedQuery query; - String sql; - if (onlyArticles) { - sql = "SELECT p FROM Page p WHERE p.isDisambiguation = :isDisambiguation AND p.id > :pageId"; - query = session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Page.class); - query.setParameter("isDisambiguation", false); - query.setParameter("pageId", lastPage); - } - else { - sql = "SELECT p FROM Page p WHERE p.id > :pageId"; - query = session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Page.class); - query.setParameter("pageId", lastPage); - } - query.setMaxResults(maxBufferSize); - returnValues = query.getResultList(); - session.getTransaction().commit(); - - // clear the old buffer and all variables regarding the state of the buffer - buffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; - - Page apiPage; - for(org.dkpro.jwpl.api.hibernate.Page o : returnValues){ - if(o==null) { - return false; - } else { - long id = o.getId(); - try { - apiPage = new Page(this.wiki, id, o); - if (this.onlyArticles) { - if (!apiPage.isRedirect()) { - buffer.add(apiPage); - } - } - else { - buffer.add(apiPage); - } - } catch (WikiApiException e) { - logger.error("Page with hibernateID " + id + " not found."); - } - lastPage = id; - } - } - if (buffer.size() > 0) { - bufferFillSize = buffer.size(); - return true; - } - else { - return false; - } - } - } // fillBuffer - - } + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final 
PageBuffer buffer; + + public PageIterator(Wikipedia wiki, Set ids, Set titles, int bufferSize) { + buffer = new PageBuffer(bufferSize, wiki, ids, titles); + } + + public PageIterator(Wikipedia wiki, boolean onlyArticles, int bufferSize) { + buffer = new PageBuffer(bufferSize, wiki, onlyArticles); + } + + @Override + public boolean hasNext() { + return buffer.hasNext(); + } + + @Override + public Page next() { + return buffer.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * Buffers {@link Page pages} in a list. + */ + static class PageBuffer { + + private final Wikipedia wiki; + private final boolean onlyArticles; + + private final List buffer; + private final int maxBufferSize; // the number of pages to be buffered after a query to the database. + private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements + private int bufferOffset; // the offset in the buffer + private long lastPage;// the overall offset in the data + + private List pageIds = new LinkedList<>(); // a set of ids, if a specific list of articles is supposed to be read + private List pageTitles = new LinkedList<>(); // a set of titles, if a specific list of articles is supposed to be read + final boolean loadFromList; + + public PageBuffer(int bufferSize, Wikipedia wiki, boolean onlyArticles) { + this.maxBufferSize = bufferSize; + this.wiki = wiki; + this.onlyArticles = onlyArticles; + this.buffer = new ArrayList<>(); + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.lastPage = 0; + this.loadFromList = false; + //TODO test whether this works when zero pages are retrieved + } + + public PageBuffer(int bufferSize, Wikipedia wiki, Set ids, Set titles) { + this.maxBufferSize = bufferSize; + this.wiki = wiki; + this.buffer = new ArrayList<>(); + this.onlyArticles = false; + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.lastPage = 0; + this.pageIds = new LinkedList<>(ids); + this.pageTitles = 
new LinkedList<>(titles); + this.loadFromList = true; + } + + /** + * If there are elements in the buffer left, then return true. + * If the end of the filled buffer is reached, then try to load new buffer. + * + * @return True, if there are pages left. False otherwise. + */ + public boolean hasNext() { + if (bufferOffset < bufferFillSize) { + return true; + } else { + return this.fillBuffer(); + } + } + + /** + * @return The next {@link Page} or {@code null} if no more pages are available. + */ + public Page next() { + // if there are still elements in the buffer, just retrieve the next one + if (bufferOffset < bufferFillSize) { + return this.getBufferElement(); + } + // if there are no more elements => try to fill a new buffer + else if (this.fillBuffer()) { + return this.getBufferElement(); + } else { + // if it cannot be filled => return null + return null; + } + } + + private Page getBufferElement() { + Page page = buffer.get(bufferOffset); + bufferOffset++; + return page; + } + + private boolean fillBuffer() { + + //decide whether to load from list or retrieve all available articles + if (loadFromList) { + // clear the old buffer and all variables regarding the state of the buffer + buffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; + + //load pages + if (pageIds.isEmpty() && pageTitles.isEmpty()) { + return false; + } + + while (bufferFillSize <= maxBufferSize && !pageIds.isEmpty()) { + String id = pageIds.remove(0); + if (id != null && !id.isEmpty()) { + try { + buffer.add(wiki.getPage(Integer.parseInt(id))); + bufferFillSize++; + } catch (WikiApiException e) { + logger.warn("Missing article with id " + id); + } + } + } + while (bufferFillSize <= maxBufferSize && !pageTitles.isEmpty()) { + String title = pageTitles.remove(0); + if (title != null && !title.isEmpty()) { + try { + buffer.add(wiki.getPage(title)); + bufferFillSize++; + } catch (WikiApiException e) { + logger.warn("Missing article with title \"" + title + "\""); + } + } + } + + if 
(buffer.size() > 0) { + bufferFillSize = buffer.size(); + return true; + } else { + return false; + } + } else { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + List returnValues; + TypedQuery query; + String sql; + if (onlyArticles) { + sql = "SELECT p FROM Page p WHERE p.isDisambiguation = :isDisambiguation AND p.id > :pageId"; + query = session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Page.class); + query.setParameter("isDisambiguation", false); + query.setParameter("pageId", lastPage); + } else { + sql = "SELECT p FROM Page p WHERE p.id > :pageId"; + query = session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Page.class); + query.setParameter("pageId", lastPage); + } + query.setMaxResults(maxBufferSize); + returnValues = query.getResultList(); + session.getTransaction().commit(); + + // clear the old buffer and all variables regarding the state of the buffer + buffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; + + Page apiPage; + for (org.dkpro.jwpl.api.hibernate.Page o : returnValues) { + if (o == null) { + return false; + } else { + long id = o.getId(); + try { + apiPage = new Page(this.wiki, id, o); + if (this.onlyArticles) { + if (!apiPage.isRedirect()) { + buffer.add(apiPage); + } + } else { + buffer.add(apiPage); + } + } catch (WikiApiException e) { + logger.error("Page with hibernateID " + id + " not found."); + } + lastPage = id; + } + } + if (buffer.size() > 0) { + bufferFillSize = buffer.size(); + return true; + } else { + return false; + } + } + } // fillBuffer + + } } \ No newline at end of file diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQuery.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQuery.java index 9d91505f..22a9bfe0 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQuery.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQuery.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more 
contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,220 +20,282 @@ /** * Represents a query for retrieving pages that meet the given conditions. * Conditions are represented by the fields of a query object. - * */ public class PageQuery implements WikiConstants { - /** Whether only article pages should be retrieved. */ - private boolean onlyArticlePages; - - /** Whether only disambiguation pages should be retrieved. 
*/ - private boolean onlyDisambiguationPages; - - /** A MySql regular expression style titlePattern for the page's title */ - private String titlePattern; - - /** The minimum in-degree of the page */ - private int minIndegree; - /** The maximum out-degree of the page */ - private int maxIndegree; - - /** The minimum out-degree of the page */ - private int minOutdegree; - /** The maximum out-degree of the page */ - private int maxOutdegree; - - /** The minimum number of redirects of the page */ - private int minRedirects; - /** The maximum number of redirects of the page */ - private int maxRedirects; - - /** The minimum number of categories of the page */ - private int minCategories; - /** The maximum number of categories of the page */ - private int maxCategories; - - /** The minimum number of tokens in the page */ - private int minTokens; - /** The minimum number of tokens in the page */ - private int maxTokens; - - public PageQuery() { - onlyDisambiguationPages = false; - - titlePattern = ""; - - minIndegree = 0; - maxIndegree = Integer.MAX_VALUE; - - minOutdegree = 0; - maxOutdegree = Integer.MAX_VALUE; - - minRedirects = 0; - maxRedirects = Integer.MAX_VALUE; - - minCategories = 0; - maxCategories = Integer.MAX_VALUE; - - minTokens = 0; - maxTokens = Integer.MAX_VALUE; - - } - - protected int getMaxCategories() { - return maxCategories; - } - protected int getMaxIndegree() { - return maxIndegree; - } - protected int getMaxOutdegree() { - return maxOutdegree; - } - protected int getMaxRedirects() { - return maxRedirects; - } - protected int getMinCategories() { - return minCategories; - } - protected int getMinIndegree() { - return minIndegree; - } - protected int getMinOutdegree() { - return minOutdegree; - } - protected int getMinRedirects() { - return minRedirects; - } - protected boolean onlyArticlePages() { - return onlyArticlePages; - } - protected boolean onlyDisambiguationPages() { - return onlyDisambiguationPages; - } - protected int getMinTokens() { - 
return minTokens; - } - protected int getMaxTokens() { - return maxTokens; - } - protected String getTitlePattern() { - return titlePattern; - } - - /** - * Sets the minimum number of categories that queried articles should have. - * @param minCategories The minimum number of categories. - */ - public void setMinCategories(int minCategories) { - this.minCategories = minCategories; - } - /** - * Sets the maximum number of categories that queried articles should have. - * @param maxCategories The maximum number of categories. - */ - public void setMaxCategories(int maxCategories) { - this.maxCategories = maxCategories; - } - /** - * Sets the minimum number of ingoing links that queried articles should have. - * @param minIndegree The minimum number of ingoing links. - */ - public void setMinIndegree(int minIndegree) { - this.minIndegree = minIndegree; - } - /** - * Sets the maximum number of ingoing links that queried articles should have. - * @param maxIndegree The maximum number of ingoing links. - */ - public void setMaxIndegree(int maxIndegree) { - this.maxIndegree = maxIndegree; - } - /** - * Sets the minimum number of outgoing links that queried articles should have. - * @param minOutdegree The minimum number of outgoing links. - */ - public void setMinOutdegree(int minOutdegree) { - this.minOutdegree = minOutdegree; - } - /** - * Sets the maximum number of outgoing links that queried articles should have. - * @param maxOutdegree The maximum number of outgoing links. - */ - public void setMaxOutdegree(int maxOutdegree) { - this.maxOutdegree = maxOutdegree; - } - /** - * Sets the minimum number of redirects that queried articles should have. - * @param minRedirects The minimum number of redirects. - */ - public void setMinRedirects(int minRedirects) { - this.minRedirects = minRedirects; - } - /** - * Sets the maximum number of redirects that queried articles should have. - * @param maxRedirects The maximum number of redirects. 
- */ - public void setMaxRedirects(int maxRedirects) { - this.maxRedirects = maxRedirects; - } - /** - * Sets whether only be articles should be retrieved. - * @param onlyArticlePages If set to true, only article pages are returned. - */ - public void setOnlyArticlePages(boolean onlyArticlePages) { - this.onlyArticlePages = onlyArticlePages; - } - /** - * Sets whether only disambiguation pages should be retrieved. - * @param onlyDisambiguationPages If set to true, only disambiguation pages are returned. - */ - public void setOnlyDisambiguationPages(boolean onlyDisambiguationPages) { - this.onlyDisambiguationPages = onlyDisambiguationPages; - } - /** - * Sets the minimum number of tokens that queried articles should have. - * @param minTokens The minimum number of tokens. - */ - public void setMinTokens(int minTokens) { - this.minTokens = minTokens; - } - /** - * Sets the maximum number of tokens that queried articles should have. - * @param maxTokens The maximum number of tokens. - */ - public void setMaxTokens(int maxTokens) { - this.maxTokens = maxTokens; - } - /** - * Sets a regular expression that pages have to match. - * % for any number of arbitrary characters (can only be used at the end of a string) - * _ for a single arbitrary character (can also be used inside a string) - * @param pattern A regular expression pattern. - */ - public void setTitlePattern(String pattern) { - this.titlePattern = pattern; - } - - /** - * @return A string that shows the current values of the query members. 
- */ - public String getQueryInfo() { - StringBuilder sb = new StringBuilder(); - - sb.append("MaxCategories: " + maxCategories + LF); - sb.append("MinCategories: " + minCategories + LF); - sb.append("MaxIndegree: " + maxIndegree + LF); - sb.append("MinIndegree: " + minIndegree + LF); - sb.append("MaxOutdegree: " + maxOutdegree + LF); - sb.append("MinOutdegree: " + minOutdegree + LF); - sb.append("MaxRedirects: " + maxRedirects + LF); - sb.append("MinRedirects: " + minRedirects + LF); - sb.append("MaxTokens: " + maxTokens + LF); - sb.append("MinTokens: " + minTokens + LF); - sb.append("Only article pages: " + onlyArticlePages + LF); - sb.append("Only disambiguation pages: " + onlyDisambiguationPages + LF); - sb.append("Title pattern: " + titlePattern + LF); - - return sb.toString(); - } + /** + * Whether only article pages should be retrieved. + */ + private boolean onlyArticlePages; + + /** + * Whether only disambiguation pages should be retrieved. + */ + private boolean onlyDisambiguationPages; + + /** + * A regular expression style titlePattern for the page's title + */ + private String titlePattern; + + /** + * The minimum in-degree of the page + */ + private int minIndegree; + /** + * The maximum out-degree of the page + */ + private int maxIndegree; + + /** + * The minimum out-degree of the page + */ + private int minOutdegree; + /** + * The maximum out-degree of the page + */ + private int maxOutdegree; + + /** + * The minimum number of redirects of the page + */ + private int minRedirects; + /** + * The maximum number of redirects of the page + */ + private int maxRedirects; + + /** + * The minimum number of categories of the page + */ + private int minCategories; + /** + * The maximum number of categories of the page + */ + private int maxCategories; + + /** + * The minimum number of tokens in the page + */ + private int minTokens; + /** + * The minimum number of tokens in the page + */ + private int maxTokens; + + public PageQuery() { + 
onlyDisambiguationPages = false; + + titlePattern = ""; + + minIndegree = 0; + maxIndegree = Integer.MAX_VALUE; + + minOutdegree = 0; + maxOutdegree = Integer.MAX_VALUE; + + minRedirects = 0; + maxRedirects = Integer.MAX_VALUE; + + minCategories = 0; + maxCategories = Integer.MAX_VALUE; + + minTokens = 0; + maxTokens = Integer.MAX_VALUE; + + } + + protected int getMaxCategories() { + return maxCategories; + } + + protected int getMaxIndegree() { + return maxIndegree; + } + + protected int getMaxOutdegree() { + return maxOutdegree; + } + + protected int getMaxRedirects() { + return maxRedirects; + } + + protected int getMinCategories() { + return minCategories; + } + + protected int getMinIndegree() { + return minIndegree; + } + + protected int getMinOutdegree() { + return minOutdegree; + } + + protected int getMinRedirects() { + return minRedirects; + } + + protected boolean onlyArticlePages() { + return onlyArticlePages; + } + + protected boolean onlyDisambiguationPages() { + return onlyDisambiguationPages; + } + + protected int getMinTokens() { + return minTokens; + } + + protected int getMaxTokens() { + return maxTokens; + } + + protected String getTitlePattern() { + return titlePattern; + } + + /** + * Sets the minimum number of categories that queried articles should have. + * + * @param minCategories The minimum number of categories. + */ + public void setMinCategories(int minCategories) { + this.minCategories = minCategories; + } + + /** + * Sets the maximum number of categories that queried articles should have. + * + * @param maxCategories The maximum number of categories. + */ + public void setMaxCategories(int maxCategories) { + this.maxCategories = maxCategories; + } + + /** + * Sets the minimum number of ingoing links that queried articles should have. + * + * @param minIndegree The minimum number of ingoing links. 
+ */ + public void setMinIndegree(int minIndegree) { + this.minIndegree = minIndegree; + } + + /** + * Sets the maximum number of ingoing links that queried articles should have. + * + * @param maxIndegree The maximum number of ingoing links. + */ + public void setMaxIndegree(int maxIndegree) { + this.maxIndegree = maxIndegree; + } + + /** + * Sets the minimum number of outgoing links that queried articles should have. + * + * @param minOutdegree The minimum number of outgoing links. + */ + public void setMinOutdegree(int minOutdegree) { + this.minOutdegree = minOutdegree; + } + + /** + * Sets the maximum number of outgoing links that queried articles should have. + * + * @param maxOutdegree The maximum number of outgoing links. + */ + public void setMaxOutdegree(int maxOutdegree) { + this.maxOutdegree = maxOutdegree; + } + + /** + * Sets the minimum number of redirects that queried articles should have. + * + * @param minRedirects The minimum number of redirects. + */ + public void setMinRedirects(int minRedirects) { + this.minRedirects = minRedirects; + } + + /** + * Sets the maximum number of redirects that queried articles should have. + * + * @param maxRedirects The maximum number of redirects. + */ + public void setMaxRedirects(int maxRedirects) { + this.maxRedirects = maxRedirects; + } + + /** + * Sets whether only be articles should be retrieved. + * + * @param onlyArticlePages If set to true, only article pages are returned. + */ + public void setOnlyArticlePages(boolean onlyArticlePages) { + this.onlyArticlePages = onlyArticlePages; + } + + /** + * Sets whether only disambiguation pages should be retrieved. + * + * @param onlyDisambiguationPages If set to true, only disambiguation pages are returned. + */ + public void setOnlyDisambiguationPages(boolean onlyDisambiguationPages) { + this.onlyDisambiguationPages = onlyDisambiguationPages; + } + + /** + * Sets the minimum number of tokens that queried articles should have. 
+ * + * @param minTokens The minimum number of tokens. + */ + public void setMinTokens(int minTokens) { + this.minTokens = minTokens; + } + + /** + * Sets the maximum number of tokens that queried articles should have. + * + * @param maxTokens The maximum number of tokens. + */ + public void setMaxTokens(int maxTokens) { + this.maxTokens = maxTokens; + } + + /** + * Sets a regular expression that pages have to match. + * % for any number of arbitrary characters (can only be used at the end of a string) + * _ for a single arbitrary character (can also be used inside a string) + * + * @param pattern A regular expression pattern. + */ + public void setTitlePattern(String pattern) { + this.titlePattern = pattern; + } + + /** + * @return A string that shows the current values of the query members. + */ + public String getQueryInfo() { + StringBuilder sb = new StringBuilder(); + + sb.append("MaxCategories: ").append(maxCategories).append(LF); + sb.append("MinCategories: ").append(minCategories).append(LF); + sb.append("MaxIndegree: ").append(maxIndegree).append(LF); + sb.append("MinIndegree: ").append(minIndegree).append(LF); + sb.append("MaxOutdegree: ").append(maxOutdegree).append(LF); + sb.append("MinOutdegree: ").append(minOutdegree).append(LF); + sb.append("MaxRedirects: ").append(maxRedirects).append(LF); + sb.append("MinRedirects: ").append(minRedirects).append(LF); + sb.append("MaxTokens: ").append(maxTokens).append(LF); + sb.append("MinTokens: ").append(minTokens).append(LF); + sb.append("Only article pages: ").append(onlyArticlePages).append(LF); + sb.append("Only disambiguation pages: ").append(onlyDisambiguationPages).append(LF); + sb.append("Title pattern: ").append(titlePattern).append(LF); + + return sb.toString(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterable.java index 61bfc7dc..dcf3f6b3 100644 --- 
a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterable.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,171 +22,144 @@ import java.util.Iterator; import java.util.List; -import org.hibernate.Session; - import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; import org.dkpro.jwpl.util.ApiUtilities; import org.dkpro.jwpl.util.StringUtils; +import org.hibernate.Session; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - /** * An iterable over {@link Page} objects selected by a query. 
*/ public class PageQueryIterable implements Iterable { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final Wikipedia wiki; - private final List pageIdList; - - public PageQueryIterable(Wikipedia wiki, PageQuery query) throws WikiApiException { - - this.wiki = wiki; - this.pageIdList = new ArrayList<>(); - - // get a list with all pageIDs of the pages conforming with the query - //TODO change this to a hibernate criteria query - String hql = "select p.pageId from Page as p "; - List conditions = new ArrayList<>(); - if (query.onlyDisambiguationPages()) { - conditions.add("p.isDisambiguation = 1"); - } - if (query.onlyArticlePages()) { - conditions.add("p.isDisambiguation = 0"); - } - if (!"".equals(query.getTitlePattern())) { - conditions.add("p.name like '" + query.getTitlePattern() + "'"); - } - - String conditionString = StringUtils.join(conditions, " AND "); - if (conditionString.length() > 0) { - hql += "where " + conditionString; - } - - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - List idList = session.createQuery(hql, Integer.class).list(); - session.getTransaction().commit(); - - int progress = 0; - for (Integer pageID : idList) { - progress++; - ApiUtilities.printProgressInfo(progress, idList.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "searching " + idList.size() + " pages ... 
"); - - // shortcut to fasten queries that do not have such constraints - if (query.getMaxCategories() == Integer.MAX_VALUE && - query.getMaxIndegree() == Integer.MAX_VALUE && - query.getMaxOutdegree() == Integer.MAX_VALUE && - query.getMaxRedirects() == Integer.MAX_VALUE && - query.getMaxTokens() == Integer.MAX_VALUE && - query.getMinCategories() == 0 && - query.getMinIndegree() == 0 && - query.getMinOutdegree() == 0 && - query.getMinRedirects() == 0 && - query.getMinTokens() == 0) - { - pageIdList.add(pageID); - continue; - } - - Page page = null; - try { - page = wiki.getPage(pageID); - } catch (WikiPageNotFoundException e) { - logger.error("Page with pageID {} could not be found. Fatal error. Terminating.", pageID); - e.printStackTrace(); - System.exit(1); - } - - String[] tokens = page.getPlainText().split(" "); - - if (!(query.getMinIndegree() >= 0 && - query.getMaxIndegree() >= 0 && - query.getMinIndegree() <= query.getMaxIndegree())) { - - query.setMinIndegree(0); - query.setMaxIndegree(Integer.MAX_VALUE); - } - - if (!(query.getMinOutdegree() >= 0 && - query.getMaxOutdegree() >= 0 && - query.getMinOutdegree() <= query.getMaxOutdegree())) { - - query.setMinOutdegree(0); - query.setMaxOutdegree(Integer.MAX_VALUE); - } - - if (!(query.getMinRedirects() >= 0 && - query.getMaxRedirects() >= 0 && - query.getMinRedirects() <= query.getMaxRedirects())) { - - query.setMinRedirects(0); - query.setMaxRedirects(Integer.MAX_VALUE); - } - - if (!(query.getMinCategories() >= 0 && - query.getMaxCategories() >= 0 && - query.getMinCategories() <= query.getMaxCategories())) { - - query.setMinCategories(0); - query.setMaxCategories(Integer.MAX_VALUE); - } - - if (!(query.getMinCategories() >= 0 && - query.getMaxCategories() >= 0 && - query.getMinCategories() <= query.getMaxCategories())) { - - query.setMinCategories(0); - query.setMaxCategories(Integer.MAX_VALUE); - } - - if (!(query.getMinTokens() >= 0 && - query.getMaxTokens() >= 0 && - query.getMinTokens() <= 
query.getMaxTokens())) { - - query.setMinTokens(0); - query.setMaxTokens(Integer.MAX_VALUE); - } - - - int inlinkSize = page.getNumberOfInlinks(); - if (inlinkSize < query.getMinIndegree() || - inlinkSize > query.getMaxIndegree()) { - continue; - } - - int outlinkSize = page.getNumberOfOutlinks(); - if (outlinkSize < query.getMinOutdegree() || - outlinkSize > query.getMaxOutdegree()) { - continue; - } - if (page.getRedirects().size() < query.getMinRedirects() || - page.getRedirects().size() > query.getMaxRedirects()) { - continue; - } - - int categoriesSize = page.getCategories().size(); - if (categoriesSize < query.getMinCategories() || - categoriesSize > query.getMaxCategories()) { - continue; - } - if (tokens.length < query.getMinTokens() || - tokens.length > query.getMaxTokens()) { - continue; - } - - // if still here, add page - pageIdList.add(pageID); - } // for - logger.info("Query selected {} pages.", pageIdList.size()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final Wikipedia wiki; + private final List pageIdList; + + public PageQueryIterable(Wikipedia wiki, PageQuery q) throws WikiApiException { + + this.wiki = wiki; + this.pageIdList = new ArrayList<>(); + + // get a list with all pageIDs of the pages conforming with the query + String hql = "select p.pageId from Page as p "; + List conditions = new ArrayList<>(); + if (q.onlyDisambiguationPages()) { + conditions.add("p.isDisambiguation = 1"); + } + if (q.onlyArticlePages()) { + conditions.add("p.isDisambiguation = 0"); + } + if (!"".equals(q.getTitlePattern())) { + conditions.add("p.name like '" + q.getTitlePattern() + "'"); } - public Iterator iterator() { - return new PageQueryIterator(wiki, pageIdList); + String conditionString = StringUtils.join(conditions, " AND "); + if (conditionString.length() > 0) { + hql += "where " + conditionString; } + + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); 
+ List idList = session.createQuery(hql, Integer.class).list(); + session.getTransaction().commit(); + + int progress = 0; + for (Integer pageID : idList) { + progress++; + ApiUtilities.printProgressInfo(progress, idList.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "searching " + idList.size() + " pages ... "); + + // shortcut to fasten queries that do not have such constraints + if (q.getMaxCategories() == Integer.MAX_VALUE && + q.getMaxIndegree() == Integer.MAX_VALUE && + q.getMaxOutdegree() == Integer.MAX_VALUE && + q.getMaxRedirects() == Integer.MAX_VALUE && + q.getMaxTokens() == Integer.MAX_VALUE && + q.getMinCategories() == 0 && + q.getMinIndegree() == 0 && + q.getMinOutdegree() == 0 && + q.getMinRedirects() == 0 && + q.getMinTokens() == 0) { + pageIdList.add(pageID); + continue; + } + + Page page = null; + try { + page = wiki.getPage(pageID); + } catch (WikiPageNotFoundException e) { + logger.error("Page with pageID {} could not be found. Fatal error. Terminating.", pageID); + e.printStackTrace(); + System.exit(1); + } + + String[] tokens = page.getPlainText().split(" "); + + if (!(q.getMinIndegree() >= 0 && q.getMaxIndegree() >= 0 && q.getMinIndegree() <= q.getMaxIndegree())) { + q.setMinIndegree(0); + q.setMaxIndegree(Integer.MAX_VALUE); + } + + if (!(q.getMinOutdegree() >= 0 && q.getMaxOutdegree() >= 0 && q.getMinOutdegree() <= q.getMaxOutdegree())) { + q.setMinOutdegree(0); + q.setMaxOutdegree(Integer.MAX_VALUE); + } + + if (!(q.getMinRedirects() >= 0 && q.getMaxRedirects() >= 0 && q.getMinRedirects() <= q.getMaxRedirects())) { + q.setMinRedirects(0); + q.setMaxRedirects(Integer.MAX_VALUE); + } + + if (!(q.getMinCategories() >= 0 && q.getMaxCategories() >= 0 && q.getMinCategories() <= q.getMaxCategories())) { + q.setMinCategories(0); + q.setMaxCategories(Integer.MAX_VALUE); + } + + if (!(q.getMinCategories() >= 0 && q.getMaxCategories() >= 0 && q.getMinCategories() <= q.getMaxCategories())) { + q.setMinCategories(0); + 
q.setMaxCategories(Integer.MAX_VALUE); + } + + if (!(q.getMinTokens() >= 0 && q.getMaxTokens() >= 0 && q.getMinTokens() <= q.getMaxTokens())) { + q.setMinTokens(0); + q.setMaxTokens(Integer.MAX_VALUE); + } + + int inlinkSize = page.getNumberOfInlinks(); + if (inlinkSize < q.getMinIndegree() || inlinkSize > q.getMaxIndegree()) { + continue; + } + + int outlinkSize = page.getNumberOfOutlinks(); + if (outlinkSize < q.getMinOutdegree() || outlinkSize > q.getMaxOutdegree()) { + continue; + } + if (page.getRedirects().size() < q.getMinRedirects() || page.getRedirects().size() > q.getMaxRedirects()) { + continue; + } + + int categoriesSize = page.getCategories().size(); + if (categoriesSize < q.getMinCategories() || categoriesSize > q.getMaxCategories()) { + continue; + } + if (tokens.length < q.getMinTokens() || tokens.length > q.getMaxTokens()) { + continue; + } + + // if still here, add page + pageIdList.add(pageID); + } // for + logger.info("Query selected {} pages.", pageIdList.size()); + } + + @Override + public Iterator iterator() { + return new PageQueryIterator(wiki, pageIdList); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterator.java index bf5756d6..7e177dec 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,46 +25,42 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; - /** * An iterator over {@link Page} objects selected by a query. - * */ public class PageQueryIterator implements Iterator { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private final Wikipedia wiki; - private int iterPosition; - private final List pageIDs; + private final Wikipedia wiki; + private int iterPosition; + private final List pageIDs; - public PageQueryIterator(Wikipedia wiki, List pPageIDs) { - this.wiki = wiki; - this.iterPosition = 0; - this.pageIDs = pPageIDs; - } + public PageQueryIterator(Wikipedia wiki, List pPageIDs) { + this.wiki = wiki; + this.iterPosition = 0; + this.pageIDs = pPageIDs; + } - public boolean hasNext() { - if (iterPosition < this.pageIDs.size()) { - return true; - } - else { - return false; - } - } + @Override + public boolean hasNext() { + return iterPosition < this.pageIDs.size(); + } - public Page next() { - Page page = null; - try { - page = this.wiki.getPage(pageIDs.get(iterPosition)); - } catch (WikiApiException e) { - logger.error("Could not load page with id {}", pageIDs.get(iterPosition), e); - } - iterPosition++; - return page; + @Override + public Page next() { + Page page = null; + try { + page = this.wiki.getPage(pageIDs.get(iterPosition)); + } catch (WikiApiException e) { + logger.error("Could not load page with id {}", pageIDs.get(iterPosition), e); } + iterPosition++; + return page; + } - public void remove() { - throw new UnsupportedOperationException(); - } + @Override + public void remove() { + throw new 
UnsupportedOperationException(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageTitleComparator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageTitleComparator.java index 00d295ff..97048a38 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageTitleComparator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageTitleComparator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,18 +23,17 @@ /** * Compares two pages based on the lexicographic ordering of their titles. 
- * */ public class PageTitleComparator implements Comparator { - public int compare(Page o1, Page o2) { + public int compare(Page o1, Page o2) { - int retVal = 0; - try { - retVal = o1.getTitle().getPlainTitle().compareTo(o2.getTitle().getPlainTitle()); - } catch (WikiTitleParsingException e) { - e.printStackTrace(); - } - return retVal; + int retVal = 0; + try { + retVal = o1.getTitle().getPlainTitle().compareTo(o2.getTitle().getPlainTitle()); + } catch (WikiTitleParsingException e) { + e.printStackTrace(); } + return retVal; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Title.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Title.java index 59e50a89..a47dc1ac 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Title.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Title.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,160 +20,155 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.dkpro.jwpl.api.exception.WikiTitleParsingException; import org.apache.commons.lang3.StringUtils; +import org.dkpro.jwpl.api.exception.WikiTitleParsingException; /** * Represents a Wikipedia page title. *

* Title parsing regexp fixed with the help of many UKP colleagues and Samy Ateia. - * */ public class Title { - private final String wikiStyleTitle; - private final String plainTitle; - private final String entity; - private final String disambiguationText; - private final String rawTitleText; - private final String sectionText; - - /** - * Create a {@link Title} object using a title string. - * The string gets parsed into an entity part and a disambiguation part. - * As Wikipedia page names represent spaces as underscores, we create a version with spaces and one without. - * - * @param titleText The title string of the page. - * @throws WikiTitleParsingException Thrown if errors occurred during sanitation of the {@code titleText}. - */ - public Title(String titleText) throws WikiTitleParsingException { - if (titleText.length() == 0) { - throw new WikiTitleParsingException("Title is empty."); - } - - /* - * Do not convert first character to upper case. We perform case insensitive querying - */ - if (titleText.substring(0, 1).toLowerCase().equals(titleText.substring(0, 1))) { - this.rawTitleText = titleText.substring(0,1).toUpperCase() + titleText.substring(1,titleText.length()); - } - else { - this.rawTitleText = titleText; - } - - // "Car_(automobile)#Introduction" - // should be split into: - // - "Car" - // - "automobile" - // - "Introduction" - - String titlePart; - String sectionPart = null; - if (rawTitleText.contains("#")) { - titlePart = rawTitleText.substring(0, rawTitleText.lastIndexOf("#")); - sectionPart = rawTitleText.substring(rawTitleText.lastIndexOf("#")+1,rawTitleText.length()); - } - else { - titlePart = rawTitleText; - } - - this.sectionText = sectionPart; - - String regexFindParts = "(.*?)[ _]\\((.+?)\\)$"; - - Pattern patternNamespace = Pattern.compile(regexFindParts); - Matcher matcherNamespace = patternNamespace.matcher( - this.decodeTitleWikistyle(titlePart) - ); - - // group 0 is the whole match - if (matcherNamespace.find()) { - 
this.entity = matcherNamespace.group(1); - this.disambiguationText = matcherNamespace.group(2); - - String relevantTitleParts = this.entity + " (" + this.disambiguationText + ")"; - this.plainTitle = decodeTitleWikistyle(relevantTitleParts); - this.wikiStyleTitle = encodeTitleWikistyle(relevantTitleParts); - } - else { - this.plainTitle = decodeTitleWikistyle(titlePart); - this.wikiStyleTitle = encodeTitleWikistyle(titlePart); - this.entity = this.plainTitle; - this.disambiguationText = null; - } - - if (StringUtils.isEmpty(getEntity())) { - throw new WikiTitleParsingException("Title was not properly initialized."); - } - } - - /** - * Encodes a plain title string to wiki-style. - *

- * Page titles in Wikipedia are encoded in a way that URLs containing the title are valid. - * Title strings entered by users normally do not conform to this wiki-style encoding. - * - * @param pTitle The string to encode. Must not be {@code null}. - * @return The wiki-style encoded string. - */ - private String encodeTitleWikistyle(String pTitle) { - return pTitle.replace(' ', '_'); - } - - /** - * Decodes a wiki-style title string to plain text. - *

- * Page titles in Wikipedia are encoded in a way that URLs containing the title are valid. - * Title strings entered by users normally do not conform to this wiki-style encoding. - * - * @param pTitle The string to decode. Must not be {@code null}. - * @return The decoded string. - */ - private String decodeTitleWikistyle(String pTitle) { - return pTitle.replace('_', ' '); - } - - /** - * @return The disambiguation text of a page title (i.e., the part in parentheses following the page's name). - */ - public String getDisambiguationText() { - return disambiguationText; + private static final Pattern PATTERN_NAMESPACE = Pattern.compile("(.*?)[ _]\\((.+?)\\)$"); + + private final String wikiStyleTitle; + private final String plainTitle; + private final String entity; + private final String disambiguationText; + private final String rawTitleText; + private final String sectionText; + + /** + * Create a {@link Title} object using a title string. + * The string gets parsed into an entity part and a disambiguation part. + * As Wikipedia page names represent spaces as underscores, we create a version with spaces and one without. + * + * @param titleText The title string of the page. + * @throws WikiTitleParsingException Thrown if errors occurred during sanitation of the {@code titleText}. + */ + public Title(String titleText) throws WikiTitleParsingException { + if (titleText.length() == 0) { + throw new WikiTitleParsingException("Title is empty."); } - /** - * @return The name of the entity (i.e. the page's title *without* disambiguation string). + /* + * Do not convert first character to upper case. 
We perform case insensitive querying */ - public String getEntity() { - return entity; + if (titleText.substring(0, 1).toLowerCase().equals(titleText.substring(0, 1))) { + this.rawTitleText = titleText.substring(0, 1).toUpperCase() + titleText.substring(1, titleText.length()); + } else { + this.rawTitleText = titleText; } - /** - * @return The plain title, without wikistyle underscores replacing spaces. - */ - public String getPlainTitle() { - return plainTitle; - } - - /** - * @return Returns the section part of a link "Article (Disambiguation)#Section". - */ - public String getSectionText() { - return sectionText; - } - - /** - * @return The wikistyle title, with spaces replaced by underscores. - */ - public String getWikiStyleTitle() { - return wikiStyleTitle; + // "Car_(automobile)#Introduction" + // should be split into: + // - "Car" + // - "automobile" + // - "Introduction" + + String titlePart; + String sectionPart = null; + if (rawTitleText.contains("#")) { + titlePart = rawTitleText.substring(0, rawTitleText.lastIndexOf("#")); + sectionPart = rawTitleText.substring(rawTitleText.lastIndexOf("#") + 1, rawTitleText.length()); + } else { + titlePart = rawTitleText; } - protected String getRawTitleText() { - return rawTitleText; + this.sectionText = sectionPart; + + Matcher matcherNamespace = PATTERN_NAMESPACE.matcher( + this.decodeTitleWikistyle(titlePart) + ); + + // group 0 is the whole match + if (matcherNamespace.find()) { + this.entity = matcherNamespace.group(1); + this.disambiguationText = matcherNamespace.group(2); + + String relevantTitleParts = this.entity + " (" + this.disambiguationText + ")"; + this.plainTitle = decodeTitleWikistyle(relevantTitleParts); + this.wikiStyleTitle = encodeTitleWikistyle(relevantTitleParts); + } else { + this.plainTitle = decodeTitleWikistyle(titlePart); + this.wikiStyleTitle = encodeTitleWikistyle(titlePart); + this.entity = this.plainTitle; + this.disambiguationText = null; } - @Override - public String toString() { - 
return getPlainTitle(); + if (StringUtils.isEmpty(getEntity())) { + throw new WikiTitleParsingException("Title was not properly initialized."); } + } + + /** + * Encodes a plain title string to wiki-style. + *

+ * Page titles in Wikipedia are encoded in a way that URLs containing the title are valid. + * Title strings entered by users normally do not conform to this wiki-style encoding. + * + * @param pTitle The string to encode. Must not be {@code null}. + * @return The wiki-style encoded string. + */ + private String encodeTitleWikistyle(String pTitle) { + return pTitle.replace(' ', '_'); + } + + /** + * Decodes a wiki-style title string to plain text. + *

+ * Page titles in Wikipedia are encoded in a way that URLs containing the title are valid. + * Title strings entered by users normally do not conform to this wiki-style encoding. + * + * @param pTitle The string to decode. Must not be {@code null}. + * @return The decoded string. + */ + private String decodeTitleWikistyle(String pTitle) { + return pTitle.replace('_', ' '); + } + + /** + * @return The disambiguation text of a page title (i.e., the part in parentheses following the page's name). + */ + public String getDisambiguationText() { + return disambiguationText; + } + + /** + * @return The name of the entity (i.e. the page's title *without* disambiguation string). + */ + public String getEntity() { + return entity; + } + + /** + * @return The plain title, without wikistyle underscores replacing spaces. + */ + public String getPlainTitle() { + return plainTitle; + } + + /** + * @return Returns the section part of a link "Article (Disambiguation)#Section". + */ + public String getSectionText() { + return sectionText; + } + + /** + * @return The wikistyle title, with spaces replaced by underscores. + */ + public String getWikiStyleTitle() { + return wikiStyleTitle; + } + + protected String getRawTitleText() { + return rawTitleText; + } + + @Override + public String toString() { + return getPlainTitle(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterable.java index e289da4a..3a2ecf22 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterable.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,35 +19,34 @@ import java.util.Iterator; - /** - * An iterable over all titles. - * + * An {@link Iterable} over {@link Title} objects. */ public class TitleIterable implements Iterable { - private final Wikipedia wiki; - - /** - * The size of the title buffer. - * With bufferSize = 1, a database connection is needed for retrieving a single title. - * Higher bufferSize gives better performance, but needs memory. - * Initialize it with 5000. - */ - private int bufferSize = 5000; - - public TitleIterable(Wikipedia wiki) { - this.wiki = wiki; - } - - public TitleIterable(Wikipedia wiki, int bufferSize) { - this.wiki = wiki; - this.bufferSize = bufferSize; - } - - public Iterator<Title> iterator() { - return new TitleIterator(wiki, bufferSize); - } + private final Wikipedia wiki; + + /* + * The size of the title buffer. + * With bufferSize = 1, a database connection is needed for retrieving a single title. + * Higher bufferSize gives better performance, but needs memory. + * Initialize it with 5000. 
+ */ + private int bufferSize = 5000; + + public TitleIterable(Wikipedia wiki) { + this.wiki = wiki; + } + + public TitleIterable(Wikipedia wiki, int bufferSize) { + this.wiki = wiki; + this.bufferSize = bufferSize; + } + + @Override + public Iterator<Title> iterator() { + return new TitleIterator(wiki, bufferSize); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterator.java index 6faeccc2..0a8a96a6 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,131 +21,127 @@ import java.util.Iterator; import java.util.List; -import org.hibernate.Session; - import org.dkpro.jwpl.api.exception.WikiTitleParsingException; +import org.hibernate.Session; /** - * An iterator over category objects. - * + * An {@link Iterator} over {@link Title} objects. 
*/ public class TitleIterator implements Iterator<Title> { - private final TitleBuffer buffer; - - public TitleIterator(Wikipedia wiki, int bufferSize) { - buffer = new TitleBuffer(bufferSize, wiki); + private final TitleBuffer buffer; + + public TitleIterator(Wikipedia wiki, int bufferSize) { + buffer = new TitleBuffer(bufferSize, wiki); + } + + @Override + public boolean hasNext() { + return buffer.hasNext(); + } + + @Override + public Title next() { + return buffer.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * Buffers titles in a list. + */ + static class TitleBuffer { + + private final Wikipedia wiki; + + private final List<String> titleStringBuffer; + private final int maxBufferSize; // the number of pages to be buffered after a query to the database. + private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements + private int bufferOffset; // the offset in the buffer + private int dataOffset; // the overall offset in the data + + public TitleBuffer(int bufferSize, Wikipedia wiki) { + this.maxBufferSize = bufferSize; + this.wiki = wiki; + this.titleStringBuffer = new ArrayList<>(); + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.dataOffset = 0; } - public boolean hasNext(){ - return buffer.hasNext(); + /** + * If there are elements in the buffer left, then return true. + * If the end of the filled buffer is reached, then try to load new buffer. + * + * @return True, if there are pages left. False otherwise. + */ + public boolean hasNext() { + if (bufferOffset < bufferFillSize) { + return true; + } else { + return this.fillBuffer(); + } } - public Title next(){ - return buffer.next(); + /** + * @return The next Title or null if no more categories are available. 
+ */ + public Title next() { + // if there are still elements in the buffer, just retrieve the next one + if (bufferOffset < bufferFillSize) { + return this.getBufferElement(); + } + // if there are no more elements => try to fill a new buffer + else if (this.fillBuffer()) { + return this.getBufferElement(); + } else { + // if it cannot be filled => return null + return null; + } } - public void remove() { - throw new UnsupportedOperationException(); + private Title getBufferElement() { + String titleString = titleStringBuffer.get(bufferOffset); + Title title = null; + try { + title = new Title(titleString); + } catch (WikiTitleParsingException e) { + e.printStackTrace(); + } + bufferOffset++; + dataOffset++; + return title; } - /** - * Buffers titles in a list. - * - * - */ - class TitleBuffer { - - private final Wikipedia wiki; - - private final List<String> titleStringBuffer; - private final int maxBufferSize; // the number of pages to be buffered after a query to the database. - private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements - private int bufferOffset; // the offset in the buffer - private int dataOffset; // the overall offset in the data - - public TitleBuffer(int bufferSize, Wikipedia wiki){ - this.maxBufferSize = bufferSize; - this.wiki = wiki; - this.titleStringBuffer = new ArrayList<>(); - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.dataOffset = 0; - } - - /** - * If there are elements in the buffer left, then return true. - * If the end of the filled buffer is reached, then try to load new buffer. - * @return True, if there are pages left. False otherwise. - */ - public boolean hasNext(){ - if (bufferOffset < bufferFillSize) { - return true; - } - else { - return this.fillBuffer(); - } - } - - /** - * - * @return The next Title or null if no more categories are available. 
- */ - public Title next(){ - // if there are still elements in the buffer, just retrieve the next one - if (bufferOffset < bufferFillSize) { - return this.getBufferElement(); - } - // if there are no more elements => try to fill a new buffer - else if (this.fillBuffer()) { - return this.getBufferElement(); - } - else { - // if it cannot be filled => return null - return null; - } - } - - private Title getBufferElement() { - String titleString = titleStringBuffer.get(bufferOffset); - Title title = null; - try { - title = new Title(titleString); - } catch (WikiTitleParsingException e) { - e.printStackTrace(); - } - bufferOffset++; - dataOffset++; - return title; - } - - private boolean fillBuffer() { - - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - final String sql = "select p.name from PageMapLine as p"; - List<String> returnList = session.createNativeQuery(sql, String.class) - .setFirstResult(dataOffset) - .setMaxResults(maxBufferSize) - .setFetchSize(maxBufferSize) - .list(); - session.getTransaction().commit(); - - // clear the old buffer and all variables regarding the state of the buffer - titleStringBuffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; - - titleStringBuffer.addAll(returnList); - - if (titleStringBuffer.size() > 0) { - bufferFillSize = titleStringBuffer.size(); - return true; - } - else { - return false; - } - } - + private boolean fillBuffer() { + + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + final String sql = "select p.name from PageMapLine as p"; + List<String> returnList = session.createNativeQuery(sql, String.class) + .setFirstResult(dataOffset) + .setMaxResults(maxBufferSize) + .setFetchSize(maxBufferSize) + .list(); + session.getTransaction().commit(); + + // clear the old buffer and all variables regarding the state of the buffer + titleStringBuffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; + + titleStringBuffer.addAll(returnList); + + if 
(titleStringBuffer.size() > 0) { + bufferFillSize = titleStringBuffer.size(); + return true; + } else { + return false; + } } + + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java index ac51b82d..b5c2539f 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,314 +22,316 @@ import org.sweble.wikitext.engine.utils.DefaultConfigEnWp; import org.sweble.wikitext.engine.utils.LanguageConfigGenerator; import org.xml.sax.SAXException; + import javax.xml.parsers.ParserConfigurationException; import java.io.IOException; import java.util.List; public interface WikiConstants { - /** - * Shortcut for System.getProperty("line.separator"). - */ - String LF = System.getProperty("line.separator"); + /** + * Shortcut for System.getProperty("line.separator"). 
+ */ + String LF = System.getProperty("line.separator"); - /** - * The prefix that is added to page titles of discussion pages - * Has to be the same as in wikipedia.datamachine:SingleDumpVersionJDKGeneric - */ - String DISCUSSION_PREFIX = "Discussion:"; + /** + * The prefix that is added to page titles of discussion pages + * Has to be the same as in wikipedia.datamachine:SingleDumpVersionJDKGeneric + */ + String DISCUSSION_PREFIX = "Discussion:"; - /** - * Configuration file for the Sweble parser - */ - String SWEBLE_CONFIG = "classpath:/org/sweble/wikitext/engine/SimpleWikiConfiguration.xml"; + /** + * Configuration file for the Sweble parser + */ + String SWEBLE_CONFIG = "classpath:/org/sweble/wikitext/engine/SimpleWikiConfiguration.xml"; + + /** + * Enumerates the languages for which Wikipedia APIs are available. + * A Wikipedia object can be created using one of these languages. + */ + // Languages should be lowercase and match the corresponding snowball stemmer names. + enum Language { + abkhazian, + afar, + afrikaans, + akan, + albanian, + alemannic, + amharic, + anglo_saxon, + arabic, + aragonese, + armenian, + aromanian, + assamese, + assyrian_neo_aramaic, + asturian, + avar, + aymara, + azeri, + bambara, + banyumasan, + bashkir, + basque, + bavarian, + belarusian, + belarusian_tarashkevitsa, + bengali, + bihari, + bishnupriya_manipuri, + bislama, + bosnian, + breton, + buginese, + bulgarian, + burmese, + buryat_russia, + cantonese, + catalan, + cebuano, + central_bicolano, + chamorro, + chechen, + cherokee, + cheyenne, + chichewa, + chinese, + choctaw, + chuvash, + classical_chinese, + cornish, + corsican, + cree, + crimean_tatar, + croatian, + czech, + danish, + divehi, + dutch, + dutch_low_saxon, + dzongkha, + emilian_romagnol, + english, + esperanto, + estonian, + ewe, + faroese, + fijian, + finnish, + franco_provencal_arpitan, + french, + friulian, + fula, + galician, + georgian, + german, + gilaki, + gothic, + greek, + greenlandic, + guarani, + 
gujarati, + haitian, + hakka, + hausa, + hawaiian, + hebrew, + herero, + hindi, + hiri_motu, + hungarian, + icelandic, + ido, + igbo, + ilokano, + indonesian, + interlingua, + interlingue, + inuktitut, + inupiak, + irish, + italian, + japanese, + javanese, + kabyle, + kalmyk, + kannada, + kanuri, + kapampangan, + kashmiri, + kashubian, + kazakh, + khmer, + kikuyu, + kinyarwanda, + kirghiz, + kirundi, + klingon, + komi, + kongo, + korean, + kuanyama, + kurdish, + ladino, + lak, + lao, + latin, + latvian, + ligurian, + limburgian, + lingala, + lithuanian, + lojban, + lombard, + low_saxon, + lower_sorbian, + luganda, + luxembourgish, + macedonian, + malagasy, + malay, + malayalam, + maltese, + manx, + maori, + marathi, + marshallese, + mazandarani, + min_dong, + min_nan, + moldovan, + mongolian, + muscogee, + nahuatl, + nauruan, + navajo, + ndonga, + neapolitan, + nepali, + newar_nepal_bhasa, + norfolk, + norman, + northern_sami, + norwegian_bokmal, + norwegian_nynorsk, + novial, + occitan, + old_church_slavonic, + oriya, + oromo, + ossetian, + pali, + pangasinan, + papiamentu, + pashto, + pennsylvania_german, + persian, + piedmontese, + polish, + portuguese, + punjabi, + quechua, + ripuarian, + romani, + romanian, + romansh, + russian, + samoan, + samogitian, + sango, + sanskrit, + sardinian, + saterland_frisian, + scots, + scottish_gaelic, + serbian, + serbo_croatian, + sesotho, + shona, + sichuan_yi, + sicilian, + simple_english, + sindhi, + sinhalese, + slovak, + slovenian, + somali, + spanish, + sundanese, + swahili, + swati, + swedish, + tagalog, + tahitian, + tajik, + tamil, + tarantino, + tatar, + telugu, + tetum, + thai, + tibetan, + tigrinya, + tok_pisin, + tokipona, + tongan, + tsonga, + tswana, + tumbuka, + turkish, + turkmen, + twi, + udmurt, + ukrainian, + upper_sorbian, + urdu, + uyghur, + uzbek, + venda, + venetian, + vietnamese, + volapuek, + voro, + walloon, + waray_waray, + welsh, + west_flemish, + west_frisian, + wolof, + wu, + xhosa, + yiddish, + 
yoruba, + zamboanga_chavacano, + zazaki, + zealandic, + zhuang, + zulu, + _test; /** - * Enumerates the languages for which Wikipedia APIs are available. - * A Wikipedia object can be created using one of these languages. + * Configures a language specific configuration for parsing wikipedia pages. + * + * @return WikiConfig */ - // Languages should be lowercase and match the corresponding snowball stemmer names. - enum Language { - abkhazian, - afar, - afrikaans, - akan, - albanian, - alemannic, - amharic, - anglo_saxon, - arabic, - aragonese, - armenian, - aromanian, - assamese, - assyrian_neo_aramaic, - asturian, - avar, - aymara, - azeri, - bambara, - banyumasan, - bashkir, - basque, - bavarian, - belarusian, - belarusian_tarashkevitsa, - bengali, - bihari, - bishnupriya_manipuri, - bislama, - bosnian, - breton, - buginese, - bulgarian, - burmese, - buryat_russia, - cantonese, - catalan, - cebuano, - central_bicolano, - chamorro, - chechen, - cherokee, - cheyenne, - chichewa, - chinese, - choctaw, - chuvash, - classical_chinese, - cornish, - corsican, - cree, - crimean_tatar, - croatian, - czech, - danish, - divehi, - dutch, - dutch_low_saxon, - dzongkha, - emilian_romagnol, - english, - esperanto, - estonian, - ewe, - faroese, - fijian, - finnish, - franco_provencal_arpitan, - french, - friulian, - fula, - galician, - georgian, - german, - gilaki, - gothic, - greek, - greenlandic, - guarani, - gujarati, - haitian, - hakka, - hausa, - hawaiian, - hebrew, - herero, - hindi, - hiri_motu, - hungarian, - icelandic, - ido, - igbo, - ilokano, - indonesian, - interlingua, - interlingue, - inuktitut, - inupiak, - irish, - italian, - japanese, - javanese, - kabyle, - kalmyk, - kannada, - kanuri, - kapampangan, - kashmiri, - kashubian, - kazakh, - khmer, - kikuyu, - kinyarwanda, - kirghiz, - kirundi, - klingon, - komi, - kongo, - korean, - kuanyama, - kurdish, - ladino, - lak, - lao, - latin, - latvian, - ligurian, - limburgian, - lingala, - lithuanian, - lojban, - 
lombard, - low_saxon, - lower_sorbian, - luganda, - luxembourgish, - macedonian, - malagasy, - malay, - malayalam, - maltese, - manx, - maori, - marathi, - marshallese, - mazandarani, - min_dong, - min_nan, - moldovan, - mongolian, - muscogee, - nahuatl, - nauruan, - navajo, - ndonga, - neapolitan, - nepali, - newar_nepal_bhasa, - norfolk, - norman, - northern_sami, - norwegian_bokmal, - norwegian_nynorsk, - novial, - occitan, - old_church_slavonic, - oriya, - oromo, - ossetian, - pali, - pangasinan, - papiamentu, - pashto, - pennsylvania_german, - persian, - piedmontese, - polish, - portuguese, - punjabi, - quechua, - ripuarian, - romani, - romanian, - romansh, - russian, - samoan, - samogitian, - sango, - sanskrit, - sardinian, - saterland_frisian, - scots, - scottish_gaelic, - serbian, - serbo_croatian, - sesotho, - shona, - sichuan_yi, - sicilian, - simple_english, - sindhi, - sinhalese, - slovak, - slovenian, - somali, - spanish, - sundanese, - swahili, - swati, - swedish, - tagalog, - tahitian, - tajik, - tamil, - tarantino, - tatar, - telugu, - tetum, - thai, - tibetan, - tigrinya, - tok_pisin, - tokipona, - tongan, - tsonga, - tswana, - tumbuka, - turkish, - turkmen, - twi, - udmurt, - ukrainian, - upper_sorbian, - urdu, - uyghur, - uzbek, - venda, - venetian, - vietnamese, - volapuek, - voro, - walloon, - waray_waray, - welsh, - west_flemish, - west_frisian, - wolof, - wu, - xhosa, - yiddish, - yoruba, - zamboanga_chavacano, - zazaki, - zealandic, - zhuang, - zulu, - _test; - - /** - * Configures a language specific configuration for parsing wikipedia pages. - * @return WikiConfig - */ - public WikiConfig getWikiconfig(Language this) { - WikiConfig config = DefaultConfigEnWp.generate(); - if (this != Language._test) { - // We need to capitalize the language name otherwise the locale lib cannot find it. 
- String langName = this.name().substring(0, 1).toUpperCase() + this.name().substring(1); - try { - List<LanguageCode> langCodes = LanguageCode.findByName(langName); - if (!langCodes.isEmpty()) { - String langCode = langCodes.get(0).name(); - return LanguageConfigGenerator.generateWikiConfig(langCode); - } - } catch (IOException | ParserConfigurationException | SAXException e) { - System.out.println( - String.format("Failed to create WikiConfig for language for %s, using default instead", - langName) - ); - } - } - return config; + public WikiConfig getWikiconfig(Language this) { + WikiConfig config = DefaultConfigEnWp.generate(); + if (this != Language._test) { + // We need to capitalize the language name otherwise the locale lib cannot find it. + String langName = this.name().substring(0, 1).toUpperCase() + this.name().substring(1); + try { + List<LanguageCode> langCodes = LanguageCode.findByName(langName); + if (!langCodes.isEmpty()) { + String langCode = langCodes.get(0).name(); + return LanguageConfigGenerator.generateWikiConfig(langCode); + } + } catch (IOException | ParserConfigurationException | SAXException e) { + System.out.println( + String.format("Failed to create WikiConfig for language for %s, using default instead", + langName) + ); } + } + return config; } + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java index 39e86d93..11d38632 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -18,11 +18,16 @@ package org.dkpro.jwpl.api; import java.lang.invoke.MethodHandles; -import java.util.*; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; import java.util.Map.Entry; - -import org.hibernate.Session; -import org.hibernate.type.StandardBasicTypes; +import java.util.Set; +import java.util.TreeSet; import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiInitializationException; @@ -30,785 +35,779 @@ import org.dkpro.jwpl.api.exception.WikiTitleParsingException; import org.dkpro.jwpl.api.hibernate.WikiHibernateUtil; import org.dkpro.jwpl.util.distance.LevenshteinStringDistance; +import org.hibernate.Session; +import org.hibernate.type.StandardBasicTypes; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.sweble.wikitext.engine.config.WikiConfig; - /** * Provides access to Wikipedia articles and categories. - * */ // TODO better JavaDocs! public class Wikipedia implements WikiConstants { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - // Note well: The whitespace at the beginning of this constant is here on purpose. Do NOT remove it! 
- static final String SQL_COLLATION = " COLLATE utf8mb4_bin"; /*" COLLATE utf8_bin";*/ - - private final Language language; - private final DatabaseConfiguration dbConfig; - - /** - * A mapping from page pageIDs to hibernateIDs. - * It is a kind of cache. It is only filled, if a pageID was previously accessed. - * The wikiapi startup time is way too long otherwise. */ - private final Map<Integer, Long> idMapPages; - - /** - * A mapping from categories pageIDs to hibernateIDs. - * It is a kind of cache. It is only filled, if a pageID was previously accessed. - * The wikiapi startup time is way too long otherwise. */ - private final Map<Integer, Long> idMapCategories; - - private final MetaData metaData; - - // Note: This should only be accessed internally. - private final WikiConfig wikiConfig; - /** - * Creates a new {@link Wikipedia} object accessing the database indicated by the dbConfig parameter. - * @param dbConfig A {@link DatabaseConfiguration} object telling the {@link Wikipedia} object where the data is stored and how it can be accessed. - * @throws WikiInitializationException Thrown if errors occurred while bootstrapping the {@link Wikipedia} instance. - */ - public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationException { + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - logger.trace("Creating Wikipedia object."); + // Note well: The whitespace at the beginning of this constant is here on purpose. Do NOT remove it! + static final String SQL_COLLATION = " COLLATE utf8mb4_bin"; /*" COLLATE utf8_bin";*/ - this.language = dbConfig.getLanguage(); - this.dbConfig = dbConfig; + private final Language language; + private final DatabaseConfiguration dbConfig; - this.idMapPages = new HashMap<>(); - this.idMapCategories = new HashMap<>(); + /* + * A mapping from page pageIDs to hibernateIDs. + * It is a kind of cache. It is only filled, if a pageID was previously accessed. 
+ * The wikiapi startup time is way too long otherwise. + */ + private final Map<Integer, Long> idMapPages; - this.metaData = new MetaData(this); - this.wikiConfig = this.language.getWikiconfig(); + /* + * A mapping from categories pageIDs to hibernateIDs. + * It is a kind of cache. It is only filled, if a pageID was previously accessed. + * The wikiapi startup time is way too long otherwise. + */ + private final Map<Integer, Long> idMapCategories; - if(dbConfig.supportsCollation()) { - logger.info("Wikipedia database backend supports character collation features."); - } else { - logger.debug("Wikipedia database backend does NOT support character collation features."); - } - } + private final MetaData metaData; - WikiConfig getWikConfig() { - return wikiConfig; - } + // Note: This should only be accessed internally. + private final WikiConfig wikiConfig; - /** - * Gets the page with the given title. - * If the title is a redirect, the corresponding page is returned.<br> - * If the title start with a lowercase letter it converts it to an uppercase letter, as each Wikipedia article title starts with an uppercase letter. - * Spaces in the title are converted to underscores, as this is a convention for Wikipedia article titles. - * <p> - * For example, the article "Steam boat" could be queried with - * - "Steam boat" - * - "steam boat" - * - "Steam_boat" - * - "steam_boat" - * and additionally all redirects that might point to that article. - * - * @param title The title of the page. - * @return The page object for a given title. - * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly parsed. - */ - public Page getPage(String title) throws WikiApiException { - return new Page(this, title, false); - } + /** + * Creates a new {@link Wikipedia} object accessing the database indicated by the dbConfig parameter. 
+ * + * @param dbConfig A {@link DatabaseConfiguration} object telling the {@link Wikipedia} object + * where the data is stored and how it can be accessed. + * @throws WikiInitializationException Thrown if errors occurred while bootstrapping the {@link Wikipedia} instance. + */ + public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationException { - /** - * Gets the page with the exactly the given title.<br> - * - * Note that when using this method you are responsible for converting a normal search string into the right wiki-style.<br> - * - * If the title is a redirect, the corresponding page is returned.<br> - * - * @param exactTitle The exact title of the page. - * @return The page object for a given title. - * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly parsed. - */ - public Page getPageByExactTitle(String exactTitle) throws WikiApiException { - return new Page(this, exactTitle, true); - } + logger.trace("Creating Wikipedia object."); - /** - * Get all pages which match all lowercase/uppercase version of the given title.<br> - * If the title is a redirect, the corresponding page is returned.<br> - * Spaces in the title are converted to underscores, as this is a convention for Wikipedia article titles. - * - * @param title The title of the page. - * @return A set of page objects matching this title. - * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly parsed. - */ - public Set<Page> getPages(String title) throws WikiApiException { - Set<Integer> ids = new HashSet<>(getPageIdsCaseInsensitive(title)); - - Set<Page> pages = new HashSet<>(); - for (Integer id : ids) { - pages.add(new Page(this, id)); - } - return pages; - } + this.language = dbConfig.getLanguage(); + this.dbConfig = dbConfig; - /** - * Gets the page for a given pageId. - * - * @param pageId The id of the page. - * @return The page object for a given pageId. 
- * @throws WikiApiException Thrown if errors occurred. - */ - public Page getPage(int pageId) throws WikiApiException { - return new Page(this, pageId); - } + this.idMapPages = new HashMap<>(); + this.idMapCategories = new HashMap<>(); - /** - * Gets the title for a given pageId. - * - * @param pageId The id of the page. - * @return The title for the given pageId. - * @throws WikiApiException Thrown if errors occurred. - */ - public Title getTitle(int pageId) throws WikiApiException { - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select p.name from PageMapLine as p where p.pageId= :pId"; - String returnValue = session.createNativeQuery(sql, String.class) - .setParameter("pId", pageId, StandardBasicTypes.INTEGER) - .uniqueResult(); - session.getTransaction().commit(); - - if(returnValue == null){ - throw new WikiPageNotFoundException(); - } - return new Title(returnValue); - } + this.metaData = new MetaData(this); + this.wikiConfig = this.language.getWikiconfig(); - /** - * Gets the page ids for a given title.<br> - * - * - * @param title The title of the page. - * @return The id for the page with the given title. - * @throws WikiApiException Thrown if errors occurred. 
- */ - public List<Integer> getPageIds(String title) throws WikiApiException { - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select p.pageID from PageMapLine as p where p.name = :pName"; - Iterator<Integer> results = session.createQuery(sql, Integer.class) - .setParameter("pName", title, StandardBasicTypes.STRING) - .list().iterator(); - - session.getTransaction().commit(); - - if(!results.hasNext()){ - throw new WikiPageNotFoundException(); - } - List<Integer> resultList = new LinkedList<>(); - while(results.hasNext()){ - resultList.add(results.next()); - } - return resultList; + if(dbConfig.supportsCollation()) { + logger.info("Wikipedia database backend supports character collation features."); + } else { + logger.debug("Wikipedia database backend does NOT support character collation features."); } + } - /** - * Gets the page ids for a given title with case insensitive matching.<br> - * - * - * @param title The title of the page. - * @return The ids of the pages with the given title. - * @throws WikiApiException Thrown if errors occurred. - */ - public List<Integer> getPageIdsCaseInsensitive(String title) throws WikiApiException { - title = title.toLowerCase(); - title = title.replaceAll(" ", "_"); - - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select p.pageID from PageMapLine as p where lower(p.name) = :pName"; - Iterator<Integer> results = session.createQuery(sql, Integer.class) - .setParameter("pName", title, StandardBasicTypes.STRING) - .list().iterator(); - - session.getTransaction().commit(); - - if(!results.hasNext()){ - throw new WikiPageNotFoundException(); - } - List<Integer> resultList = new LinkedList<>(); - while(results.hasNext()){ - resultList.add(results.next()); - } - return resultList; - } + WikiConfig getWikConfig() { + return wikiConfig; + } - /** - * Returns the article page for a given discussion page. 
- * - * @param discussionPage - * the discussion page object - * @return The page object of the article associated with the discussion. If - * the parameter already was an article, it is returned directly. - * @throws WikiApiException Thrown if errors occurred. - */ - public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiException { - if(discussionPage.isDiscussion()){ - String title = discussionPage.getTitle().getPlainTitle().replaceAll(WikiConstants.DISCUSSION_PREFIX, ""); - - if(title.contains("/")){ - //If we have a discussion archive - //TODO This does not support articles that contain slashes- - //However, the rest of the API cannot cope with that as well, so this should not be any extra trouble - title = title.split("/")[0]; - } - return getPage(title); - }else{ - return discussionPage; - } + /** + * Gets the page with the given title. + * If the title is a redirect, the corresponding page is returned.<br> + * If the title start with a lowercase letter it converts it to an uppercase letter, as each Wikipedia article title starts with an uppercase letter. + * Spaces in the title are converted to underscores, as this is a convention for Wikipedia article titles. + * <p> + * For example, the article "Steam boat" could be queried with + * - "Steam boat" + * - "steam boat" + * - "Steam_boat" + * - "steam_boat" + * and additionally all redirects that might point to that article. + * + * @param title The title of the page. + * @return The page object for a given title. + * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly parsed. 
+ */ + public Page getPage(String title) throws WikiApiException { + return new Page(this, title, false); + } - } + /** + * Gets the page with exactly the given title.<br> + * + * Note that when using this method you are responsible for converting a normal search string into the right wiki-style.<br> + * + * If the title is a redirect, the corresponding page is returned.<br> + * + * @param exactTitle The exact title of the page. + * @return The page object for a given title. + * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly parsed. + */ + public Page getPageByExactTitle(String exactTitle) throws WikiApiException { + return new Page(this, exactTitle, true); + } + /** + * Get all pages which match all lowercase/uppercase version of the given title.<br> + * If the title is a redirect, the corresponding page is returned.<br> + * Spaces in the title are converted to underscores, as this is a convention for Wikipedia article titles. + * + * @param title The title of the page. + * @return A set of page objects matching this title. + * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly parsed. + */ + public Set<Page> getPages(String title) throws WikiApiException { + Set<Integer> ids = new HashSet<>(getPageIdsCaseInsensitive(title)); + + Set<Page> pages = new HashSet<>(); + for (Integer id : ids) { + pages.add(new Page(this, id)); + } + return pages; + } - /** - * Gets the discussion page for an article page with the given pageId. - * - * @param articlePageId The id of the page. - * @return The page object for a given pageId. - * @throws WikiApiException Thrown if errors occurred. 
- */ - public Page getDiscussionPage(int articlePageId) throws WikiApiException { - //Retrieve discussion page with article title - //TODO not the prettiest solution, but currently discussions are only marked in the title - return getDiscussionPage(getPage(articlePageId)); - } + /** + * Gets the page for a given pageId. + * + * @param pageId The id of the page. + * @return The page object for a given pageId. + * @throws WikiApiException Thrown if errors occurred. + */ + public Page getPage(int pageId) throws WikiApiException { + return new Page(this, pageId); + } - /** - * Gets the discussion page for the page with the given title. - * The page retrieval works as defined in {@link #getPage(String title)} - * - * @param title The title of the page for which the discussions should be retrieved. - * @return The page object for the discussion page. - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - */ - public Page getDiscussionPage(String title) throws WikiApiException { - return getDiscussionPage(getPage(title)); - } + /** + * Gets the title for a given pageId. + * + * @param pageId The id of the page. + * @return The title for the given pageId. + * @throws WikiApiException Thrown if errors occurred. 
+ */ + public Title getTitle(int pageId) throws WikiApiException { + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select p.name from PageMapLine as p where p.pageId= :pId"; + String returnValue = session.createNativeQuery(sql, String.class) + .setParameter("pId", pageId, StandardBasicTypes.INTEGER) + .uniqueResult(); + session.getTransaction().commit(); - /** - * Gets the discussion page for the given article page - * The provided page must not be a discussion page - * - * @param articlePage the article page for which a discussion page should be retrieved - * @return The discussion page object for the given article page object - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - */ - public Page getDiscussionPage(Page articlePage) throws WikiApiException{ - String articleTitle = articlePage.getTitle().toString(); - if(articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)){ - return articlePage; - }else{ - return new Page(this, WikiConstants.DISCUSSION_PREFIX+articleTitle); - } + if(returnValue == null){ + throw new WikiPageNotFoundException(); } + return new Title(returnValue); + } + /** + * Gets the page ids for a given title. + * + * @param title The title of the page. + * @return The id for the page with the given title. + * @throws WikiApiException Thrown if errors occurred. 
+ */ + public List<Integer> getPageIds(String title) throws WikiApiException { + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select p.pageID from PageMapLine as p where p.name = :pName"; + Iterator<Integer> results = session.createQuery(sql, Integer.class) + .setParameter("pName", title, StandardBasicTypes.STRING) + .list().iterator(); + + session.getTransaction().commit(); + + if(!results.hasNext()){ + throw new WikiPageNotFoundException(); + } + List<Integer> resultList = new LinkedList<>(); + while(results.hasNext()){ + resultList.add(results.next()); + } + return resultList; + } - /** - * Returns an iterable containing all archived discussion pages for - * the page with the given title String. <br> - * The page retrieval works as defined in {@link #getPage(int)}. <br> - * The most recent discussion page is NOT included here! - * It can be obtained with {@link #getDiscussionPage(Page)}. - * - * @param articlePageId The id of the page for which to the the discussion archives - * @return The page object for the discussion page. - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - */ - public Iterable<Page> getDiscussionArchives(int articlePageId) throws WikiApiException { - //Retrieve discussion archive pages with page id - return getDiscussionArchives(getPage(articlePageId)); - } + /** + * Gets the page ids for a given title with case insensitive matching.<br> + * + * @param title The title of the page. + * @return The ids of the pages with the given title. + * @throws WikiApiException Thrown if errors occurred. 
+ */ + public List<Integer> getPageIdsCaseInsensitive(String title) throws WikiApiException { + title = title.toLowerCase(); + title = title.replaceAll(" ", "_"); + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select p.pageID from PageMapLine as p where lower(p.name) = :pName"; + Iterator<Integer> results = session.createQuery(sql, Integer.class) + .setParameter("pName", title, StandardBasicTypes.STRING) + .list().iterator(); + + session.getTransaction().commit(); + + if(!results.hasNext()){ + throw new WikiPageNotFoundException(); + } + List<Integer> resultList = new LinkedList<>(); + while(results.hasNext()){ + resultList.add(results.next()); + } + return resultList; + } - /** - * Returns an iterable containing all archived discussion pages for - * the page with the given title String. <br> - * The page retrieval works as defined in {@link #getPage(String title)}.<br> - * The most recent discussion page is NOT included here! - * It can be obtained with {@link #getDiscussionPage(Page)}. - * - * @param title The title of the page for which the discussions should be retrieved. - * @return The page object for the discussion page. - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - */ - public Iterable<Page> getDiscussionArchives(String title) throws WikiApiException { - //Retrieve discussion archive pages with page title - return getDiscussionArchives(getPage(title)); + /** + * Returns the article page for a given discussion page. + * + * @param discussionPage + * the discussion page object + * @return The page object of the article associated with the discussion. If + * the parameter already was an article, it is returned directly. + * @throws WikiApiException Thrown if errors occurred. 
+ */ + public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiException { + if(discussionPage.isDiscussion()){ + String title = discussionPage.getTitle().getPlainTitle().replaceAll(WikiConstants.DISCUSSION_PREFIX, ""); + + if(title.contains("/")){ + //If we have a discussion archive + //TODO This does not support articles that contain slashes- + //However, the rest of the API cannot cope with that as well, so this should not be any extra trouble + title = title.split("/")[0]; + } + return getPage(title); + }else{ + return discussionPage; } - /** - * Return an iterable containing all archived discussion pages for - * the given article page. The most recent discussion page is not included. - * The most recent discussion page can be obtained with {@link #getDiscussionPage(Page)}. - * <br> - * The provided page Object must not be a discussion page itself! If it is - * a discussion page, is returned unchanged. - * - * @param articlePage the article page for which a discussion archives should be retrieved - * @return An iterable with the discussion archive page objects for the given article page object - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - */ - public Iterable<Page> getDiscussionArchives(Page articlePage) throws WikiApiException{ - String articleTitle = articlePage.getTitle().getWikiStyleTitle(); - if(!articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)){ - articleTitle=WikiConstants.DISCUSSION_PREFIX+articleTitle; - } - - Session session = this.__getHibernateSession(); - session.beginTransaction(); - - List<Page> discussionArchives = new LinkedList<>(); + } - String sql = "SELECT pageID FROM PageMapLine where name like :name"; - Iterator<Integer> results = session.createQuery(sql, Integer.class) - .setParameter("name", articleTitle+"/%", StandardBasicTypes.STRING) - .list().iterator(); + /** + * Gets the discussion page for an article page with the given pageId. 
+ * + * @param articlePageId The id of the page. + * @return The page object for a given pageId. + * @throws WikiApiException Thrown if errors occurred. + */ + public Page getDiscussionPage(int articlePageId) throws WikiApiException { + // Retrieve discussion page with article title + //TODO not the prettiest solution, but currently discussions are only marked in the title + return getDiscussionPage(getPage(articlePageId)); + } - session.getTransaction().commit(); + /** + * Gets the discussion page for the page with the given title. + * The page retrieval works as defined in {@link #getPage(String title)} + * + * @param title The title of the page for which the discussions should be retrieved. + * @return The page object for the discussion page. + * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. + */ + public Page getDiscussionPage(String title) throws WikiApiException { + return getDiscussionPage(getPage(title)); + } - while (results.hasNext()) { - int pageID = results.next(); - discussionArchives.add(getPage(pageID)); - } - return discussionArchives; + /** + * Gets the discussion page for the given article page + * The provided page must not be a discussion page + * + * @param articlePage the article page for which a discussion page should be retrieved + * @return The discussion page object for the given article page object + * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. + */ + public Page getDiscussionPage(Page articlePage) throws WikiApiException{ + String articleTitle = articlePage.getTitle().toString(); + if(articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)){ + return articlePage; + }else{ + return new Page(this, WikiConstants.DISCUSSION_PREFIX+articleTitle); } + } - //// I do not want to make this public at the moment (TZ, March, 2007) - /** - * Gets the pages or redirects with a name similar to the pattern. 
- * Calling this method is quite costly, as similarity is computed for all names. - * @param pPattern The pattern. - * @param pSize The maximum size of the result list. Only the most similar results will be included. - * @return A map of pages with names similar to the pattern and their distance values. Smaller distances are more similar. - * @throws WikiApiException Thrown if errors occurred. - */ - protected Map<Page, Double> getSimilarPages(String pPattern, int pSize) throws WikiApiException { - Title title = new Title(pPattern); - String pattern = title.getWikiStyleTitle(); - - // a mapping of the most similar pages and their similarity values - // It is returned by this method. - Map<Page, Double> pageMap = new HashMap<>(); - - // holds a mapping of the best distance values to page IDs - Map<Integer, Double> distanceMap = new HashMap<>(); - - Session session = this.__getHibernateSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select pml.pageID, pml.name from PageMapLine as pml").list()) { - Object[] row = (Object[]) o; - int pageID = (Integer) row[0]; - String pageName = (String) row[1]; - - // this returns a similarity - if we want to use it, we have to change the semantics the ordering of the results - // double distance = new Levenshtein().getSimilarity(pageName, pPattern); - double distance = new LevenshteinStringDistance().distance(pageName, pattern); - - distanceMap.put(pageID, distance); - - // if there are more than "pSize" entries in the map remove the last one (it has the biggest distance) - if (distanceMap.size() > pSize) { - Set<Entry<Integer, Double>> valueSortedSet = new TreeSet<>(new ValueComparator()); - valueSortedSet.addAll(distanceMap.entrySet()); - Iterator<Entry<Integer, Double>> it = valueSortedSet.iterator(); - // remove the first element - if (it.hasNext()) { - // get the id of this entry and remove it in the distanceMap - distanceMap.remove(it.next().getKey()); - } - } - } - 
session.getTransaction().commit(); - - for (int pageID : distanceMap.keySet()) { - Page page = null; - try { - page = this.getPage(pageID); - } catch (WikiPageNotFoundException e) { - logger.error("Page with pageID " + pageID + " could not be found. Fatal error. Terminating."); - e.printStackTrace(); - System.exit(1); - } - pageMap.put(page, distanceMap.get(pageID)); - } - return pageMap; - } + /** + * Returns an iterable containing all archived discussion pages for + * the page with the given title String. <br> + * The page retrieval works as defined in {@link #getPage(int)}. <br> + * The most recent discussion page is NOT included here! + * It can be obtained with {@link #getDiscussionPage(Page)}. + * + * @param articlePageId The id of the page for which to the the discussion archives + * @return The page object for the discussion page. + * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. + */ + public Iterable<Page> getDiscussionArchives(int articlePageId) throws WikiApiException { + //Retrieve discussion archive pages with page id + return getDiscussionArchives(getPage(articlePageId)); + } - /** - * Gets the category for a given title. - * If the {@link Category} title start with a lowercase letter it converts it to an uppercase letter, as each Wikipedia category title starts with an uppercase letter. - * Spaces in the title are converted to underscores, as this is a convention for Wikipedia category titles. - * <p> - * For example, the (possible) category "Famous steamboats" could be queried with - * - "Famous steamboats" - * - "Famous_steamboats" - * - "famous steamboats" - * - "famous_steamboats" - * @param title The title of the category. - * @return The category object with the given title. - * @throws WikiApiException If no category with the given title exists. 
- */ - public Category getCategory(String title) throws WikiApiException { - Category cat = new Category(this, title); - return cat; - } + /** + * Returns an iterable containing all archived discussion pages for + * the page with the given title String. <br> + * The page retrieval works as defined in {@link #getPage(String title)}.<br> + * The most recent discussion page is NOT included here! + * It can be obtained with {@link #getDiscussionPage(Page)}. + * + * @param title The title of the page for which the discussions should be retrieved. + * @return The page object for the discussion page. + * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. + */ + public Iterable<Page> getDiscussionArchives(String title) throws WikiApiException { + //Retrieve discussion archive pages with page title + return getDiscussionArchives(getPage(title)); + } - /** - * Gets the category for a given pageId. - * @param pageId The id of the {@link Category}. - * @return The category object or null if no category with this pageId exists. - */ - public Category getCategory(int pageId) { - long hibernateId = __getCategoryHibernateId(pageId); - if (hibernateId == -1) { - return null; - } + /** + * Return an iterable containing all archived discussion pages for + * the given article page. The most recent discussion page is not included. + * The most recent discussion page can be obtained with {@link #getDiscussionPage(Page)}. + * <br> + * The provided page Object must not be a discussion page itself! If it is + * a discussion page, is returned unchanged. + * + * @param articlePage the article page for which a discussion archives should be retrieved + * @return An iterable with the discussion archive page objects for the given article page object + * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. 
+ */ + public Iterable<Page> getDiscussionArchives(Page articlePage) throws WikiApiException { + String articleTitle = articlePage.getTitle().getWikiStyleTitle(); + if(!articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)){ + articleTitle=WikiConstants.DISCUSSION_PREFIX+articleTitle; + } + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + + List<Page> discussionArchives = new LinkedList<>(); + + String sql = "SELECT pageID FROM PageMapLine where name like :name"; + Iterator<Integer> results = session.createQuery(sql, Integer.class) + .setParameter("name", articleTitle+"/%", StandardBasicTypes.STRING) + .list().iterator(); + + session.getTransaction().commit(); + + while (results.hasNext()) { + int pageID = results.next(); + discussionArchives.add(getPage(pageID)); + } + return discussionArchives; + } - try { - Category cat = new Category(this, hibernateId); - return cat; - } catch (WikiPageNotFoundException e) { - return null; + /** + * Gets the pages or redirects with a name similar to the pattern. + * Calling this method is quite costly, as similarity is computed for all names. + * @param pPattern The pattern. + * @param pSize The maximum size of the result list. Only the most similar results will be included. + * @return A map of pages with names similar to the pattern and their distance values. Smaller distances are more similar. + * @throws WikiApiException Thrown if errors occurred. + */ + //// I do not want to make this public at the moment (TZ, March, 2007) + protected Map<Page, Double> getSimilarPages(String pPattern, int pSize) throws WikiApiException { + Title title = new Title(pPattern); + String pattern = title.getWikiStyleTitle(); + + // a mapping of the most similar pages and their similarity values + // It is returned by this method. 
+ Map<Page, Double> pageMap = new HashMap<>(); + + // holds a mapping of the best distance values to page IDs + Map<Integer, Double> distanceMap = new HashMap<>(); + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + for (Object o : session.createQuery("select pml.pageID, pml.name from PageMapLine as pml").list()) { + Object[] row = (Object[]) o; + int pageID = (Integer) row[0]; + String pageName = (String) row[1]; + + // this returns a similarity - if we want to use it, we have to change the semantics the ordering of the results + // double distance = new Levenshtein().getSimilarity(pageName, pPattern); + double distance = new LevenshteinStringDistance().distance(pageName, pattern); + + distanceMap.put(pageID, distance); + + // if there are more than "pSize" entries in the map remove the last one (it has the biggest distance) + if (distanceMap.size() > pSize) { + Set<Entry<Integer, Double>> valueSortedSet = new TreeSet<>(new ValueComparator()); + valueSortedSet.addAll(distanceMap.entrySet()); + Iterator<Entry<Integer, Double>> it = valueSortedSet.iterator(); + // remove the first element + if (it.hasNext()) { + // get the id of this entry and remove it in the distanceMap + distanceMap.remove(it.next().getKey()); } + } } + session.getTransaction().commit(); - /** - * This returns an iterable over all {@link Category categories}, as returning all category objects would be much too expensive. - * @return An iterable over all categories. - */ - public Iterable<Category> getCategories() { - return new CategoryIterable(this); + for (int pageID : distanceMap.keySet()) { + Page page = null; + try { + page = this.getPage(pageID); + } catch (WikiPageNotFoundException e) { + logger.error("Page with pageID " + pageID + " could not be found. Fatal error. 
Terminating."); + e.printStackTrace(); + System.exit(1); + } + pageMap.put(page, distanceMap.get(pageID)); } + return pageMap; + } - /** - * Gets the {@link Category categories} for a given {@link Page} identified by its {@code pageTitle}. - * @param pageTitle The title of a {@link Page}, not a category. - * @return The category objects which are associated with the given {@code pageTitle}. - * @throws WikiPageNotFoundException Thrown if no {@link Page} exists for the given {@code pageTitle}. - */ - public Set<Category> getCategories(String pageTitle) throws WikiPageNotFoundException - { - if (pageTitle == null || pageTitle.length() == 0) { - throw new WikiPageNotFoundException(); - } + /** + * Gets the category for a given title. + * If the {@link Category} title start with a lowercase letter it converts it to an uppercase letter, + * as each Wikipedia category title starts with an uppercase letter. Spaces in the title are converted to + * underscores, as this is a convention for Wikipedia category titles. + * <p> + * For example, the (possible) category "Famous steamboats" could be queried with + * - "Famous steamboats" + * - "Famous_steamboats" + * - "famous steamboats" + * - "famous_steamboats" + * @param title The title of the category. + * @return The category object with the given title. + * @throws WikiApiException If no category with the given title exists. 
+ */ + public Category getCategory(String title) throws WikiApiException { + return new Category(this, title); + } - Session session = this.__getHibernateSession(); - session.beginTransaction(); - List<Integer> categoryHibernateIds = session.createQuery( - "select c from Page p left join p.categories c where p.name = :pageTitle", Integer.class) - .setParameter("pageTitle", pageTitle).list(); - session.getTransaction().commit(); - - Set<Category> categorySet = new HashSet<>(categoryHibernateIds.size()); - for (int hibernateId : categoryHibernateIds) { - try { - categorySet.add(new Category(this, hibernateId)); - } catch (WikiPageNotFoundException e) { - logger.warn("Could not load Category by it's HibernateId = '"+hibernateId+"'"); - } - } - return categorySet; + /** + * Gets the category for a given pageId. + * @param pageId The id of the {@link Category}. + * @return The category object or null if no category with this pageId exists. + */ + public Category getCategory(int pageId) { + long hibernateId = __getCategoryHibernateId(pageId); + if (hibernateId == -1) { + return null; } - /** - * Get all wikipedia {@link Category categories}. - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @param bufferSize The size of the internal page buffer. - * @return An iterable over all categories. - */ - protected Iterable<Category> getCategories(int bufferSize) { - return new CategoryIterable(this, bufferSize); + try { + return new Category(this, hibernateId); + } catch (WikiPageNotFoundException e) { + return null; } + } + /** + * This returns an iterable over all {@link Category categories}, as returning all category objects would be much too expensive. + * @return An iterable over all categories. + */ + public Iterable<Category> getCategories() { + return new CategoryIterable(this); + } - /** - * Protected method that is much faster than the public version, but exposes too much implementation details. 
- * Get a set with all category pageIDs. Returning all category objects is much too expensive. - * @return A set with all category pageIDs - */ - protected Set<Integer> __getCategories() { + /** + * Gets the {@link Category categories} for a given {@link Page} identified by its {@code pageTitle}. + * @param pageTitle The title of a {@link Page}, not a category. + * @return The category objects which are associated with the given {@code pageTitle}. + * @throws WikiPageNotFoundException Thrown if no {@link Page} exists for the given {@code pageTitle}. + */ + public Set<Category> getCategories(String pageTitle) throws WikiPageNotFoundException { + if (pageTitle == null || pageTitle.length() == 0) { + throw new WikiPageNotFoundException(); + } + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select c from Page p left join p.categories c where p.name = :pageTitle"; + List<Integer> categoryHibernateIds = session.createQuery(sql, Integer.class) + .setParameter("pageTitle", pageTitle).list(); + session.getTransaction().commit(); + + Set<Category> categorySet = new HashSet<>(categoryHibernateIds.size()); + for (int hibernateId : categoryHibernateIds) { + try { + categorySet.add(new Category(this, hibernateId)); + } catch (WikiPageNotFoundException e) { + logger.warn("Could not load Category by it's HibernateId = '"+hibernateId+"'"); + } + } + return categorySet; + } - // TODO this should be replaced with the buffered category iterator, as it might produce an HeapSpace Overflow, if there are too many categories. - Session session = this.__getHibernateSession(); - session.beginTransaction(); - List<Integer> idList = session.createQuery( - "select cat.pageId from Category as cat", Integer.class).list(); - Set<Integer> categorySet = new HashSet<>(idList); - session.getTransaction().commit(); + /** + * Get all wikipedia {@link Category categories}. 
+ * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. + * @param bufferSize The size of the internal page buffer. + * @return An iterable over all categories. + */ + protected Iterable<Category> getCategories(int bufferSize) { + return new CategoryIterable(this, bufferSize); + } - return categorySet; - } - /** - * Get all wikipedia pages. - * Does not include redirects, as they are only pointers to real pages. - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @return An iterable over all pages. - */ - public Iterable<Page> getPages() { - return new PageIterable(this, false); - } + /** + * Protected method that is much faster than the public version, but exposes too much implementation details. + * Get a set with all category pageIDs. Returning all category objects is much too expensive. + * @return A set with all category pageIDs + */ + // TODO this should be replaced with the buffered category iterator, as it might produce an HeapSpace Overflow, if there are too many categories. + protected Set<Integer> __getCategories() { + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select cat.pageId from Category as cat"; + List<Integer> idList = session.createQuery(sql, Integer.class).list(); + session.getTransaction().commit(); + + return new HashSet<>(idList); + } - /** - * Get all wikipedia pages. - * Does not include redirects, as they are only pointers to real pages. - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @param bufferSize The size of the internal page buffer. - * @return An iterable over all pages. - */ - protected Iterable<Page> getPages(int bufferSize) { - return new PageIterable(this, false, bufferSize); - } + /** + * Get all wikipedia pages. + * Does not include redirects, as they are only pointers to real pages. 
+ * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. + * @return An iterable over all pages. + */ + public Iterable<Page> getPages() { + return new PageIterable(this, false); + } - /** - * Protected method that is much faster than the public version, but exposes too much implementation details. - * Get a set with all {@code pageIDs}. Returning all page objects is much too expensive. - * Does not include redirects, as they are only pointers to real pages. - * <p> - * As ids can be useful for several application (e.g. in combination with - * the RevisionMachine, they have been made publicly available via - * {@link #getPageIds()}. - * - * @return A set with all {@code pageIDs}. Returning all pages is much to expensive. - */ - protected Set<Integer> __getPages() { - Session session = this.__getHibernateSession(); - session.beginTransaction(); - List<Integer> idList = session.createQuery( - "select page.pageId from Page as page", Integer.class).list(); - Set<Integer> pageSet = new HashSet<>(idList); - session.getTransaction().commit(); - - return pageSet; - } + /** + * Get all wikipedia pages. + * Does not include redirects, as they are only pointers to real pages. + * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. + * @param bufferSize The size of the internal page buffer. + * @return An iterable over all pages. + */ + protected Iterable<Page> getPages(int bufferSize) { + return new PageIterable(this, false, bufferSize); + } - /** - * @return an iterable over all {@code pageIDs} (without redirects) - */ - public Iterable<Integer> getPageIds(){ - return this.__getPages(); - } + /** + * Protected method that is much faster than the public version, but exposes too much implementation details. + * Get a set with all {@code pageIDs}. Returning all page objects is much too expensive. + * Does not include redirects, as they are only pointers to real pages. 
+ * <p> + * As ids can be useful for several application (e.g. in combination with + * the RevisionMachine, they have been made publicly available via + * {@link #getPageIds()}. + * + * @return A set with all {@code pageIDs}. Returning all pages is much to expensive. + */ + protected Set<Integer> __getPages() { + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select page.pageId from Page as page"; + List<Integer> idList = session.createQuery(sql, Integer.class).list(); + session.getTransaction().commit(); + + return new HashSet<>(idList); + } - /** - * Get the pages that match the given query. - * Does not include redirects, as they are only pointers to real pages. - * Attention: may be running very slow, depending on the size of the Wikipedia! - * @param query A query object containing the query conditions. - * @return A set of pages that match the given query. - * @throws WikiApiException Thrown if errors occurred. - */ - public Iterable<Page> getPages(PageQuery query) throws WikiApiException { - return new PageQueryIterable(this, query); - } + /** + * @return an iterable over all {@code pageIDs} (without redirects) + */ + public Iterable<Integer> getPageIds(){ + return this.__getPages(); + } + /** + * Get the pages that match the given query. + * Does not include redirects, as they are only pointers to real pages. + * Attention: may be running very slow, depending on the size of the Wikipedia! + * @param query A query object containing the query conditions. + * @return A set of pages that match the given query. + * @throws WikiApiException Thrown if errors occurred. + */ + public Iterable<Page> getPages(PageQuery query) throws WikiApiException { + return new PageQueryIterable(this, query); + } - /** - * Get all articles (pages MINUS disambiguationPages MINUS redirects). - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @return An iterable of all article pages. 
- */ - public Iterable<Page> getArticles() { - return new PageIterable(this, true); - } - /** - * Get all titles including disambiguation pages and redirects). - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @return An iterable of all article pages. - */ - public Iterable<Title> getTitles() { - return new TitleIterable(this); - } + /** + * Get all articles (pages MINUS disambiguationPages MINUS redirects). + * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. + * @return An iterable of all article pages. + */ + public Iterable<Page> getArticles() { + return new PageIterable(this, true); + } - /** - * @return The {@link Language} of this Wikipedia. - */ - public Language getLanguage() { - return this.language; - } + /** + * Get all titles including disambiguation pages and redirects). + * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. + * @return An iterable of all article pages. + */ + public Iterable<Title> getTitles() { + return new TitleIterable(this); + } - /** - * Tests, whether a page or redirect with the given title exists. - * Trying to retrieve a page that does not exist in Wikipedia throws an exception. - * You may catch the exception or use this test, depending on your task. - * @param title The title of the page. - * @return {@code True}, if a page or redirect with that title exits, {@code false} otherwise. - */ - public boolean existsPage(String title) { - - if (title == null || title.isEmpty()) { - return false; - } - Title t; - try { - t = new Title(title); - } catch (WikiTitleParsingException e) { - return false; - } - String encodedTitle = t.getWikiStyleTitle(); + /** + * @return The {@link Language} of this Wikipedia. 
+ */ + public Language getLanguage() { + return this.language; + } - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String query = "select p.id from PageMapLine as p where p.name = :pName"; - if(dbConfig.supportsCollation()) { - query += SQL_COLLATION; - } - Object returnValue = session.createNativeQuery(query) + /** + * Tests, whether a page or redirect with the given title exists. + * Trying to retrieve a page that does not exist in Wikipedia throws an exception. + * You may catch the exception or use this test, depending on your task. + * @param title The title of the page. + * @return {@code True}, if a page or redirect with that title exits, {@code false} otherwise. + */ + public boolean existsPage(String title) { + + if (title == null || title.isEmpty()) { + return false; + } + Title t; + try { + t = new Title(title); + } catch (WikiTitleParsingException e) { + return false; + } + String encodedTitle = t.getWikiStyleTitle(); + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String query = "select p.id from PageMapLine as p where p.name = :pName"; + if(dbConfig.supportsCollation()) { + query += SQL_COLLATION; + } + Object returnValue = session.createNativeQuery(query) .setParameter("pName", encodedTitle, StandardBasicTypes.STRING) .uniqueResult(); - session.getTransaction().commit(); + session.getTransaction().commit(); - return returnValue != null; - } - - /** - * Tests, whether a page with the given pageID exists. - * Trying to retrieve a pageID that does not exist in Wikipedia throws an exception. - * - * @param pageID A pageID. - * @return {@code True}, if a page with that pageID exits, {@code false} otherwise. - */ - public boolean existsPage(int pageID) { - - // This is a hack to provide a much quicker way to test whether a page exists. - // Encoding the title in this way surpasses the normal way of creating a title first. 
- // Anyway, I do not like this hack :-| - - if (pageID < 0) { - return false; - } + return returnValue != null; + } - Session session = this.__getHibernateSession(); - session.beginTransaction(); - List returnList = session.createNativeQuery( - "select p.id from PageMapLine as p where p.pageID = :pageId") + /** + * Tests, whether a page with the given pageID exists. + * Trying to retrieve a pageID that does not exist in Wikipedia throws an exception. + * + * @param pageID A pageID. + * @return {@code True}, if a page with that pageID exits, {@code false} otherwise. + */ + public boolean existsPage(int pageID) { + + // This is a hack to provide a much quicker way to test whether a page exists. + // Encoding the title in this way surpasses the normal way of creating a title first. + // Anyway, I do not like this hack :-| + if (pageID < 0) { + return false; + } + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select p.id from PageMapLine as p where p.pageID = :pageId"; + Long returnValue = session.createNativeQuery(sql, Long.class) .setParameter("pageId", pageID, StandardBasicTypes.INTEGER) - .list(); - session.getTransaction().commit(); - - return returnList.size() != 0; - } - - /** - * Get the hibernate ID to a given pageID of a page. - * We need different methods for pages and categories here, as a page and a category can have the same ID. - * - * @param pageID A pageID that should be mapped to the corresponding hibernate ID. - * @return The hibernateID of the page with pageID or -1, if the pageID is not valid - */ - protected long __getPageHibernateId(int pageID) { - long hibernateID = -1; - - // first look in the id mapping cache - if (idMapPages.containsKey(pageID)) { - return idMapPages.get(pageID); - } + .uniqueResult(); + session.getTransaction().commit(); - // The id was not found in the id mapping cache. - // It may not be in the cahe or may not exist at all. 
- Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select page.id from Page as page where page.pageId = :pageId"; - Long retObjectPage = session.createQuery(sql, Long.class) - .setParameter("pageId", pageID, StandardBasicTypes.INTEGER) - .uniqueResult(); - session.getTransaction().commit(); - if (retObjectPage != null) { - hibernateID = retObjectPage; - // add it to the cache - idMapPages.put(pageID, hibernateID); - return hibernateID; - } + return returnValue != null; + } - return hibernateID; + /** + * Get the hibernate ID to a given pageID of a page. + * We need different methods for pages and categories here, as a page and a category can have the same ID. + * + * @param pageID A pageID that should be mapped to the corresponding hibernate ID. + * @return The hibernateID of the page with pageID or -1, if the pageID is not valid + */ + protected long __getPageHibernateId(int pageID) { + long hibernateID = -1; + + // first look in the id mapping cache + if (idMapPages.containsKey(pageID)) { + return idMapPages.get(pageID); + } + + // The id was not found in the id mapping cache. + // It may not be in the cahe or may not exist at all. + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select page.id from Page as page where page.pageId = :pageId"; + Long retObjectPage = session.createQuery(sql, Long.class) + .setParameter("pageId", pageID, StandardBasicTypes.INTEGER) + .uniqueResult(); + session.getTransaction().commit(); + if (retObjectPage != null) { + hibernateID = retObjectPage; + // add it to the cache + idMapPages.put(pageID, hibernateID); + return hibernateID; } - /** - * Get the hibernate ID to a given pageID of a category. - * We need different methods for pages and categories here, as a page and a category can have the same ID. - * - * @param pageID A pageID that should be mapped to the corresponding hibernate ID. 
- * @return The hibernateID of the page with pageID or -1, if the pageID is not valid - */ - protected long __getCategoryHibernateId(int pageID) { - long hibernateID = -1; - - // first look in the id mapping cache - if (idMapCategories.containsKey(pageID)) { - return idMapCategories.get(pageID); - } - - // The id was not found in the id mapping cache. - // It may not be in the cahe or may not exist at all. - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select cat.id from Category as cat where cat.pageId = :pageId"; - Long retObjectPage = session.createQuery(sql, Long.class) - .setParameter("pageId", pageID, StandardBasicTypes.INTEGER) - .uniqueResult(); - session.getTransaction().commit(); - if (retObjectPage != null) { - hibernateID = retObjectPage; - // add it to the cache - idMapCategories.put(pageID, hibernateID); - } + return hibernateID; + } - return hibernateID; + /** + * Get the hibernate ID to a given pageID of a category. + * We need different methods for pages and categories here, as a page and a category can have the same ID. + * + * @param pageID A pageID that should be mapped to the corresponding hibernate ID. + * @return The hibernateID of the page with pageID or -1, if the pageID is not valid + */ + protected long __getCategoryHibernateId(int pageID) { + long hibernateID = -1; + + // first look in the id mapping cache + if (idMapCategories.containsKey(pageID)) { + return idMapCategories.get(pageID); + } + + // The id was not found in the id mapping cache. + // It may not be in the cahe or may not exist at all. 
+ Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select cat.id from Category as cat where cat.pageId = :pageId"; + Long retObjectPage = session.createQuery(sql, Long.class) + .setParameter("pageId", pageID, StandardBasicTypes.INTEGER) + .uniqueResult(); + session.getTransaction().commit(); + if (retObjectPage != null) { + hibernateID = retObjectPage; + // add it to the cache + idMapCategories.put(pageID, hibernateID); } - /** - * @return A {@link MetaData} object containing all meta data about this instance of Wikipedia. - */ - public MetaData getMetaData() { - return this.metaData; - } + return hibernateID; + } - /** - * @return The {@link DatabaseConfiguration} object that was used to create the Wikipedia object. - */ - public DatabaseConfiguration getDatabaseConfiguration() { - return this.dbConfig; - } + /** + * @return A {@link MetaData} object containing all meta data about this instance of Wikipedia. + */ + public MetaData getMetaData() { + return this.metaData; + } - /** - * @return Shortcut for getting a hibernate session. - */ - protected Session __getHibernateSession() { - return WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); - } + /** + * @return The {@link DatabaseConfiguration} object that was used to create the Wikipedia object. + */ + public DatabaseConfiguration getDatabaseConfiguration() { + return this.dbConfig; + } - /** - * The ID consists of the host, the database, and the language. - * This should be unique in most cases. - * @return Returns a unique ID for this Wikipedia object. - */ - public String getWikipediaId() { - StringBuilder sb = new StringBuilder(); - sb.append(this.getDatabaseConfiguration().getHost()); - sb.append("_"); - sb.append(this.getDatabaseConfiguration().getDatabase()); - sb.append("_"); - sb.append(this.getDatabaseConfiguration().getLanguage()); - return sb.toString(); - } + /** + * @return Shortcut for getting a hibernate session. 
+ */ + protected Session __getHibernateSession() { + return WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); + } + + /** + * The ID consists of the host, the database, and the language. + * This should be unique in most cases. + * @return Returns a unique ID for this Wikipedia object. + */ + public String getWikipediaId() { + StringBuilder sb = new StringBuilder(); + sb.append(this.getDatabaseConfiguration().getHost()); + sb.append("_"); + sb.append(this.getDatabaseConfiguration().getDatabase()); + sb.append("_"); + sb.append(this.getDatabaseConfiguration().getLanguage()); + return sb.toString(); + } } -class ValueComparator implements Comparator<Map.Entry<Integer,Double>> { +class ValueComparator implements Comparator<Entry<Integer,Double>> { @Override - public int compare(Entry<Integer, Double> e1, Entry<Integer, Double> e2) { - return Double.compare(e2.getValue(), e1.getValue()); + public int compare(Entry<Integer, Double> e1, Entry<Integer, Double> e2) { + return Double.compare(e2.getValue(), e1.getValue()); } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java index 07274598..099be59d 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,381 +34,390 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -/** Holds numerous information on a given subset (that may also be +/** + * Holds numerous information on a given subset (that may also be * the whole Wikipedia) of Wikipedia nodes. */ public class WikipediaInfo { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private Iterable<Page> pages; - private double averageFanOut; + private Iterable<Page> pages; + private double averageFanOut; - private int numberOfPages; + private int numberOfPages; - private Map<Integer,Integer> degreeDistribution; - private Set<Integer> categorizedArticleSet; + private Map<Integer, Integer> degreeDistribution; + private Set<Integer> categorizedArticleSet; - private Wikipedia wiki; + private Wikipedia wiki; - /** - * Get infos for the whole wikipedia. - * @param pWiki The wiki object. - */ - public WikipediaInfo(Wikipedia pWiki) throws WikiApiException { - this.wiki = pWiki; - new WikipediaInfo(this.wiki.getPages()); - - } + /** + * Get infos for the whole wikipedia. + * + * @param pWiki The wiki object. + */ + public WikipediaInfo(Wikipedia pWiki) throws WikiApiException { + this.wiki = pWiki; + new WikipediaInfo(this.wiki.getPages()); + } - /** - * Get infos only for a subset of articles. - * @param pPages A set of pages. Only this subset of wiki pages is used in the info object. 
- */ - public WikipediaInfo(Iterable<Page> pPages) throws WikiApiException { - if (pPages == null) { - throw new WikiApiException("The page set has to be initialized."); - } - pages = pPages; - averageFanOut = -1.0; // lazy initialization => it is computed and stored when it is accessed + /** + * Get infos only for a subset of articles. + * + * @param pPages A set of pages. Only this subset of wiki pages is used in the info object. + */ + public WikipediaInfo(Iterable<Page> pPages) throws WikiApiException { + if (pPages == null) { + throw new WikiApiException("The page set has to be initialized."); + } - degreeDistribution = new HashMap<>(); - categorizedArticleSet = new HashSet<>(); + pages = pPages; + averageFanOut = -1.0; // lazy initialization => it is computed and stored when it is accessed - // get number of pages - numberOfPages = 0; - while (pages.iterator().hasNext()) { - numberOfPages++; - pages.iterator().next(); - } + degreeDistribution = new HashMap<>(); + categorizedArticleSet = new HashSet<>(); + // get number of pages + numberOfPages = 0; + while (pages.iterator().hasNext()) { + numberOfPages++; + pages.iterator().next(); } + } - /** - * Computes the average fan out of the page set. - * Fan out is the number of outgoing links per page. - * @param pages The pages in an iterable form. - * @return The average fan out. - */ - private double computeAverageFanOut(Iterable<Page> pages) { - Set<Integer> pageIDs = new HashSet<>(); - while (pages.iterator().hasNext()) { - pageIDs.add(pages.iterator().next().getPageId()); - } + /** + * Computes the average fan out of the page set. + * Fan out is the number of outgoing links per page. + * + * @param pages The pages in an iterable form. + * @return The average fan out. 
+ */ + private double computeAverageFanOut(Iterable<Page> pages) { - if (pageIDs.isEmpty()) { - logger.warn("Cannot compute average fan-out of an empty page set."); - return 0.0; - } + Set<Integer> pageIDs = new HashSet<>(); + while (pages.iterator().hasNext()) { + pageIDs.add(pages.iterator().next().getPageId()); + } - int fanOutCounter = 0; + if (pageIDs.isEmpty()) { + logger.warn("Cannot compute average fan-out of an empty page set."); + return 0.0; + } - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select page.outLinks, page.pageId from Page as page").list()) { - Object[] row = (Object[]) o; - Set outLinks = (Set) row[0]; - Integer pageId = (Integer) row[1]; + int fanOutCounter = 0; - // if the current page ID is in the desired result set => add outlink value - if (pageIDs.contains(pageId)) { - fanOutCounter += outLinks.size(); - } - } - session.getTransaction().commit(); + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + for (Object o : session.createQuery("select page.outLinks, page.pageId from Page as page").list()) { + Object[] row = (Object[]) o; + Set outLinks = (Set) row[0]; + Integer pageId = (Integer) row[1]; - return (double) fanOutCounter / this.getNumberOfPages(); + // if the current page ID is in the desired result set => add outlink value + if (pageIDs.contains(pageId)) { + fanOutCounter += outLinks.size(); + } } + session.getTransaction().commit(); - /** - * @return Returns the averageFanOut. - */ - public double getAverageFanOut() { - if (averageFanOut < 0) { // not yet initialized - averageFanOut = computeAverageFanOut(this.pages); - } + return (double) fanOutCounter / this.getNumberOfPages(); + } - return averageFanOut; + /** + * @return Returns the averageFanOut. 
+ */ + public double getAverageFanOut() { + if (averageFanOut < 0) { // not yet initialized + averageFanOut = computeAverageFanOut(this.pages); } - /** - * @return Returns the numberOfPages. - */ - public int getNumberOfPages() { - return numberOfPages; + return averageFanOut; + } + + /** + * @return Returns the numberOfPages. + */ + public int getNumberOfPages() { + return numberOfPages; + } + + /** + * Building a mapping from categories to article sets. + * + * @param pWiki The wikipedia object. + * @param pNodes The category nodes that should be used to build the map. + * @return A mapping from categories to article sets. + * @throws WikiPageNotFoundException + */ + private Map<Integer, Set<Integer>> getCategoryArticleMap(Wikipedia pWiki, Set<Integer> pNodes) throws WikiPageNotFoundException { + Map<Integer, Set<Integer>> categoryArticleMap = new HashMap<>(); + + int progress = 0; + for (int node : pNodes) { + progress++; + ApiUtilities.printProgressInfo(progress, pNodes.size(), 10, ApiUtilities.ProgressInfoMode.TEXT, "Getting category-article map."); + + Category cat = pWiki.getCategory(node); + if (cat != null) { + Set<Integer> pages = new HashSet<>(cat.__getPages()); + categoryArticleMap.put(node, pages); + } else { + logger.info("{} is not a category.", node); + } } - /** - * Building a mapping from categories to article sets. - * @param pWiki The wikipedia object. - * @param pNodes The category nodes that should be used to build the map. - * @return A mapping from categories to article sets. 
- * @throws WikiPageNotFoundException - */ - private Map<Integer,Set<Integer>> getCategoryArticleMap(Wikipedia pWiki, Set<Integer> pNodes) throws WikiPageNotFoundException { - Map<Integer,Set<Integer>> categoryArticleMap = new HashMap<>(); - - int progress = 0; - for (int node : pNodes) { - progress++; - ApiUtilities.printProgressInfo(progress, pNodes.size(), 10, ApiUtilities.ProgressInfoMode.TEXT, "Getting category-article map."); - - Category cat = pWiki.getCategory(node); - if (cat != null) { - Set<Integer> pages = new HashSet<>(cat.__getPages()); - categoryArticleMap.put(node, pages); - } - else { - logger.info("{} is not a category.", node); + return categoryArticleMap; + } + + /** + * Get various graph parameters like diameter, average out-degree etc of the categroy graph. + * + * @param catGraph The category graph. + */ + public void getGraphParameters(CategoryGraph catGraph) { + double startTime = System.currentTimeMillis(); + logger.error(catGraph.getGraphInfo()); + double endTime = (System.currentTimeMillis() - startTime) / 1000.0; + logger.error(endTime + "s"); + } + + /** + * Articles in wikipedia may be tagged with multiple categories. + * It may be interesting to know how many articles have at least one category in common. + * Such articles would have a very high semantic relatedness even if they share a quite secondary category. + * + * @param pWiki The wikipedia object. + * @param catGraph The category graph. 
+ * @throws WikiApiException + */ + public void getOverlapping(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException { + double startTime = System.currentTimeMillis(); + + int articlesWithOverlappingCategories = getArticlesWithOverlappingCategories(pWiki, catGraph); + double overlappingCategoriesRatio = (double) articlesWithOverlappingCategories / (double) pWiki.getMetaData().getNumberOfPages(); + logger.info(articlesWithOverlappingCategories + " - " + pWiki.getMetaData().getNumberOfPages() + " - " + overlappingCategoriesRatio); + + double endTime = (System.currentTimeMillis() - startTime) / 1000.0; + logger.debug("{} ms", endTime); + } + + + /** + * Articles in wikipedia may be tagged with multiple categories. + * It may be interesting to know how many articles have at least one category in common. + * Such articles would have a very high semantic relatedness even if they share a quite secondary category. + * + * @param pWiki The wikipedia object. + * @param pGraph The category graph. + * @return The number of articles that have at least one category in common. 
+ * @throws WikiPageNotFoundException + */ + private int getArticlesWithOverlappingCategories(Wikipedia pWiki, CategoryGraph pGraph) throws WikiPageNotFoundException { + Set<Integer> overlappingArticles = new HashSet<>(); + + // iterate over all node pairs + Set<Integer> nodes = pGraph.getGraph().vertexSet(); + + Map<Integer, Set<Integer>> categoryArticleMap = getCategoryArticleMap(pWiki, nodes); + + // sort the Array so we can use a simple iteration with two for loops to access all pairs + Object[] nodeArray = nodes.toArray(); + Arrays.sort(nodeArray); + + int progress = 0; + for (int i = 0; i < nodes.size(); i++) { + progress++; + ApiUtilities.printProgressInfo(progress, nodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, ""); + + int outerNode = (Integer) nodeArray[i]; + + for (int j = i + 1; j < nodes.size(); j++) { + int innerNode = (Integer) nodeArray[j]; + + // test whether the categories have pages in common + Set<Integer> outerPages = categoryArticleMap.get(outerNode); + Set<Integer> innerPages = categoryArticleMap.get(innerNode); + + for (int outerPage : outerPages) { + if (innerPages.contains(outerPage)) { + if (!overlappingArticles.contains(outerPage)) { + overlappingArticles.add(outerPage); } + } } - return categoryArticleMap; - } - - /** - * Get various graph parameters like diameter, average out-degree etc of the categroy graph. - * @param catGraph The category graph. - */ - public void getGraphParameters(CategoryGraph catGraph) { - double startTime = System.currentTimeMillis(); - logger.error(catGraph.getGraphInfo()); - double endTime = (System.currentTimeMillis() - startTime) / 1000.0; - logger.error(endTime + "s"); + } } - /** - * Articles in wikipedia may be tagged with multiple categories. - * It may be interesting to know how many articles have at least one category in common. - * Such articles would have a very high semantic relatedness even if they share a quite secondary category. - * @param pWiki The wikipedia object. 
- * @param catGraph The category graph. - * @throws WikiApiException - */ - public void getOverlapping(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException { - double startTime = System.currentTimeMillis(); - - int articlesWithOverlappingCategories = getArticlesWithOverlappingCategories(pWiki, catGraph); - double overlappingCategoriesRatio = (double) articlesWithOverlappingCategories / (double) pWiki.getMetaData().getNumberOfPages(); - logger.info(articlesWithOverlappingCategories + " - " + pWiki.getMetaData().getNumberOfPages() + " - " + overlappingCategoriesRatio); - - double endTime = (System.currentTimeMillis() - startTime) / 1000.0; - logger.debug("{} ms", endTime); - } - - - /** - * Articles in wikipedia may be tagged with multiple categories. - * It may be interesting to know how many articles have at least one category in common. - * Such articles would have a very high semantic relatedness even if they share a quite secondary category. - * @param pWiki The wikipedia object. - * @param pGraph The category graph. - * @return The number of articles that have at least one category in common. 
- * @throws WikiPageNotFoundException - */ - private int getArticlesWithOverlappingCategories(Wikipedia pWiki, CategoryGraph pGraph) throws WikiPageNotFoundException { - Set<Integer> overlappingArticles = new HashSet<>(); + return overlappingArticles.size(); + } - // iterate over all node pairs - Set<Integer> nodes = pGraph.getGraph().vertexSet(); + public void getCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException { + double startTime = System.currentTimeMillis(); - Map<Integer,Set<Integer>> categoryArticleMap = getCategoryArticleMap(pWiki, nodes); + int numberOfCategorizedArticles = getNumberOfCategorizedArticles(pWiki, catGraph); + double categorizedArticlesRatio = (double) numberOfCategorizedArticles / (double) pWiki.getMetaData().getNumberOfPages(); - // sort the Array so we can use a simple iteration with two for loops to access all pairs - Object[] nodeArray = nodes.toArray(); - Arrays.sort(nodeArray); + logger.info("Categorized articles: {}", numberOfCategorizedArticles); + logger.info("All articles: {}", pWiki.getMetaData().getNumberOfPages()); + logger.info("Ratio: {}", categorizedArticlesRatio); - int progress = 0; - for (int i=0; i<nodes.size(); i++) { - progress++; - ApiUtilities.printProgressInfo(progress, nodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, ""); + double endTime = (System.currentTimeMillis() - startTime) / 1000.0; + logger.debug("{}ms", endTime); + } - int outerNode = (Integer) nodeArray[i]; + public double getAveragePathLengthFromRoot(Wikipedia pWiki, CategoryGraph connectedCatGraph) throws WikiApiException { + // get root node + Category rootCategory = pWiki.getMetaData().getMainCategory(); + int root = rootCategory.getPageId(); - for (int j=i+1; j<nodes.size(); j++) { - int innerNode = (Integer) nodeArray[j]; + int pathLengthSum = computeShortestPathLenghts(root, connectedCatGraph); - // test whether the categories have pages in common - Set<Integer> outerPages = 
categoryArticleMap.get(outerNode); - Set<Integer> innerPages = categoryArticleMap.get(innerNode); + return (double) pathLengthSum / (connectedCatGraph.getGraph().vertexSet().size() - 1); + } - for (int outerPage : outerPages) { - if (innerPages.contains(outerPage)) { - if (!overlappingArticles.contains(outerPage)) { - overlappingArticles.add(outerPage); - } - } - } - } - } - - return overlappingArticles.size(); - } - - public void getCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException { - double startTime = System.currentTimeMillis(); - - int numberOfCategorizedArticles = getNumberOfCategorizedArticles(pWiki, catGraph); - double categorizedArticlesRatio = (double) numberOfCategorizedArticles / (double) pWiki.getMetaData().getNumberOfPages(); - - logger.info("Categorized articles: {}", numberOfCategorizedArticles); - logger.info("All articles: {}", pWiki.getMetaData().getNumberOfPages()); - logger.info("Ratio: {}", categorizedArticlesRatio); - - double endTime = (System.currentTimeMillis() - startTime) / 1000.0; - logger.debug( "{}ms", endTime); + /** + * If the return value has been already computed, it is returned, else it is computed at retrieval time. + * + * @param pWiki The wikipedia object. + * @param catGraph The category graph. + * @return The number of categorized articles, i.e. articles that have at least one category. 
+ */ + public int getNumberOfCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException { + if (categorizedArticleSet == null) { // has not been initialized yet + iterateCategoriesGetArticles(pWiki, catGraph); } - - public double getAveragePathLengthFromRoot(Wikipedia pWiki, CategoryGraph connectedCatGraph) throws WikiApiException { - // get root node - Category rootCategory = pWiki.getMetaData().getMainCategory(); - int root = rootCategory.getPageId(); - - int pathLengthSum = computeShortestPathLenghts(root, connectedCatGraph); - - return (double) pathLengthSum / (connectedCatGraph.getGraph().vertexSet().size()-1); + return categorizedArticleSet.size(); + } + + /** + * Computes the distribution of the number of articles per category. + * If the return value has been already computed, it is returned, else it is computed at retrieval time. + * + * @param pWiki The wikipedia object. + * @param catGraph The category graph. + * @return A map containing the distribution mapping from a degree to the number of times this degree is found in the category graph. + * @throws WikiPageNotFoundException + */ + public Map<Integer, Integer> getDistributionOfArticlesByCategory(Wikipedia pWiki, CategoryGraph catGraph) throws WikiPageNotFoundException { + if (degreeDistribution == null) { // has not been initialized yet + iterateCategoriesGetArticles(pWiki, catGraph); } - - - /** - * If the return value has been already computed, it is returned, else it is computed at retrieval time. - * @param pWiki The wikipedia object. - * @param catGraph The category graph. - * @return The number of categorized articles, i.e. articles that have at least one category. 
- */ - public int getNumberOfCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException{ - if (categorizedArticleSet == null) { // has not been initialized yet - iterateCategoriesGetArticles(pWiki, catGraph); + return degreeDistribution; + } + + /** + * Methods computing stuff that have to iterate over all categories and access category articles can plug-in here. + * Recently plugin-in: + * numberOfCategorizedArticles + * distributionOfArticlesByCategory + * + * @param pWiki The wikipedia object. + * @param catGraph The category graph. + * @throws WikiPageNotFoundException + */ + private void iterateCategoriesGetArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiPageNotFoundException { + Map<Integer, Integer> localDegreeDistribution = new HashMap<>(); + Set<Integer> localCategorizedArticleSet = new HashSet<>(); + Set<Integer> categoryNodes = catGraph.getGraph().vertexSet(); + // iterate over all categories + int progress = 0; + for (int node : categoryNodes) { + progress++; + ApiUtilities.printProgressInfo(progress, categoryNodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "iterate over categories"); + + // get the category + Category cat = pWiki.getCategory(node); + if (cat != null) { + Set<Integer> pages = new HashSet<>(cat.__getPages()); + + // update degree distribution map + int numberOfArticles = pages.size(); + if (localDegreeDistribution.containsKey(numberOfArticles)) { + int count = localDegreeDistribution.get(numberOfArticles); + count++; + localDegreeDistribution.put(numberOfArticles, count); + } else { + localDegreeDistribution.put(numberOfArticles, 1); } - return categorizedArticleSet.size(); - } - /** - * Computes the distribution of the number of articles per category. - * If the return value has been already computed, it is returned, else it is computed at retrieval time. - * @param pWiki The wikipedia object. - * @param catGraph The category graph. 
- * @return A map containing the distribution mapping from a degree to the number of times this degree is found in the category graph. - * @throws WikiPageNotFoundException - */ - public Map<Integer,Integer> getDistributionOfArticlesByCategory(Wikipedia pWiki, CategoryGraph catGraph) throws WikiPageNotFoundException { - if (degreeDistribution == null) { // has not been initialized yet - iterateCategoriesGetArticles(pWiki, catGraph); + // add the page to the categorized articles set, if it is to already in it + for (int page : pages) { + if (!localCategorizedArticleSet.contains(page)) { + localCategorizedArticleSet.add(page); + } } - return degreeDistribution; + } else { + logger.info("{} is not a category.", node); + } } - - /** - * Methods computing stuff that have to iterate over all categories and access category articles can plug-in here. - * Recently plugin-in: - * numberOfCategorizedArticles - * distributionOfArticlesByCategory - * @param pWiki The wikipedia object. - * @param catGraph The category graph. 
- * @throws WikiPageNotFoundException - */ - private void iterateCategoriesGetArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiPageNotFoundException { - Map<Integer,Integer> localDegreeDistribution = new HashMap<>(); - Set<Integer> localCategorizedArticleSet = new HashSet<>(); - Set<Integer> categoryNodes = catGraph.getGraph().vertexSet(); - // iterate over all categories - int progress = 0; - for (int node : categoryNodes) { - progress++; - ApiUtilities.printProgressInfo(progress, categoryNodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "iterate over categories"); - - // get the category - Category cat = pWiki.getCategory(node); - if (cat != null) { - Set<Integer> pages = new HashSet<>(cat.__getPages()); - - // update degree distribution map - int numberOfArticles = pages.size(); - if (localDegreeDistribution.containsKey(numberOfArticles)) { - int count = localDegreeDistribution.get(numberOfArticles); - count++; - localDegreeDistribution.put(numberOfArticles, count); - } - else { - localDegreeDistribution.put(numberOfArticles, 1); - } - - // add the page to the categorized articles set, if it is to already in it - for (int page : pages) { - if (!localCategorizedArticleSet.contains(page)) { - localCategorizedArticleSet.add(page); - } - } - } - else { - logger.info("{} is not a category.", node); - } - } - this.degreeDistribution = localDegreeDistribution; - this.categorizedArticleSet = localCategorizedArticleSet; - } - - /** - * Computes the shortest path from node to all other nodes. - * As the JGraphT BreadthFirstIterator does not provide information about - * the distance to the start node in each step, we will use our own BFS implementation. - * @param pStartNode The start node of the search. - * @param catGraph The category graph. - * @return An array of double values. 
- */ - private int computeShortestPathLenghts(int pStartNode, CategoryGraph catGraph) { - int shortestPathLengthSum = 0; - - // a set of nodes that have already been expanded -> algorithm should expand nodes monotonically and not go back - Set<Integer> alreadyExpanded = new HashSet<>(); - - // a queue holding the newly discovered nodes with their and their distance to the start node - List<int[]> queue = new ArrayList<>(); - - // initialize queue with start node - int[] innerList = new int[2]; - innerList[0] = pStartNode; // the node - innerList[1] = 0; // the distance to the start node - queue.add(innerList); - - // while the queue is not empty - while (!queue.isEmpty()) { - // remove first element from queue - int[] queueElement = queue.get(0); - int currentNode = queueElement[0]; - int distance = queueElement[1]; - queue.remove(0); - - // if the node was not already expanded - if (!alreadyExpanded.contains(currentNode)) { - // the node gets expanded now - alreadyExpanded.add(currentNode); - - // add the distance of this node to shortestPathLengthSum - shortestPathLengthSum += distance; - - // get the neighbors of the queue element - Set<Integer> neighbors = catGraph.getNeighbors(currentNode); - - // iterate over all neighbors - for (int neighbor : neighbors) { - // if the node was not already expanded - if (!alreadyExpanded.contains(neighbor)) { - // add the node to the queue, increase node distance by one - int[] tmpList = new int[2]; - tmpList[0] = neighbor; - tmpList[1] = (distance + 1); - queue.add(tmpList); - } - } - } + this.degreeDistribution = localDegreeDistribution; + this.categorizedArticleSet = localCategorizedArticleSet; + } + + /** + * Computes the shortest path from node to all other nodes. + * As the JGraphT BreadthFirstIterator does not provide information about + * the distance to the start node in each step, we will use our own BFS implementation. + * + * @param pStartNode The start node of the search. + * @param catGraph The category graph. 
+ * @return An array of double values. + */ + private int computeShortestPathLenghts(int pStartNode, CategoryGraph catGraph) { + int shortestPathLengthSum = 0; + + // a set of nodes that have already been expanded -> algorithm should expand nodes monotonically and not go back + Set<Integer> alreadyExpanded = new HashSet<>(); + + // a queue holding the newly discovered nodes with their and their distance to the start node + List<int[]> queue = new ArrayList<>(); + + // initialize queue with start node + int[] innerList = new int[2]; + innerList[0] = pStartNode; // the node + innerList[1] = 0; // the distance to the start node + queue.add(innerList); + + // while the queue is not empty + while (!queue.isEmpty()) { + // remove first element from queue + int[] queueElement = queue.get(0); + int currentNode = queueElement[0]; + int distance = queueElement[1]; + queue.remove(0); + + // if the node was not already expanded + if (!alreadyExpanded.contains(currentNode)) { + // the node gets expanded now + alreadyExpanded.add(currentNode); + + // add the distance of this node to shortestPathLengthSum + shortestPathLengthSum += distance; + + // get the neighbors of the queue element + Set<Integer> neighbors = catGraph.getNeighbors(currentNode); + + // iterate over all neighbors + for (int neighbor : neighbors) { + // if the node was not already expanded + if (!alreadyExpanded.contains(neighbor)) { + // add the node to the queue, increase node distance by one + int[] tmpList = new int[2]; + tmpList[0] = neighbor; + tmpList[1] = (distance + 1); + queue.add(tmpList); + } } - return shortestPathLengthSum; + } } + return shortestPathLengthSum; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiApiException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiApiException.java index 53711828..0efd9d6a 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiApiException.java +++ 
b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiApiException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,22 +20,22 @@ public class WikiApiException extends WikiException { - static final long serialVersionUID = 1L; + private static final long serialVersionUID = 4780158247277092677L; - public WikiApiException() { - super(); - } + public WikiApiException() { + super(); + } - public WikiApiException(String txt) { - super(txt); - } + public WikiApiException(String txt) { + super(txt); + } - public WikiApiException(String message, Throwable cause) { - super(message, cause); - } + public WikiApiException(String message, Throwable cause) { + super(message, cause); + } - public WikiApiException(Throwable cause) { - super(cause); - } + public WikiApiException(Throwable cause) { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiException.java index bab1cfea..4609de3b 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiException.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one 
* or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,21 +19,21 @@ public class WikiException extends Exception { - static final long serialVersionUID = 1L; + private static final long serialVersionUID = 3891003920835683241L; - public WikiException() { - super(); - } + public WikiException() { + super(); + } - public WikiException(String txt) { - super(txt); - } + public WikiException(String txt) { + super(txt); + } - public WikiException(String message, Throwable cause) { - super(message, cause); - } + public WikiException(String message, Throwable cause) { + super(message, cause); + } - public WikiException(Throwable cause) { - super(cause); - } + public WikiException(Throwable cause) { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiInitializationException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiInitializationException.java index b60d04b7..608061f2 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiInitializationException.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiInitializationException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,25 +20,25 @@ /** * Thrown, when the Wikipedia object could not be properly initialized. - * */ public class WikiInitializationException extends WikiApiException { - static final long serialVersionUID = 1L; - public WikiInitializationException() { - super(); - } + private static final long serialVersionUID = 7240072132466204183L; + + public WikiInitializationException() { + super(); + } - public WikiInitializationException(String txt) { - super(txt); - } + public WikiInitializationException(String txt) { + super(txt); + } - public WikiInitializationException(String message, Throwable cause) { - super(message, cause); - } + public WikiInitializationException(String message, Throwable cause) { + super(message, cause); + } - public WikiInitializationException(Throwable cause) { - super(cause); - } + public WikiInitializationException(Throwable cause) { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiPageNotFoundException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiPageNotFoundException.java index 74a033fb..026be6e0 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiPageNotFoundException.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiPageNotFoundException.java @@ -2,13 +2,13 @@ * Licensed to the Technische 
Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,25 +20,25 @@ /** * Thrown when a requested page or category could not be found in Wikipedia. - * */ public class WikiPageNotFoundException extends WikiApiException { - static final long serialVersionUID = 1L; - public WikiPageNotFoundException() { - super(); - } + private static final long serialVersionUID = -3676016515948761351L; + + public WikiPageNotFoundException() { + super(); + } - public WikiPageNotFoundException(String txt) { - super(txt); - } + public WikiPageNotFoundException(String txt) { + super(txt); + } - public WikiPageNotFoundException(String message, Throwable cause) { - super(message, cause); - } + public WikiPageNotFoundException(String message, Throwable cause) { + super(message, cause); + } - public WikiPageNotFoundException(Throwable cause) { - super(cause); - } + public WikiPageNotFoundException(Throwable cause) { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiRelatednessException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiRelatednessException.java deleted file mode 100644 index 3889f0fc..00000000 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiRelatednessException.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Technische 
Universität Darmstadt under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.jwpl.api.exception; - - -public class WikiRelatednessException extends WikiException { - - static final long serialVersionUID = 1L; - - public WikiRelatednessException(){ - super(); - } - - public WikiRelatednessException(String txt) { - super(txt); - } - - public WikiRelatednessException(String message, Throwable cause) { - super(message, cause); - } - - public WikiRelatednessException(Throwable cause) { - super(cause); - } -} diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiTitleParsingException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiTitleParsingException.java index c29462e1..3480601a 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiTitleParsingException.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiTitleParsingException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,25 +20,24 @@ /** * Thrown when an exceptional situation occurs during parsing a page title to create a Title object. - * */ public class WikiTitleParsingException extends WikiApiException { - static final long serialVersionUID = 1L; + private static final long serialVersionUID = 7152744066557304950L; - public WikiTitleParsingException() { - super(); - } + public WikiTitleParsingException() { + super(); + } - public WikiTitleParsingException(String txt) { - super(txt); - } + public WikiTitleParsingException(String txt) { + super(txt); + } - public WikiTitleParsingException(String message, Throwable cause) { - super(message, cause); - } + public WikiTitleParsingException(String message, Throwable cause) { + super(message, cause); + } - public WikiTitleParsingException(Throwable cause) { - super(cause); - } + public WikiTitleParsingException(Throwable cause) { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Category.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Category.java index 51102d63..4448f525 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Category.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Category.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. 
The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,63 +21,66 @@ import java.util.Set; public class Category { - private long id; - private int pageId; - private String name; - private Set<Integer> inLinks = new HashSet<>(); - private Set<Integer> outLinks = new HashSet<>(); - private Set<Integer> pages = new HashSet<>(); + private long id; + private int pageId; + private String name; + private Set<Integer> inLinks = new HashSet<>(); + private Set<Integer> outLinks = new HashSet<>(); + private Set<Integer> pages = new HashSet<>(); - /** A no argument constructor as required by Hibernate. */ - public Category () {} + /** + * A no argument constructor as required by Hibernate. 
+ */ + public Category() { + } public long getId() { - return id; - } + return id; + } - @SuppressWarnings("unused") - private void setId(long id) { - this.id = id; - } + @SuppressWarnings("unused") + private void setId(long id) { + this.id = id; + } - public int getPageId() { - return pageId; - } + public int getPageId() { + return pageId; + } - public void setPageId(int pageId) { - this.pageId = pageId; - } + public void setPageId(int pageId) { + this.pageId = pageId; + } - public Set<Integer> getInLinks() { - return inLinks; - } + public Set<Integer> getInLinks() { + return inLinks; + } - public void setInLinks(Set<Integer> inLinks) { - this.inLinks = inLinks; - } + public void setInLinks(Set<Integer> inLinks) { + this.inLinks = inLinks; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public Set<Integer> getOutLinks() { - return outLinks; - } + public Set<Integer> getOutLinks() { + return outLinks; + } - public void setOutLinks(Set<Integer> outLinks) { - this.outLinks = outLinks; - } + public void setOutLinks(Set<Integer> outLinks) { + this.outLinks = outLinks; + } - public Set<Integer> getPages() { - return pages; - } + public Set<Integer> getPages() { + return pages; + } - public void setPages(Set<Integer> pages) { - this.pages = pages; - } + public void setPages(Set<Integer> pages) { + this.pages = pages; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/CategoryDAO.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/CategoryDAO.java index 37210156..86a7d5f2 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/CategoryDAO.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/CategoryDAO.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,59 +17,59 @@ */ package org.dkpro.jwpl.api.hibernate; +import java.lang.invoke.MethodHandles; + import org.dkpro.jwpl.api.Wikipedia; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.lang.invoke.MethodHandles; - /** * Data access object for class {@link Category} - * - * @see org.dkpro.jwpl.api.Category + * * @author Hibernate Tools + * @see org.dkpro.jwpl.api.Category */ public class CategoryDAO extends GenericDAO<Category> { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - public CategoryDAO(Wikipedia pWiki) { - super(pWiki, Category.class); - } + public CategoryDAO(Wikipedia pWiki) { + super(pWiki, Category.class); + } - @Override - public void persist(Category transientInstance) { - logger.debug("persisting Category instance"); - super.persist(transientInstance); - } + @Override + public void persist(Category transientInstance) { + logger.debug("persisting Category instance"); + super.persist(transientInstance); + } - @Override - public void attachDirty(Category instance) { - logger.debug("attaching dirty Category instance"); - super.attachDirty(instance); - } + @Override + public void attachDirty(Category instance) { + logger.debug("attaching 
dirty Category instance"); + super.attachDirty(instance); + } - @Override - public void attachClean(Category instance) { - logger.debug("attaching clean Category instance"); - super.attachClean(instance); - } + @Override + public void attachClean(Category instance) { + logger.debug("attaching clean Category instance"); + super.attachClean(instance); + } - @Override - public void delete(Category persistentInstance) { - logger.debug("deleting Category instance"); - super.delete(persistentInstance); - } + @Override + public void delete(Category persistentInstance) { + logger.debug("deleting Category instance"); + super.delete(persistentInstance); + } - @Override - public Category merge(Category detachedInstance) { - logger.debug("merging Category instance"); - return super.merge(detachedInstance); - } + @Override + public Category merge(Category detachedInstance) { + logger.debug("merging Category instance"); + return super.merge(detachedInstance); + } - @Override - public Category findById(java.lang.Long id) { - logger.debug("getting Category instance with id: " + id); - return super.findById(id); - } + @Override + public Category findById(java.lang.Long id) { + logger.debug("getting Category instance with id: " + id); + return super.findById(id); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/GenericDAO.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/GenericDAO.java index dd072c1d..3273bf60 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/GenericDAO.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/GenericDAO.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,6 +17,8 @@ */ package org.dkpro.jwpl.api.hibernate; +import java.lang.invoke.MethodHandles; + import org.dkpro.jwpl.api.Wikipedia; import org.hibernate.LockOptions; import org.hibernate.Session; @@ -24,108 +26,106 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.lang.invoke.MethodHandles; - /** * A common base class for DAO classes. - * + * * @param <T> The entity type to provide persistence features for. */ public abstract class GenericDAO<T> { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final Wikipedia wiki; - private final SessionFactory sessionFactory; + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private final String entityClass; + private final Wikipedia wiki; + private final SessionFactory sessionFactory; - GenericDAO(Wikipedia wiki, Class<?> entityClass) { - this.wiki = wiki; - this.entityClass = entityClass.getName(); - this.sessionFactory = initializeSessionFactory(); - } + private final String entityClass; - private SessionFactory initializeSessionFactory() { - try { - return WikiHibernateUtil.getSessionFactory(wiki.getDatabaseConfiguration()); - } catch (Exception e) { - throw new IllegalStateException("Could not locate SessionFactory in JNDI", e); - } - } + GenericDAO(Wikipedia wiki, Class<?> entityClass) { + this.wiki = wiki; + this.entityClass = entityClass.getName(); + this.sessionFactory = initializeSessionFactory(); + } - private 
SessionFactory getSessionFactory() { - return sessionFactory; + private SessionFactory initializeSessionFactory() { + try { + return WikiHibernateUtil.getSessionFactory(wiki.getDatabaseConfiguration()); + } catch (Exception e) { + throw new IllegalStateException("Could not locate SessionFactory in JNDI", e); } + } - protected Session getSession() { - return getSessionFactory().getCurrentSession(); - } + private SessionFactory getSessionFactory() { + return sessionFactory; + } - public void persist(T transientInstance) { - logger.debug("persisting MetaData instance"); - try { - getSession().persist(transientInstance); - logger.trace("persist successful"); - } catch (RuntimeException re) { - logger.error("Failed persisting " + entityClass + " instance", re); - throw re; - } + protected Session getSession() { + return getSessionFactory().getCurrentSession(); + } + + public void persist(T transientInstance) { + logger.debug("persisting MetaData instance"); + try { + getSession().persist(transientInstance); + logger.trace("persist successful"); + } catch (RuntimeException re) { + logger.error("Failed persisting " + entityClass + " instance", re); + throw re; } + } - public void delete(T persistentInstance) { - try { - getSession().remove(persistentInstance); - logger.trace("delete successful"); - } catch (RuntimeException re) { - logger.error("Failed deleting {} instance", entityClass, re); - throw re; - } + public void delete(T persistentInstance) { + try { + getSession().remove(persistentInstance); + logger.trace("delete successful"); + } catch (RuntimeException re) { + logger.error("Failed deleting {} instance", entityClass, re); + throw re; } + } - public T merge(T detachedInstance) { - try { - T result = (T) getSession().merge(detachedInstance); - logger.trace("merge successful"); - return result; - } catch (RuntimeException re) { - logger.error("Failed merging " + entityClass + " instance", re); - throw re; - } + public T merge(T detachedInstance) { + try { + T 
result = (T) getSession().merge(detachedInstance); + logger.trace("merge successful"); + return result; + } catch (RuntimeException re) { + logger.error("Failed merging " + entityClass + " instance", re); + throw re; } + } - public void attachClean(T instance) { - try { - getSession().buildLockRequest(LockOptions.NONE).lock(instance); - logger.trace("attach successful"); - } catch (RuntimeException re) { - logger.error("Failed attaching " + entityClass + " instance", re); - throw re; - } + public void attachClean(T instance) { + try { + getSession().buildLockRequest(LockOptions.NONE).lock(instance); + logger.trace("attach successful"); + } catch (RuntimeException re) { + logger.error("Failed attaching " + entityClass + " instance", re); + throw re; } + } - public void attachDirty(T instance) { - try { - getSession().saveOrUpdate(instance); - logger.trace("attach successful"); - } catch (RuntimeException re) { - logger.error("attach failed", re); - throw re; - } + public void attachDirty(T instance) { + try { + getSession().merge(instance); + logger.trace("attach successful"); + } catch (RuntimeException re) { + logger.error("attach failed", re); + throw re; } - - public T findById(Long id) { - try { - T instance = (T) getSession().get(entityClass, id); - if (instance == null) { - logger.trace("get successful, no " + entityClass + " instance found"); - } else { - logger.trace("get successful, instance found"); - } - return instance; - } catch (RuntimeException re) { - logger.error("Failed finding " + entityClass + " instance by id", re); - throw re; - } + } + + public T findById(Long id) { + try { + T instance = (T) getSession().get(entityClass, id); + if (instance == null) { + logger.trace("get successful, no " + entityClass + " instance found"); + } else { + logger.trace("get successful, instance found"); + } + return instance; + } catch (RuntimeException re) { + logger.error("Failed finding " + entityClass + " instance by id", re); + throw re; } + } } diff --git 
a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaData.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaData.java index 8629ce0f..87663d02 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaData.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaData.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,90 +19,93 @@ public class MetaData { - private long id; + private long id; - private String language; - private String disambiguationCategory; - private String mainCategory; - private String version; + private String language; + private String disambiguationCategory; + private String mainCategory; + private String version; - private long nrofPages; - private long nrofRedirects; - private long nrofDisambiguationPages; - private long nrofCategories; + private long nrofPages; + private long nrofRedirects; + private long nrofDisambiguationPages; + private long nrofCategories; - /** A no argument constructor as required by Hibernate. */ - public MetaData() {} + /** + * A no argument constructor as required by Hibernate. 
+ */ + public MetaData() { + } - public String getDisambiguationCategory() { - return disambiguationCategory; - } + public String getDisambiguationCategory() { + return disambiguationCategory; + } - public void setDisambiguationCategory(String disambiguationCategory) { - this.disambiguationCategory = disambiguationCategory; - } + public void setDisambiguationCategory(String disambiguationCategory) { + this.disambiguationCategory = disambiguationCategory; + } - public String getMainCategory() { - return mainCategory; - } + public String getMainCategory() { + return mainCategory; + } - public void setMainCategory(String mainCategory) { - this.mainCategory = mainCategory; - } + public void setMainCategory(String mainCategory) { + this.mainCategory = mainCategory; + } - public long getNrofCategories() { - return nrofCategories; - } + public long getNrofCategories() { + return nrofCategories; + } - public long getNrofDisambiguationPages() { - return nrofDisambiguationPages; - } + public long getNrofDisambiguationPages() { + return nrofDisambiguationPages; + } - public long getNrofPages() { - return nrofPages; - } + public long getNrofPages() { + return nrofPages; + } - public long getNrofRedirects() { - return nrofRedirects; - } + public long getNrofRedirects() { + return nrofRedirects; + } - public long getId() { - return id; - } + public long getId() { + return id; + } - public void setId(long id) { - this.id = id; - } + public void setId(long id) { + this.id = id; + } - public String getLanguage() { - return language; - } + public String getLanguage() { + return language; + } - public void setLanguage(String language) { - this.language = language; - } + public void setLanguage(String language) { + this.language = language; + } - public void setNrofCategories(long nrofCategories) { - this.nrofCategories = nrofCategories; - } + public void setNrofCategories(long nrofCategories) { + this.nrofCategories = nrofCategories; + } - public void setNrofDisambiguationPages(long 
nrofDisambiguationPages) { - this.nrofDisambiguationPages = nrofDisambiguationPages; - } + public void setNrofDisambiguationPages(long nrofDisambiguationPages) { + this.nrofDisambiguationPages = nrofDisambiguationPages; + } - public void setNrofPages(long nrofPages) { - this.nrofPages = nrofPages; - } + public void setNrofPages(long nrofPages) { + this.nrofPages = nrofPages; + } - public void setNrofRedirects(long nrofRedirects) { - this.nrofRedirects = nrofRedirects; - } + public void setNrofRedirects(long nrofRedirects) { + this.nrofRedirects = nrofRedirects; + } - public String getVersion() { - return version; - } + public String getVersion() { + return version; + } - public void setVersion(String version) { - this.version = version; - } + public void setVersion(String version) { + this.version = version; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaDataDAO.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaDataDAO.java index c6a95957..a9003dda 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaDataDAO.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaDataDAO.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -17,60 +17,60 @@ */ package org.dkpro.jwpl.api.hibernate; +import java.lang.invoke.MethodHandles; + import org.dkpro.jwpl.api.WikiConstants; import org.dkpro.jwpl.api.Wikipedia; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.lang.invoke.MethodHandles; - /** * Data access object for class {@link MetaData}. - * - * @see org.dkpro.jwpl.api.MetaData + * * @author Hibernate Tools + * @see org.dkpro.jwpl.api.MetaData */ public class MetaDataDAO extends GenericDAO<MetaData> implements WikiConstants { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - public MetaDataDAO(Wikipedia wiki) { - super(wiki, MetaData.class); - } + public MetaDataDAO(Wikipedia wiki) { + super(wiki, MetaData.class); + } - @Override - public void persist(MetaData transientInstance) { - logger.debug("persisting MetaData instance"); - super.persist(transientInstance); - } + @Override + public void persist(MetaData transientInstance) { + logger.debug("persisting MetaData instance"); + super.persist(transientInstance); + } - @Override - public void attachDirty(MetaData instance) { - logger.debug("attaching dirty MetaData instance"); - super.attachDirty(instance); - } + @Override + public void attachDirty(MetaData instance) { + logger.debug("attaching dirty MetaData instance"); + super.attachDirty(instance); + } - @Override - public void attachClean(MetaData instance) { - logger.debug("attaching clean MetaData instance"); - super.attachClean(instance); - } + @Override + public void attachClean(MetaData instance) { + logger.debug("attaching clean MetaData instance"); + super.attachClean(instance); + } - @Override - public void delete(MetaData persistentInstance) { - logger.debug("deleting MetaData instance"); - super.delete(persistentInstance); - } + @Override + public void delete(MetaData persistentInstance) { + 
logger.debug("deleting MetaData instance"); + super.delete(persistentInstance); + } - @Override - public MetaData merge(MetaData detachedInstance) { - logger.debug("merging MetaData instance"); - return super.merge(detachedInstance); - } + @Override + public MetaData merge(MetaData detachedInstance) { + logger.debug("merging MetaData instance"); + return super.merge(detachedInstance); + } - @Override - public MetaData findById(java.lang.Long id) { - logger.debug("getting MetaData instance with id: " + id); - return super.findById(id); - } + @Override + public MetaData findById(java.lang.Long id) { + logger.debug("getting MetaData instance with id: " + id); + return super.findById(id); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Page.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Page.java index a11e0bd1..4a25ac2e 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Page.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Page.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,100 +23,102 @@ /** * The page class that is actually persisted by Hibernate. * It is accessed via a equally named class in the api package to hide session management from the user. 
- * */ public class Page { - private long id; - private int pageId; - private String name; - private String text; - private boolean isDisambiguation; - private Set<Integer> inLinks = new HashSet<>(); - private Set<Integer> outLinks = new HashSet<>(); - private Set<Integer> categories = new HashSet<>(); - private Set<String> redirects = new HashSet<>(); - - /** A no argument constructor as required by Hibernate. */ - public Page () {} + + private long id; + private int pageId; + private String name; + private String text; + private boolean isDisambiguation; + private Set<Integer> inLinks = new HashSet<>(); + private Set<Integer> outLinks = new HashSet<>(); + private Set<Integer> categories = new HashSet<>(); + private Set<String> redirects = new HashSet<>(); + + /** + * A no argument constructor as required by Hibernate. + */ + public Page() { + } public long getId() { - return id; - } + return id; + } - @SuppressWarnings("unused") - private void setId(long id) { - this.id = id; - } + @SuppressWarnings("unused") + private void setId(long id) { + this.id = id; + } - public int getPageId() { - return pageId; - } + public int getPageId() { + return pageId; + } - public void setPageId(int pageId) { - this.pageId = pageId; - } + public void setPageId(int pageId) { + this.pageId = pageId; + } - public Set<Integer> getCategories() { - return categories; - } + public Set<Integer> getCategories() { + return categories; + } - public void setCategories(Set<Integer> categories) { - this.categories = categories; - } + public void setCategories(Set<Integer> categories) { + this.categories = categories; + } - public Set<Integer> getInLinks() { - return inLinks; - } + public Set<Integer> getInLinks() { + return inLinks; + } - public void setInLinks(Set<Integer> inLinks) { - this.inLinks = inLinks; - } + public void setInLinks(Set<Integer> inLinks) { + this.inLinks = inLinks; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public 
void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public Set<Integer> getOutLinks() { - return outLinks; - } + public Set<Integer> getOutLinks() { + return outLinks; + } - public int getOutDegree() { - return outLinks.size(); - } + public int getOutDegree() { + return outLinks.size(); + } - public void setOutLinks(Set<Integer> outLinks) { - this.outLinks = outLinks; - } + public void setOutLinks(Set<Integer> outLinks) { + this.outLinks = outLinks; + } - public Set<String> getRedirects() { - return redirects; - } + public Set<String> getRedirects() { + return redirects; + } - public void setRedirects(Set<String> redirects) { - this.redirects = redirects; - } + public void setRedirects(Set<String> redirects) { + this.redirects = redirects; + } - public String getText() { - return text; - } + public String getText() { + return text; + } - public void setText(String text) { - this.text = text; - } + public void setText(String text) { + this.text = text; + } - public boolean getIsDisambiguation() { - return isDisambiguation; - } + public boolean getIsDisambiguation() { + return isDisambiguation; + } - public void setIsDisambiguation(Boolean isDisambiguation) - { - if (isDisambiguation == null) { - isDisambiguation = false; - } - this.isDisambiguation = isDisambiguation; - } + public void setIsDisambiguation(Boolean isDisambiguation) { + if (isDisambiguation == null) { + isDisambiguation = false; + } + this.isDisambiguation = isDisambiguation; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java index 140d4f76..1e3e86cf 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,51 +25,51 @@ /** * Data access object for class {@link Page}. - * - * @see org.dkpro.jwpl.api.Page + * * @author Hibernate Tools + * @see org.dkpro.jwpl.api.Page */ public class PageDAO extends GenericDAO<Page> { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - public PageDAO(Wikipedia pWiki) { - super(pWiki, Page.class); - } + public PageDAO(Wikipedia pWiki) { + super(pWiki, Page.class); + } - @Override - public void persist(Page transientInstance) { - logger.debug("persisting Page instance"); - super.persist(transientInstance); - } + @Override + public void persist(Page transientInstance) { + logger.debug("persisting Page instance"); + super.persist(transientInstance); + } - @Override - public void attachDirty(Page instance) { - logger.debug("attaching dirty Page instance"); - super.attachDirty(instance); - } + @Override + public void attachDirty(Page instance) { + logger.debug("attaching dirty Page instance"); + super.attachDirty(instance); + } - @Override - public void attachClean(Page instance) { - logger.debug("attaching clean Page instance"); - super.attachClean(instance); - } + @Override + public void attachClean(Page instance) { + logger.debug("attaching clean 
Page instance"); + super.attachClean(instance); + } - @Override - public void delete(Page persistentInstance) { - logger.debug("deleting Page instance"); - super.delete(persistentInstance); - } + @Override + public void delete(Page persistentInstance) { + logger.debug("deleting Page instance"); + super.delete(persistentInstance); + } - @Override - public Page merge(Page detachedInstance) { - logger.debug("merging Page instance"); - return super.merge(detachedInstance); - } + @Override + public Page merge(Page detachedInstance) { + logger.debug("merging Page instance"); + return super.merge(detachedInstance); + } - @Override - public Page findById(java.lang.Long id) { - logger.debug("getting Page instance with id: " + id); - return super.findById(id); - } + @Override + public Page findById(java.lang.Long id) { + logger.debug("getting Page instance with id: " + id); + return super.findById(id); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageMapLine.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageMapLine.java index 9ea1a705..db933e98 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageMapLine.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageMapLine.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -18,52 +18,55 @@ package org.dkpro.jwpl.api.hibernate; public class PageMapLine { - private long id; - private String name; - private int pageID; - private String stem; - private String lemma; + private long id; + private String name; + private int pageID; + private String stem; + private String lemma; - /** A no argument constructor as required by Hibernate. */ - public PageMapLine () {} + /** + * A no argument constructor as required by Hibernate. + */ + public PageMapLine() { + } - public long getId() { - return id; - } + public long getId() { + return id; + } - public void setId(long id) { - this.id = id; - } + public void setId(long id) { + this.id = id; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public int getPageID() { - return pageID; - } + public int getPageID() { + return pageID; + } - public void setPageID(int pageID) { - this.pageID = pageID; - } + public void setPageID(int pageID) { + this.pageID = pageID; + } - public String getLemma() { - return lemma; - } + public String getLemma() { + return lemma; + } - public void setLemma(String lemma) { - this.lemma = lemma; - } + public void setLemma(String lemma) { + this.lemma = lemma; + } - public String getStem() { - return stem; - } + public String getStem() { + return stem; + } - public void setStem(String stem) { - this.stem = stem; - } + public void setStem(String stem) { + this.stem = stem; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/WikiHibernateUtil.java 
b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/WikiHibernateUtil.java index 1e1e574e..4e310e71 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/WikiHibernateUtil.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/WikiHibernateUtil.java @@ -59,7 +59,7 @@ private static Properties getProperties(DatabaseConfiguration config) { String password = config.getPassword(); /* - * Ensures explicit DMBS type specific configuration for hsqldb from junit tests context + * Ensures explicit DBMS type specific configuration for hsqldb from junit tests context */ String jdbcURL = config.getJdbcURL(); String databaseDriverClass = config.getDatabaseDriver(); diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java index 801b99b6..d77d2c49 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -82,7 +82,7 @@ * <li><a href="http://en.wikipedia.org/wiki/Visitor_pattern">Visitor Pattern (classic pattern)</a></li> * <li><a href="http://www.javaworld.com/javaworld/javatips/jw-javatip98.html">the version we use here</a></li> * </ul> - * + * <p> * The methods needed to descend into an AST and visit the children of a given * node <code>n</code> are * <ul> @@ -98,101 +98,96 @@ * * @author Open Source Research Group, University of Erlangen-Nürnberg */ -public class PlainTextConverter extends AstVisitor<WtNode> -{ - - private static final Pattern ws = Pattern.compile("\\s+"); - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final WikiConfig config; - - private final int wrapCol; - - private StringBuilder sb; - - private StringBuilder line; - - private boolean pastBod; - - private int needNewlines; - - private boolean needSpace; - private boolean noWrap; - private final boolean enumerateSections; - - private LinkedList<Integer> sections; - - /* Things needed for processing tables */ - private List<List<String>> rows; - private List<String> currentRow; - private StringBuilder currentCell; - private String currentLinkTitleInCell; - - // ========================================================================= - - - /** - * Creates a new visitor that produces a plain text String representation - * of a parsed Wikipedia article. -s */ - public PlainTextConverter() - { - this(DefaultConfigEnWp.generate(), false, Integer.MAX_VALUE); //no fixed textwidth - } - - /** - * Creates a new visitor that produces a plain text String representation - * of a parsed Wikipedia article. - * - * @param enumerateSection {@code True}, if sections should be enumerated in the output, {@code false} otherwise. 
- */ - public PlainTextConverter(boolean enumerateSection) - { - this(DefaultConfigEnWp.generate(), enumerateSection, Integer.MAX_VALUE); //no fixed textwidth - } - - /** - * Creates a new visitor that produces a plain text String representation - * of a parsed Wikipedia article. - * - * @param config A valid {@link WikiConfig} instance. Must not be {@code null}. - * @param enumerateSections {@code True}, if sections should be enumerated in the output, {@code false} otherwise. - * @param wrapCol Defines the max length of a line. longer lines will be broken. - */ - public PlainTextConverter(WikiConfig config, boolean enumerateSections, int wrapCol) - { - this.config = config; - this.wrapCol = wrapCol; - this.enumerateSections = enumerateSections; - } - - @Override - protected WtNode before(WtNode node) - { - // This method is called by go() before visitation starts - sb = new StringBuilder(); - line = new StringBuilder(); - pastBod = false; - needNewlines = 0; - needSpace = false; - noWrap = false; - sections = new LinkedList<>(); - rows = new ArrayList<>(); - return super.before(node); - } - - @Override - protected Object after(WtNode node, Object result) - { - finishLine(); - - // This method is called by go() after visitation has finished - // The return value will be passed to go() which passes it to the caller - return sb.toString(); - } - - // ========================================================================= +public class PlainTextConverter extends AstVisitor<WtNode> { + + private static final Pattern ws = Pattern.compile("\\s+"); + + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final WikiConfig config; + + private final int wrapCol; + + private StringBuilder sb; + + private StringBuilder line; + + private boolean pastBod; + + private int needNewlines; + + private boolean needSpace; + private boolean noWrap; + private final boolean enumerateSections; + + private LinkedList<Integer> 
sections; + + /* Things needed for processing tables */ + private List<List<String>> rows; + private List<String> currentRow; + private StringBuilder currentCell; + private String currentLinkTitleInCell; + + // ========================================================================= + + + /** + * Creates a new visitor that produces a plain text String representation + * of a parsed Wikipedia article. + * s + */ + public PlainTextConverter() { + this(DefaultConfigEnWp.generate(), false, Integer.MAX_VALUE); //no fixed textwidth + } + + /** + * Creates a new visitor that produces a plain text String representation + * of a parsed Wikipedia article. + * + * @param enumerateSection {@code True}, if sections should be enumerated in the output, {@code false} otherwise. + */ + public PlainTextConverter(boolean enumerateSection) { + this(DefaultConfigEnWp.generate(), enumerateSection, Integer.MAX_VALUE); //no fixed textwidth + } + + /** + * Creates a new visitor that produces a plain text String representation + * of a parsed Wikipedia article. + * + * @param config A valid {@link WikiConfig} instance. Must not be {@code null}. + * @param enumerateSections {@code True}, if sections should be enumerated in the output, {@code false} otherwise. + * @param wrapCol Defines the max length of a line. longer lines will be broken. 
+ */ + public PlainTextConverter(WikiConfig config, boolean enumerateSections, int wrapCol) { + this.config = config; + this.wrapCol = wrapCol; + this.enumerateSections = enumerateSections; + } + + @Override + protected WtNode before(WtNode node) { + // This method is called by go() before visitation starts + sb = new StringBuilder(); + line = new StringBuilder(); + pastBod = false; + needNewlines = 0; + needSpace = false; + noWrap = false; + sections = new LinkedList<>(); + rows = new ArrayList<>(); + return super.before(node); + } + + @Override + protected Object after(WtNode node, Object result) { + finishLine(); + + // This method is called by go() after visitation has finished + // The return value will be passed to go() which passes it to the caller + return sb.toString(); + } + + // ========================================================================= /* * We CAN NOT allow this method being implemented here, as it will clash with @@ -208,448 +203,393 @@ public void visit(WtNode n) } */ - public void visit(WtNodeList n) - { - iterate(n); - } - - public void visit(WtPage p) - { - iterate(p); - } - - public void visit(AstText text) - { - if(currentCell != null) { - // handles table cell content - currentCell.append(text.getContent()); - } else { - // regular case for all nodes that are not explicitly handled below - write(text.getContent()); - } - - } - - public void visit(WtWhitespace w) - { - write(" "); - } - - public void visit(WtBold b) - { - //write("**"); - iterate(b); - //write("**"); - } - - public void visit(WtItalics i) - { - //write("//"); - iterate(i); - //write("//"); - } - - public void visit(WtXmlCharRef cr) - { - write(Character.toChars(cr.getCodePoint())); - } - - public void visit(WtXmlEntityRef er) - { - - String ch = er.getResolved(); - if (ch == null) - { - write('&'); - write(er.getName()); - write(';'); - } - else - { - write(ch); - } - } - - public void visit(WtUrl url) - { - write(url.getProtocol()); - write(':'); - 
write(url.getPath()); - } - - public void visit(WtExternalLink link) - { - //TODO How should we represent external links in the plain text output? - write('['); - iterate(link.getTitle()); - write(']'); - } - - public void visit(WtInternalLink link) - { - currentLinkTitleInCell = null; - try - { - PageTitle page = PageTitle.make(config, link.getTarget().getAsString()); - if (page.getNamespace().equals(config.getNamespace("Category"))) { - return; - } - } - catch (LinkTargetException e) - { - logger.warn(e.getLocalizedMessage()); - } - - write(link.getPrefix()); - WtLinkTitle pageTitle = link.getTitle(); - - if (pageTitle == null || pageTitle.isEmpty()) - { - // remember this as it could be needed to process table rows correctly - currentLinkTitleInCell = link.getTarget().getAsString(); - if(currentLinkTitleInCell.contains("#")) { - // only take the first part of the string, no anchors on pages (divided by '#' symbols) - currentLinkTitleInCell = currentLinkTitleInCell.split(Pattern.quote("#"), 2)[0]; - } - // for regular cases: just write the original value here - if(currentCell==null) { - write(link.getTarget().getAsString()); - } - } - else - { - iterate(link.getTitle()); - } - write(link.getPostfix()); - } - - public void visit(WtSection s) - { - finishLine(); - StringBuilder saveSb = sb; - boolean saveNoWrap = noWrap; - - sb = new StringBuilder(); - noWrap = true; - - iterate(s.getHeading()); - finishLine(); - String title = sb.toString().trim(); - - sb = saveSb; - - if (s.getLevel() >= 1) - { - while (sections.size() > s.getLevel()) { - sections.removeLast(); - } - while (sections.size() < s.getLevel()) { - sections.add(1); - } - - if(enumerateSections){ - StringBuilder sb2 = new StringBuilder(); - for (int i = 0; i < sections.size(); ++i) - { - if (i < 1) { - continue; - } - - sb2.append(sections.get(i)); - sb2.append('.'); - } - - if (sb2.length() > 0) { - sb2.append(' '); - } - sb2.append(title); - title = sb2.toString(); - } - } - - newline(1); - 
write(title); - newline(1); + public void visit(WtNodeList n) { + iterate(n); + } + + public void visit(WtPage p) { + iterate(p); + } + + public void visit(AstText text) { + if (currentCell != null) { + // handles table cell content + currentCell.append(text.getContent()); + } else { + // regular case for all nodes that are not explicitly handled below + write(text.getContent()); + } + + } + + public void visit(WtWhitespace w) { + write(" "); + } + + public void visit(WtBold b) { + //write("**"); + iterate(b); + //write("**"); + } + + public void visit(WtItalics i) { + //write("//"); + iterate(i); + //write("//"); + } + + public void visit(WtXmlCharRef cr) { + write(Character.toChars(cr.getCodePoint())); + } + + public void visit(WtXmlEntityRef er) { + + String ch = er.getResolved(); + if (ch == null) { + write('&'); + write(er.getName()); + write(';'); + } else { + write(ch); + } + } + + public void visit(WtUrl url) { + write(url.getProtocol()); + write(':'); + write(url.getPath()); + } + + public void visit(WtExternalLink link) { + //TODO How should we represent external links in the plain text output? 
+ write('['); + iterate(link.getTitle()); + write(']'); + } + + public void visit(WtInternalLink link) { + currentLinkTitleInCell = null; + try { + PageTitle page = PageTitle.make(config, link.getTarget().getAsString()); + if (page.getNamespace().equals(config.getNamespace("Category"))) { + return; + } + } catch (LinkTargetException e) { + logger.warn(e.getLocalizedMessage()); + } + + write(link.getPrefix()); + WtLinkTitle pageTitle = link.getTitle(); + + if (pageTitle == null || pageTitle.isEmpty()) { + // remember this as it could be needed to process table rows correctly + currentLinkTitleInCell = link.getTarget().getAsString(); + if (currentLinkTitleInCell.contains("#")) { + // only take the first part of the string, no anchors on pages (divided by '#' symbols) + currentLinkTitleInCell = currentLinkTitleInCell.split(Pattern.quote("#"), 2)[0]; + } + // for regular cases: just write the original value here + if (currentCell == null) { + write(link.getTarget().getAsString()); + } + } else { + iterate(link.getTitle()); + } + write(link.getPostfix()); + } + + public void visit(WtSection s) { + finishLine(); + StringBuilder saveSb = sb; + boolean saveNoWrap = noWrap; + + sb = new StringBuilder(); + noWrap = true; + + iterate(s.getHeading()); + finishLine(); + String title = sb.toString().trim(); + + sb = saveSb; + + if (s.getLevel() >= 1) { + while (sections.size() > s.getLevel()) { + sections.removeLast(); + } + while (sections.size() < s.getLevel()) { + sections.add(1); + } + + if (enumerateSections) { + StringBuilder sb2 = new StringBuilder(); + for (int i = 0; i < sections.size(); ++i) { + if (i < 1) { + continue; + } + + sb2.append(sections.get(i)); + sb2.append('.'); + } + + if (sb2.length() > 0) { + sb2.append(' '); + } + sb2.append(title); + title = sb2.toString(); + } + } + + newline(1); + write(title); + newline(1); // write(StringUtils.strrep('-', title.length())); // newline(1); - noWrap = saveNoWrap; + noWrap = saveNoWrap; - iterate(s.getBody()); + 
iterate(s.getBody()); - while (sections.size() > s.getLevel()) { - sections.removeLast(); - } - sections.add(sections.removeLast() + 1); - } + while (sections.size() > s.getLevel()) { + sections.removeLast(); + } + sections.add(sections.removeLast() + 1); + } - public void visit(WtParagraph p) - { - iterate(p); - newline(1); - } + public void visit(WtParagraph p) { + iterate(p); + newline(1); + } - public void visit(WtHorizontalRule hr) - { - newline(1); + public void visit(WtHorizontalRule hr) { + newline(1); // write(StringUtils.strrep('-', wrapCol)); // newline(1); - } - - public void visit(WtXmlElement e) - { - if (e.getName().equalsIgnoreCase("br")) - { - newline(1); - } - else - { - iterate(e.getBody()); - } - } - - public void visit(WtXmlEndTag t) - { - iterate(t); - } - - public void visit(WtXmlAttribute n) - { - // ignore formatting information from xml attributes as the result is expected in plain text - } - - public void visit(WtListItem n) - { - iterate(n); - } - - /** - * Called when a {@link WtTable table structure} is about to be processed. - * @param n A node representing a table. - */ - public void visit(WtTable n) - { - iterate(n); - } - - /** - * Called when an inner {@link WtTableImplicitTableBody table body} is about to be processed. - * @param n A node representing a table body. - */ - public void visit(WtTableImplicitTableBody n) - { - iterate(n); - } - - /** - * Called when a {@link WtTableCaption table caption} is about to be processed. - * @param n A node representing a table caption. - */ - public void visit(WtTableCaption n) - { - iterate(n); - } - - /** - * Called when a {@link WtTableRow table row} is about to be processed. - * @param n A node representing a table row. 
- */ - public void visit(WtTableRow n) - { - if (currentRow == null) - { - currentRow = new ArrayList<>(); - iterate(n); - if(currentRow.size() > 0) - { - rows.add(currentRow); - } - if(currentRow.size() == n.getBody().size()) { - StringBuilder tableRowFormatted = new StringBuilder(); - for(int i = 0; i < currentRow.size(); i++) { - tableRowFormatted.append(currentRow.get(i)); - if(i+1 < currentRow.size()) { - // appending a separator char only in between cells here - tableRowFormatted.append('|'); - } - } - writeWord(tableRowFormatted.toString()); - } - currentRow = null; - } - } - - /** - * Called when a header {@link WtTableHeader cell} is about to be processed. - * @param n A node representing a table header cell. - */ - public void visit(WtTableHeader n) - { - processCellContent(n); - } - - /** - * Called when a regular {@link WtTableCell cell} is about to be processed. - * @param n A node representing a table header cell. - */ - public void visit(WtTableCell n) - { - processCellContent(n); - } - - private void processCellContent(WtInnerNode2 n) - { - if (currentRow != null) - { - currentCell = new StringBuilder(); - iterate(n); - String cellValue = currentCell.toString().trim(); - if(currentLinkTitleInCell != null) { - cellValue = currentLinkTitleInCell + " " + cellValue; - currentLinkTitleInCell = null; - } - currentRow.add(cellValue); - currentCell = null; - } - } - - - // ========================================================================= - // Stuff we want to hide - - public void visit(WtImageLink n) - { - } - - public void visit(WtIllegalCodePoint n) - { - } - - public void visit(WtXmlComment n) - { - } - - public void visit(WtTemplate n) - { - } - - public void visit(WtTemplateArgument n) - { - } - - public void visit(WtTemplateParameter n) - { - } - - public void visit(WtTagExtension n) - { - } - - - // ========================================================================= - - private void newline(int num) - { - if (pastBod) - { - if (num > 
needNewlines) { - needNewlines = num; - } - } - } - - private void wantSpace() - { - if (pastBod) { - needSpace = true; - } - } - - private void finishLine() - { - sb.append(line.toString()); - line.setLength(0); - } - - private void writeNewlines(int num) - { - finishLine(); - sb.append(StringTools.strrep('\n', num)); - needNewlines = 0; - needSpace = false; - } - - private void writeWord(String s) - { - int length = s.length(); - if (length == 0) { - return; - } - - if (!noWrap && needNewlines <= 0) - { - if (needSpace) { - length += 1; - } - - if (line.length() + length >= wrapCol && line.length() > 0) { - writeNewlines(1); - } - } - - if (needSpace && needNewlines <= 0) { - line.append(' '); - } - - if (needNewlines > 0) { - writeNewlines(needNewlines); - } - - needSpace = false; - pastBod = true; - line.append(s); - } - - private void write(String s) - { - if (s.isEmpty()) { - return; - } - - if (Character.isSpaceChar(s.charAt(0))) { - wantSpace(); - } - - String[] words = ws.split(s); - for (int i = 0; i < words.length;) - { - writeWord(words[i]); - if (++i < words.length) { - wantSpace(); - } - } - - char charAtEnd = s.charAt(s.length() - 1); - if('\n' == charAtEnd){ - writeNewlines(1); - } - if (Character.isSpaceChar(charAtEnd)) { - wantSpace(); - } - } - - private void write(char[] cs) - { - write(String.valueOf(cs)); - } - - private void write(char ch) - { - writeWord(String.valueOf(ch)); - } - - private void write(int num) - { - writeWord(String.valueOf(num)); - } + } + + public void visit(WtXmlElement e) { + if (e.getName().equalsIgnoreCase("br")) { + newline(1); + } else { + iterate(e.getBody()); + } + } + + public void visit(WtXmlEndTag t) { + iterate(t); + } + + public void visit(WtXmlAttribute n) { + // ignore formatting information from xml attributes as the result is expected in plain text + } + + public void visit(WtListItem n) { + iterate(n); + } + + /** + * Called when a {@link WtTable table structure} is about to be processed. 
+ * + * @param n A node representing a table. + */ + public void visit(WtTable n) { + iterate(n); + } + + /** + * Called when an inner {@link WtTableImplicitTableBody table body} is about to be processed. + * + * @param n A node representing a table body. + */ + public void visit(WtTableImplicitTableBody n) { + iterate(n); + } + + /** + * Called when a {@link WtTableCaption table caption} is about to be processed. + * + * @param n A node representing a table caption. + */ + public void visit(WtTableCaption n) { + iterate(n); + } + + /** + * Called when a {@link WtTableRow table row} is about to be processed. + * + * @param n A node representing a table row. + */ + public void visit(WtTableRow n) { + if (currentRow == null) { + currentRow = new ArrayList<>(); + iterate(n); + if (currentRow.size() > 0) { + rows.add(currentRow); + } + if (currentRow.size() == n.getBody().size()) { + StringBuilder tableRowFormatted = new StringBuilder(); + for (int i = 0; i < currentRow.size(); i++) { + tableRowFormatted.append(currentRow.get(i)); + if (i + 1 < currentRow.size()) { + // appending a separator char only in between cells here + tableRowFormatted.append('|'); + } + } + writeWord(tableRowFormatted.toString()); + } + currentRow = null; + } + } + + /** + * Called when a header {@link WtTableHeader cell} is about to be processed. + * + * @param n A node representing a table header cell. + */ + public void visit(WtTableHeader n) { + processCellContent(n); + } + + /** + * Called when a regular {@link WtTableCell cell} is about to be processed. + * + * @param n A node representing a table header cell. 
+ */ + public void visit(WtTableCell n) { + processCellContent(n); + } + + private void processCellContent(WtInnerNode2 n) { + if (currentRow != null) { + currentCell = new StringBuilder(); + iterate(n); + String cellValue = currentCell.toString().trim(); + if (currentLinkTitleInCell != null) { + cellValue = currentLinkTitleInCell + " " + cellValue; + currentLinkTitleInCell = null; + } + currentRow.add(cellValue); + currentCell = null; + } + } + + + // ========================================================================= + // Stuff we want to hide + + public void visit(WtImageLink n) { + } + + public void visit(WtIllegalCodePoint n) { + } + + public void visit(WtXmlComment n) { + } + + public void visit(WtTemplate n) { + } + + public void visit(WtTemplateArgument n) { + } + + public void visit(WtTemplateParameter n) { + } + + public void visit(WtTagExtension n) { + } + + + // ========================================================================= + + private void newline(int num) { + if (pastBod) { + if (num > needNewlines) { + needNewlines = num; + } + } + } + + private void wantSpace() { + if (pastBod) { + needSpace = true; + } + } + + private void finishLine() { + sb.append(line.toString()); + line.setLength(0); + } + + private void writeNewlines(int num) { + finishLine(); + sb.append(StringTools.strrep('\n', num)); + needNewlines = 0; + needSpace = false; + } + + private void writeWord(String s) { + int length = s.length(); + if (length == 0) { + return; + } + + if (!noWrap && needNewlines <= 0) { + if (needSpace) { + length += 1; + } + + if (line.length() + length >= wrapCol && line.length() > 0) { + writeNewlines(1); + } + } + + if (needSpace && needNewlines <= 0) { + line.append(' '); + } + + if (needNewlines > 0) { + writeNewlines(needNewlines); + } + + needSpace = false; + pastBod = true; + line.append(s); + } + + private void write(String s) { + if (s.isEmpty()) { + return; + } + + if (Character.isSpaceChar(s.charAt(0))) { + wantSpace(); + } + + 
String[] words = ws.split(s); + for (int i = 0; i < words.length; ) { + writeWord(words[i]); + if (++i < words.length) { + wantSpace(); + } + } + + char charAtEnd = s.charAt(s.length() - 1); + if ('\n' == charAtEnd) { + writeNewlines(1); + } + if (Character.isSpaceChar(charAtEnd)) { + wantSpace(); + } + } + + private void write(char[] cs) { + write(String.valueOf(cs)); + } + + private void write(char ch) { + writeWord(String.valueOf(ch)); + } + + private void write(int num) { + writeWord(String.valueOf(num)); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java index 46d32810..b1f1a28e 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -40,74 +40,66 @@ /** * A visitor that extracts template names (no parameters) from an article AST. 
*/ -public class TemplateNameExtractor extends AstVisitor<WtNode> -{ - private final WikiConfig config; - - - private List<String> templates; - - // ========================================================================= - - - /** - * Creates a new visitor that extracts anchors of internal links from a - * parsed Wikipedia article using the default Sweble config as defined - * in WikiConstants.SWEBLE_CONFIG. - */ - public TemplateNameExtractor() - { - this.config = DefaultConfigEnWp.generate(); - } - - /** - * Creates a new visitor that extracts anchors of internal links from a - * parsed Wikipedia article. - * - * @param config the Sweble configuration - */ - public TemplateNameExtractor(WikiConfig config) - { - this.config = config; - } - - @Override - protected WtNode before(WtNode node) - { - // This method is called by go() before visitation starts - templates = new LinkedList<>(); - return super.before(node); - } - - @Override - protected Object after(WtNode node, Object result) - { - return templates; - } - - // ========================================================================= - - public void visit(WtNode n) - { - iterate(n); - } - - public void visit(WtTemplate tmpl) throws IOException - { - for(AstNode n: tmpl.getName()){ - if(n instanceof AstText){ - add(((AstText)n).getContent()); - } - } - } - - private void add(String s) - { - s=s.replace("\n", "").replace("\r", ""); - if (s.trim().isEmpty()) { - return; - } - templates.add(s); - } +public class TemplateNameExtractor extends AstVisitor<WtNode> { + private final WikiConfig config; + + + private List<String> templates; + + // ========================================================================= + + + /** + * Creates a new visitor that extracts anchors of internal links from a + * parsed Wikipedia article using the default Sweble config as defined + * in WikiConstants.SWEBLE_CONFIG. 
+ */ + public TemplateNameExtractor() { + this.config = DefaultConfigEnWp.generate(); + } + + /** + * Creates a new visitor that extracts anchors of internal links from a + * parsed Wikipedia article. + * + * @param config the Sweble configuration + */ + public TemplateNameExtractor(WikiConfig config) { + this.config = config; + } + + @Override + protected WtNode before(WtNode node) { + // This method is called by go() before visitation starts + templates = new LinkedList<>(); + return super.before(node); + } + + @Override + protected Object after(WtNode node, Object result) { + return templates; + } + + // ========================================================================= + + public void visit(WtNode n) { + iterate(n); + } + + public void visit(WtTemplate tmpl) throws IOException { + for (AstNode n : tmpl.getName()) { + if (n instanceof AstText) { + add(((AstText) n).getContent()); + } + } + } + + private void add(String s) { + s = s.replace("\n", "").replace("\r", ""); + if (s.trim().isEmpty()) { + return; + } + templates.add(s); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/GraphSerialization.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/GraphSerialization.java index ba2733bb..5303152c 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/GraphSerialization.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/GraphSerialization.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -32,89 +32,88 @@ /** * Utility for serializing and deserializing {@link DefaultDirectedGraph} objects, that are<br> * wrapped into {@link SerializableDirectedGraph} objects. - * */ public final class GraphSerialization { - /** - * This class cannot be instantiated. - */ - private GraphSerialization() { - } + /** + * This class cannot be instantiated. + */ + private GraphSerialization() { + } - /** - * Serializes the given {@link DefaultDirectedGraph} object to the given location. - * - * @param graph Must not be {@code null}. - * @param location Must not be {@code null} and a valid file path. - * @throws IOException Thrown if errors occurred on the IO level. - */ - public static void saveGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph, String location) throws IOException { - File file = new File(location); - file.createNewFile(); - if (!file.canWrite()) { - throw new IOException("Cannot write to file " + location); - } - GraphSerialization.saveGraph(graph, file); - } + /** + * Serializes the given {@link DefaultDirectedGraph} object to the given location. + * + * @param graph Must not be {@code null}. + * @param location Must not be {@code null} and a valid file path. + * @throws IOException Thrown if errors occurred on the IO level. + */ + public static void saveGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph, String location) throws IOException { + File file = new File(location); + file.createNewFile(); + if (!file.canWrite()) { + throw new IOException("Cannot write to file " + location); + } + GraphSerialization.saveGraph(graph, file); + } - /** - * Serializes the given {@link DefaultDirectedGraph} object to the given location. - * - * @param graph Must not be {@code null}. 
- * @param file Must not be {@code null} and valid {@link File}. - * @throws IOException Thrown if errors occurred on the IO level. - */ - public static void saveGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph, File file) throws IOException { - SerializableDirectedGraph serialGraph = new SerializableDirectedGraph(graph); - BufferedOutputStream fos; - ObjectOutputStream out; - fos = new BufferedOutputStream(new FileOutputStream(file)); - out = new ObjectOutputStream(fos); - out.writeObject(serialGraph); - out.close(); + /** + * Serializes the given {@link DefaultDirectedGraph} object to the given location. + * + * @param graph Must not be {@code null}. + * @param file Must not be {@code null} and valid {@link File}. + * @throws IOException Thrown if errors occurred on the IO level. + */ + public static void saveGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph, File file) throws IOException { + SerializableDirectedGraph serialGraph = new SerializableDirectedGraph(graph); + BufferedOutputStream fos; + ObjectOutputStream out; + fos = new BufferedOutputStream(new FileOutputStream(file)); + out = new ObjectOutputStream(fos); + out.writeObject(serialGraph); + out.close(); - } + } - /** - * Deserializes a {@link SerializableDirectedGraph} object that is stored in the - * given location. This method returns the {@link DefaultDirectedGraph} object, that is wrapped - * in the {@link SerializableDirectedGraph}. - * - * @param location Must not be {@code null} and a valid file path. - * @return The {@link DefaultDirectedGraph} object, that is wrapped in the - * {@link SerializableDirectedGraph}. - * @throws IOException Thrown if errors occurred on the IO level. - * @throws ClassNotFoundException Thrown if a class could not be find while deserialization. 
- */ - public static DefaultDirectedGraph<Integer, DefaultEdge> loadGraph(String location) - throws IOException, ClassNotFoundException { - File file = new File(location); - if (!file.canWrite()) { - throw new IOException("Cannot read from file " + location); - } - return GraphSerialization.loadGraph(file); - } + /** + * Deserializes a {@link SerializableDirectedGraph} object that is stored in the + * given location. This method returns the {@link DefaultDirectedGraph} object, that is wrapped + * in the {@link SerializableDirectedGraph}. + * + * @param location Must not be {@code null} and a valid file path. + * @return The {@link DefaultDirectedGraph} object, that is wrapped in the + * {@link SerializableDirectedGraph}. + * @throws IOException Thrown if errors occurred on the IO level. + * @throws ClassNotFoundException Thrown if a class could not be found during deserialization. + */ + public static DefaultDirectedGraph<Integer, DefaultEdge> loadGraph(String location) + throws IOException, ClassNotFoundException { + File file = new File(location); + if (!file.canWrite()) { + throw new IOException("Cannot read from file " + location); + } + return GraphSerialization.loadGraph(file); + } - /** - * Deserializes a {@link SerializableDirectedGraph} object that is stored in the - * given location. This method returns the {@link DefaultDirectedGraph} object, that is wrapped - * in the {@link SerializableDirectedGraph}. - * - * @param file Must not be {@code null} and valid {@link File}. - * @return The {@link DefaultDirectedGraph} object, that is wrapped in the - * {@link SerializableDirectedGraph}. - * @throws IOException Thrown if errors occurred on the IO level. - * @throws ClassNotFoundException Thrown if a class could not be find while deserialization.
- */ - public static DefaultDirectedGraph<Integer, DefaultEdge> loadGraph(File file) throws IOException, ClassNotFoundException { - SerializableDirectedGraph serialGraph; - BufferedInputStream fin; - ObjectInputStream in; - fin = new BufferedInputStream(new FileInputStream(file)); - in = new ObjectInputStream(fin); - serialGraph = (SerializableDirectedGraph) in.readObject(); - in.close(); - return serialGraph.getGraph(); - } + /** + * Deserializes a {@link SerializableDirectedGraph} object that is stored in the + * given location. This method returns the {@link DefaultDirectedGraph} object, that is wrapped + * in the {@link SerializableDirectedGraph}. + * + * @param file Must not be {@code null} and valid {@link File}. + * @return The {@link DefaultDirectedGraph} object, that is wrapped in the + * {@link SerializableDirectedGraph}. + * @throws IOException Thrown if errors occurred on the IO level. + * @throws ClassNotFoundException Thrown if a class could not be find while deserialization. + */ + public static DefaultDirectedGraph<Integer, DefaultEdge> loadGraph(File file) throws IOException, ClassNotFoundException { + SerializableDirectedGraph serialGraph; + BufferedInputStream fin; + ObjectInputStream in; + fin = new BufferedInputStream(new FileInputStream(file)); + in = new ObjectInputStream(fin); + serialGraph = (SerializableDirectedGraph) in.readObject(); + in.close(); + return serialGraph.getGraph(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/SerializableDirectedGraph.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/SerializableDirectedGraph.java index 2029c60a..65be4715 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/SerializableDirectedGraph.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/SerializableDirectedGraph.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,34 +23,35 @@ import org.jgrapht.graph.DefaultEdge; /** - * Serializable Wrapper for a DirectedGraph object, that has Integer objects as vertices and DefaultEdge objects as edges.<br> - * There is no need in this case to serializale vertices and edges separately, because they already implement the interface Serializable. - * - * + * Serializable Wrapper for a DirectedGraph object, that has Integer objects as vertices and + * {@link DefaultEdge} objects as edges.<br> + * + * There is no need in this case to serializable vertices and edges separately, + * because they already implement the interface Serializable. */ public final class SerializableDirectedGraph implements Serializable { - /** - * Generated serial ID. - */ - private static final long serialVersionUID = -8298189410676038723L; + /** + * Generated serial ID. + */ + private static final long serialVersionUID = -192220033577521277L; - private final DefaultDirectedGraph<Integer,DefaultEdge> graph; + private final DefaultDirectedGraph<Integer, DefaultEdge> graph; - /** - * This Constructor is intended to be used before the serialization of the <br> - * directed graph. 
- * @param graph - */ - public SerializableDirectedGraph(DefaultDirectedGraph<Integer,DefaultEdge> graph){ - this.graph = graph; - } + /** + * This Constructor is intended to be used before the serialization of the <br> + * directed graph. + * + * @param graph + */ + public SerializableDirectedGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph) { + this.graph = graph; + } - /** - * Returns the graph. - * @return - */ - public DefaultDirectedGraph<Integer,DefaultEdge> getGraph(){ - return graph; - } + /** + * @return A {@link DefaultDirectedGraph graph} instance. + */ + public DefaultDirectedGraph<Integer, DefaultEdge> getGraph() { + return graph; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/ApiUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/ApiUtilities.java index d620f85b..0ab784d8 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/ApiUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/ApiUtilities.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -17,83 +17,47 @@ */ package org.dkpro.jwpl.util; +import java.lang.invoke.MethodHandles; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.lang.invoke.MethodHandles; - public class ApiUtilities { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - /** - * DOTS - print progress dots. - * TEXT - print a message with progress in percent. - * - */ - public enum ProgressInfoMode { DOTS, TEXT } + /** + * DOTS - print progress dots. + * TEXT - print a message with progress in percent. + */ + public enum ProgressInfoMode {DOTS, TEXT} /** - * Prints a progress counter. - * @param counter Indicates the position in the task. - * @param size Size of the overall task. - * @param step How many parts should the progress counter have? - * @param mode Sets the output mode. - * @param text The text that should be print along with the progress indicator. - */ - public static void printProgressInfo(int counter, int size, int step, ProgressInfoMode mode, String text) { - if (size < step) { - return; - } + * Prints a progress counter. + * + * @param counter Indicates the position in the task. + * @param size Size of the overall task. + * @param step How many parts should the progress counter have? + * @param mode Sets the output mode. + * @param text The text that should be print along with the progress indicator. 
+ */ + public static void printProgressInfo(int counter, int size, int step, ProgressInfoMode mode, String text) { + if (size < step) { + return; + } - if (counter % (size / step) == 0) { - double progressPercent = counter * 100 / size; - progressPercent = 1 + Math.round(progressPercent * 100) / 100.0; - if (mode.equals(ApiUtilities.ProgressInfoMode.TEXT)) { - logger.info(text + ": " + progressPercent + " - " + OS.getUsedMemory() + " MB"); - } - else if (mode.equals(ApiUtilities.ProgressInfoMode.DOTS)) { - System.out.print("."); - if (progressPercent >= 100) { - System.out.println(); - } - } + if (counter % (size / step) == 0) { + double progressPercent = counter * 100 / size; + progressPercent = 1 + Math.round(progressPercent * 100) / 100.0; + if (mode.equals(ApiUtilities.ProgressInfoMode.TEXT)) { + logger.info(text + ": " + progressPercent + " - " + OS.getUsedMemory() + " MB"); + } else if (mode.equals(ApiUtilities.ProgressInfoMode.DOTS)) { + System.out.print("."); + if (progressPercent >= 100) { + System.out.println(); } + } } - - -// /** -// * Serialize an instance of CategoryGraph. -// * -// * @param fileName -// * Complete path and file name. -// */ -// public static void saveCategoryGraph(String fileName, CategoryGraph catGraph) { -// try { -// ObjectOutputStream file = new ObjectOutputStream( -// new FileOutputStream(fileName)); -// file.writeObject(catGraph); -// file.close(); -// } catch (Exception e) { -// e.printStackTrace(); -// } -// } -// -// /** -// * Load a serialized instance of CategoryGraph. -// * @param fileName Complete path and file name. 
-// */ -// public static CategoryGraph loadCategoryGraph(String fileName) { -// CategoryGraph catGraph; -// try { -// ObjectInputStream file = new ObjectInputStream(new FileInputStream( -// fileName)); -// catGraph = (CategoryGraph) file.readObject(); -// file.close(); -// } catch (Exception e) { -// e.printStackTrace(); -// return null; -// } -// return catGraph; -// } + } + } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/CommonUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/CommonUtilities.java index 492cb272..d5de0847 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/CommonUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/CommonUtilities.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,43 +23,45 @@ public class CommonUtilities { - /** - * Debug output an internal set structure. - * @param s Must not be {@code null}. - * @return The resulting String of the contents of {@code s}. - */ - public static String getSetContents(Set<?> s) { - Object[] sortedArray = s.toArray(); - Arrays.sort(sortedArray); + /** + * Debug output an internal set structure. + * + * @param s Must not be {@code null}. + * @return The resulting String of the contents of {@code s}. 
+ */ + public static String getSetContents(Set<?> s) { + Object[] sortedArray = s.toArray(); + Arrays.sort(sortedArray); - int counter = 0; - int elementsPerRow = 10; - StringBuffer sb = new StringBuffer(1000); - for (Object element : sortedArray) { - sb.append(element.toString() + " "); - counter++; - if ((counter % elementsPerRow) == 0) { - sb.append(System.getProperty("line.separator")); - } - } + int counter = 0; + int elementsPerRow = 10; + StringBuffer sb = new StringBuffer(1000); + for (Object element : sortedArray) { + sb.append(element.toString()).append(" "); + counter++; + if ((counter % elementsPerRow) == 0) { sb.append(System.getProperty("line.separator")); - return sb.toString(); + } } + sb.append(System.getProperty("line.separator")); + return sb.toString(); + } - /** - * Debug output an internal map structure as key-value pairs. - * @param m Must not be {@code null}. - * @return The resulting String of the contents of {@code m}. - */ - public static String getMapContents(Map<?,?> m) { - Object[] sortedArray = m.keySet().toArray(); - Arrays.sort(sortedArray); + /** + * Debug output an internal map structure as key-value pairs. + * + * @param m Must not be {@code null}. + * @return The resulting String of the contents of {@code m}. 
+ */ + public static String getMapContents(Map<?, ?> m) { + Object[] sortedArray = m.keySet().toArray(); + Arrays.sort(sortedArray); - StringBuffer sb = new StringBuffer(1000); - for (Object element : sortedArray) { - sb.append(element.toString() + " - " + m.get(element) + System.getProperty("line.separator")); - } - return sb.toString(); + StringBuffer sb = new StringBuffer(1000); + for (Object element : sortedArray) { + sb.append(element.toString()).append(" - ").append(m.get(element)).append(System.getProperty("line.separator")); } + return sb.toString(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/DbUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/DbUtilities.java index a02c0586..cce77c4b 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/DbUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/DbUtilities.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -28,36 +28,35 @@ public class DbUtilities { - private final Connection conn; + private final Connection conn; - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - public DbUtilities(Connection conn) { - this.conn = conn; - } + public DbUtilities(Connection conn) { + this.conn = conn; + } - public boolean tableExists(String tableName) { + public boolean tableExists(String tableName) { - try { - DatabaseMetaData dbmd = conn.getMetaData(); + try { + DatabaseMetaData dbmd = conn.getMetaData(); - // Specify the type of object; in this case we want tables - String[] types = {"TABLE"}; + // Specify the type of object; in this case we want tables + String[] types = {"TABLE"}; - // get all table names - ResultSet resultSet = dbmd.getTables(null, null, "%", types); + // get all table names + ResultSet resultSet = dbmd.getTables(null, null, "%", types); - while (resultSet.next()) { - if (resultSet.getString("TABLE_NAME").equals(tableName)) { - return true; - } - } - } - catch (SQLException e) { - logger.error("Table " + tableName + " does not exist.", new RuntimeException(e)); + while (resultSet.next()) { + if (resultSet.getString("TABLE_NAME").equals(tableName)) { + return true; } - - return false; + } + } catch (SQLException e) { + logger.error("Table " + tableName + " does not exist.", new RuntimeException(e)); } + return false; + } + } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/GraphUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/GraphUtilities.java index 0f79d95c..c7fb6f54 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/GraphUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/GraphUtilities.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,54 +28,56 @@ public class GraphUtilities { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - public static Set<Integer> getRandomPageSubset(Iterable<Page> pages, int pResultSetSize) { - Set<Integer> pageIDs = new HashSet<>(); - while (pages.iterator().hasNext()) { - pageIDs.add(pages.iterator().next().getPageId()); - } - return getRandomPageSubset(pageIDs, pResultSetSize); + public static Set<Integer> getRandomPageSubset(Iterable<Page> pages, int pResultSetSize) { + Set<Integer> pageIDs = new HashSet<>(); + while (pages.iterator().hasNext()) { + pageIDs.add(pages.iterator().next().getPageId()); } + return getRandomPageSubset(pageIDs, pResultSetSize); + } - /** Get a random subset (of size pSize) of the page set passed to the method. - * @param pPageIDs The pages. - * @param pResultSetSize The size of the result set. - * @return A random subset of the original page set of the given size or null, if the requested subset size is larger than the original page set. - */ - public static Set<Integer> getRandomPageSubset(Set<Integer> pPageIDs, int pResultSetSize) { + /** + * Get a random subset (of size pSize) of the page set passed to the method. 
+ * + * @param pPageIDs The pages. + * @param pResultSetSize The size of the result set. + * @return A random subset of the original page set of the given size or null, if the requested subset size is larger than the original page set. + */ + public static Set<Integer> getRandomPageSubset(Set<Integer> pPageIDs, int pResultSetSize) { - Set<Integer> uniqueRandomSet = new HashSet<>(); + Set<Integer> uniqueRandomSet = new HashSet<>(); - if (pPageIDs.size() < pResultSetSize) { - logger.warn("Requested subset size is larger than the original page set size."); - return null; - } + if (pPageIDs.size() < pResultSetSize) { + logger.warn("Requested subset size is larger than the original page set size."); + return null; + } - Random rand = new Random(); + Random rand = new Random(); - Object[] pageIdArray = pPageIDs.toArray(); + Object[] pageIdArray = pPageIDs.toArray(); - // If pSize is quite close to the size of the original pageSet the probability of generating the offset of the last missing pageIDs is quite low, with the consequence of unpredictable run-time. - // => if more than the half of pages should be included in the result set, better remove random numbers than adding them - if (pResultSetSize > (pPageIDs.size() / 2)) { - uniqueRandomSet.addAll(pPageIDs); - while (uniqueRandomSet.size() > pResultSetSize) { - int randomOffset = rand.nextInt(pPageIDs.size()); - if (uniqueRandomSet.contains(pageIdArray[randomOffset])) { - uniqueRandomSet.remove(pageIdArray[randomOffset]); - } - } + // If pSize is quite close to the size of the original pageSet the probability of generating the offset of the last + // missing pageIDs is quite low, with the consequence of unpredictable run-time. 
+ // => if more than the half of pages should be included in the result set, better remove random numbers than adding them + if (pResultSetSize > (pPageIDs.size() / 2)) { + uniqueRandomSet.addAll(pPageIDs); + while (uniqueRandomSet.size() > pResultSetSize) { + int randomOffset = rand.nextInt(pPageIDs.size()); + if (uniqueRandomSet.contains(pageIdArray[randomOffset])) { + uniqueRandomSet.remove(pageIdArray[randomOffset]); } - else { - while (uniqueRandomSet.size() < pResultSetSize) { - int randomOffset = rand.nextInt(pPageIDs.size()); - if (!uniqueRandomSet.contains(pageIdArray[randomOffset])) { - uniqueRandomSet.add((Integer)pageIdArray[randomOffset]); - } - } + } + } else { + while (uniqueRandomSet.size() < pResultSetSize) { + int randomOffset = rand.nextInt(pPageIDs.size()); + if (!uniqueRandomSet.contains(pageIdArray[randomOffset])) { + uniqueRandomSet.add((Integer) pageIdArray[randomOffset]); } - - return uniqueRandomSet; + } } + + return uniqueRandomSet; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java index a8e8f9bc..ad7b17f1 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,45 +28,49 @@ public class HibernateUtilities implements WikiConstants { - private final DatabaseConfiguration dbConfig; + private final DatabaseConfiguration dbConfig; - public HibernateUtilities(Language pLanguage, DatabaseConfiguration dbConfig) { - this.dbConfig = dbConfig; - } + public HibernateUtilities(Language pLanguage, DatabaseConfiguration dbConfig) { + this.dbConfig = dbConfig; + } - /** Hibernate IDs are needed to load an object from the database. - * Internal references are via pageIDs. - * @return A mapping of pageIDs to hibernate IDs. - */ - public Map<Integer, Long> getIdMappingPages() { - Map<Integer, Long> idMapping = new HashMap<>(); + /** + * Hibernate IDs are needed to load an object from the database. + * Internal references are via pageIDs. + * + * @return A mapping of pageIDs to hibernate IDs. 
+ */ + public Map<Integer, Long> getIdMappingPages() { + Map<Integer, Long> idMapping = new HashMap<>(); - Session session = WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select page.id, page.pageId from Page as page").list()) { - Object[] row = (Object[]) o; - // put (pageID, id) - idMapping.put((Integer) row[1], (Long) row[0]); - } - session.getTransaction().commit(); - return idMapping; + Session session = WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); + session.beginTransaction(); + for (Object o : session.createQuery("select page.id, page.pageId from Page as page").list()) { + Object[] row = (Object[]) o; + // put (pageID, id) + idMapping.put((Integer) row[1], (Long) row[0]); } + session.getTransaction().commit(); + return idMapping; + } - /** Hibernate IDs are needed to load an object from the database. - * Internal references are via pageIDs. - * @return A mapping of pageIDs to hibernate IDs. - */ - public Map<Integer, Long> getIdMappingCategories() { - Map<Integer, Long> idMapping = new HashMap<>(); + /** + * Hibernate IDs are needed to load an object from the database. + * Internal references are via pageIDs. + * + * @return A mapping of pageIDs to hibernate IDs. 
+ */ + public Map<Integer, Long> getIdMappingCategories() { + Map<Integer, Long> idMapping = new HashMap<>(); - Session session = WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select cat.id, cat.pageId from Category as cat").list()) { - Object[] row = (Object[]) o; - // put (pageID, id) - idMapping.put((Integer) row[1], (Long) row[0]); - } - session.getTransaction().commit(); - return idMapping; + Session session = WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); + session.beginTransaction(); + for (Object o : session.createQuery("select cat.id, cat.pageId from Category as cat").list()) { + Object[] row = (Object[]) o; + // put (pageID, id) + idMapping.put((Integer) row[1], (Long) row[0]); } + session.getTransaction().commit(); + return idMapping; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/OS.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/OS.java index 8414341f..30c26f23 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/OS.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/OS.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,31 +19,34 @@ public class OS { - /** Tries to determine the tpye of OS the application is running on. - * At the moment only Windows and Linux are supported. - * @return The type of OS the application is running on. Or "unknown" if the system is unknown. - */ - public static String getOsType() { - String osType = "unknown"; - String osName = System.getProperty("os.name"); - if (osName.contains("Windows")) { - osType = "Windows"; - } - else if (osName.contains("Linux")) { - osType = "Linux"; - } - return osType; + /** + * Tries to determine the type of OS the application is running on. + * At the moment only Windows and Linux are supported. + * + * @return The type of OS the application is running on. Or "unknown" if the system is unknown. + */ + public static String getOsType() { + String osType = "unknown"; + String osName = System.getProperty("os.name"); + if (osName.contains("Windows")) { + osType = "Windows"; + } else if (osName.contains("Linux")) { + osType = "Linux"; } + return osType; + } - /** Gets the memory used by the JVM in MB. - * @return Returns how much memory (in MB) is used by the JVM at the moment. - */ - public static double getUsedMemory() { - Runtime rt = Runtime.getRuntime(); - - long memLong = rt.totalMemory() - rt.freeMemory(); - double memDouble = memLong / (1024.0 * 1024.0); - memDouble = Math.round(memDouble * 100) / 100.0; - return memDouble; - } + /** + * Gets the memory used by the JVM in MB. + * + * @return Returns how much memory (in MB) is used by the JVM at the moment. 
+ */ + public static double getUsedMemory() { + Runtime rt = Runtime.getRuntime(); + + long memLong = rt.totalMemory() - rt.freeMemory(); + double memDouble = memLong / (1024.0 * 1024.0); + memDouble = Math.round(memDouble * 100) / 100.0; + return memDouble; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/StringUtils.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/StringUtils.java index a6227ab3..3a215fed 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/StringUtils.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/StringUtils.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,86 +26,83 @@ public class StringUtils { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private static final StringBuilder buffer = new StringBuilder(10_000_000); + private static final StringBuilder buffer = new StringBuilder(10_000_000); - /** - * Joins the elements of a collection into a string. - * - * @param c - * The collection which elements should be joined. - * @param delimiter - * String that is introduced between two joined elements. - * @return The joined string. 
- */ - public static String join(Collection<?> c, String delimiter) { - buffer.setLength(0); - Iterator<?> iter = c.iterator(); - while (iter.hasNext()) { - buffer.append(iter.next()); - if (iter.hasNext()) { - buffer.append(delimiter); - } - } - return buffer.toString(); - } + /** + * Joins the elements of a collection into a string. + * + * @param c The collection which elements should be joined. + * @param delimiter String that is introduced between two joined elements. + * @return The joined string. + */ + public static String join(Collection<?> c, String delimiter) { + buffer.setLength(0); + Iterator<?> iter = c.iterator(); + while (iter.hasNext()) { + buffer.append(iter.next()); + if (iter.hasNext()) { + buffer.append(delimiter); + } + } + return buffer.toString(); + } - /** - * Replaces all problematic characters from a String with their escaped - * versions to make it SQL conform. - * - * @param str - * unescaped String - * @return SQL safe escaped String - */ - public static String sqlEscape(String str) { - final int len = str.length(); - buffer.setLength(0); - StringBuilder sql = buffer; + /** + * Replaces all problematic characters from a String with their escaped + * versions to make it SQL conform. 
+ * + * @param str unescaped String + * @return SQL safe escaped String + */ + public static String sqlEscape(String str) { + final int len = str.length(); + buffer.setLength(0); + StringBuilder sql = buffer; - for (int i = 0; i < len; i++) { - char c = str.charAt(i); - switch (c) { - case '\u0000': - sql.append('\\').append('0'); - break; - case '\n': - sql.append('\\').append('n'); - break; - case '\t': - sql.append('\\').append('t'); - break; - case '\r': - sql.append('\\').append('r'); - break; - case '\u001a': - sql.append('\\').append('Z'); - break; - case '\'': - sql.append('\\').append('\''); - break; - case '\"': - sql.append('\\').append('"'); - break; - case '\b': - sql.append('\\').append('b'); - break; - case '\\': - sql.append('\\').append('\\'); - break; - // case '%': - // sql.append('[').append('%').append(']'); - // break; - // case '_': - // sql.append('[').append('_').append(']'); - // break; - default: - sql.append(c); - break; - } - } - return sql.toString(); - } + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + switch (c) { + case '\u0000': + sql.append('\\').append('0'); + break; + case '\n': + sql.append('\\').append('n'); + break; + case '\t': + sql.append('\\').append('t'); + break; + case '\r': + sql.append('\\').append('r'); + break; + case '\u001a': + sql.append('\\').append('Z'); + break; + case '\'': + sql.append('\\').append('\''); + break; + case '\"': + sql.append('\\').append('"'); + break; + case '\b': + sql.append('\\').append('b'); + break; + case '\\': + sql.append('\\').append('\\'); + break; + // case '%': + // sql.append('[').append('%').append(']'); + // break; + // case '_': + // sql.append('[').append('_').append(']'); + // break; + default: + sql.append(c); + break; + } + } + return sql.toString(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/UnmodifiableArraySet.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/UnmodifiableArraySet.java index 59cd5dc0..626d3fc8 100644 --- 
a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/UnmodifiableArraySet.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/UnmodifiableArraySet.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,116 +22,99 @@ import java.util.Iterator; import java.util.Set; -public class UnmodifiableArraySet<E> - implements Set<E> -{ - private final Object[] data; - - public UnmodifiableArraySet(E[] aData) - { - data = new Object[aData.length]; - System.arraycopy(aData, 0, data, 0, data.length); - } - - public UnmodifiableArraySet(Set<E> aData) - { - data = new Object[aData.size()]; - System.arraycopy(aData.toArray(), 0, data, 0, data.length); - } - - @Override - public int size() - { - return data != null ? data.length : 0; - } - - @Override - public boolean isEmpty() - { - return data != null ? 
data.length > 0 : true; - } - - @Override - public boolean contains(Object aO) - { - if (data == null) { - return false; - } - for (Object d : data) { - if (d.equals(aO)) { - return true; - } - } - return false; - } - - @SuppressWarnings("unchecked") - @Override - public Iterator<E> iterator() - { - return (Iterator<E>) Arrays.asList(data).iterator(); - } - - @Override - public Object[] toArray() - { - return data; - } - - @Override - public <T> T[] toArray(T[] aA) - { - if (aA.length != data.length) { - throw new IllegalArgumentException("Target array too small"); - } - System.arraycopy(data, 0, aA, 0, aA.length); - return aA; - } - - @Override - public boolean add(E aE) - { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public boolean remove(Object aO) - { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public boolean containsAll(Collection<?> aC) - { - for (Object o : aC) { - if (!contains(o)) { - return false; - } - } - return true; - } - - @Override - public boolean addAll(Collection<? extends E> aC) - { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public boolean retainAll(Collection<?> aC) - { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public boolean removeAll(Collection<?> aC) - { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public void clear() - { - throw new UnsupportedOperationException("Unmodifiable set"); - } +public class UnmodifiableArraySet<E> implements Set<E> { + private final Object[] data; + + public UnmodifiableArraySet(E[] aData) { + data = new Object[aData.length]; + System.arraycopy(aData, 0, data, 0, data.length); + } + + public UnmodifiableArraySet(Set<E> aData) { + data = new Object[aData.size()]; + System.arraycopy(aData.toArray(), 0, data, 0, data.length); + } + + @Override + public int size() { + return data != null ? 
data.length : 0; + } + + @Override + public boolean isEmpty() { + return data != null ? data.length > 0 : true; + } + + @Override + public boolean contains(Object aO) { + if (data == null) { + return false; + } + for (Object d : data) { + if (d.equals(aO)) { + return true; + } + } + return false; + } + + @SuppressWarnings("unchecked") + @Override + public Iterator<E> iterator() { + return (Iterator<E>) Arrays.asList(data).iterator(); + } + + @Override + public Object[] toArray() { + return data; + } + + @Override + public <T> T[] toArray(T[] aA) { + if (aA.length != data.length) { + throw new IllegalArgumentException("Target array too small"); + } + System.arraycopy(data, 0, aA, 0, aA.length); + return aA; + } + + @Override + public boolean add(E aE) { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public boolean remove(Object aO) { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public boolean containsAll(Collection<?> aC) { + for (Object o : aC) { + if (!contains(o)) { + return false; + } + } + return true; + } + + @Override + public boolean addAll(Collection<? 
extends E> aC) { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public boolean retainAll(Collection<?> aC) { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public boolean removeAll(Collection<?> aC) { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public void clear() { + throw new UnsupportedOperationException("Unmodifiable set"); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/LevenshteinStringDistance.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/LevenshteinStringDistance.java index 356c6d27..5c6009cd 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/LevenshteinStringDistance.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/LevenshteinStringDistance.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,64 +19,65 @@ public class LevenshteinStringDistance implements StringDistance { - public double distance(String s, String t) { - int[][] d; // matrix - int n; // length of s - int m; // length of t - int i; // iterates through s - int j; // iterates through t - char s_i; // ith character of s - char t_j; // jth character of t - int cost; // cost + @Override + public double distance(String s, String t) { + int[][] d; // matrix + int n; // length of s + int m; // length of t + int i; // iterates through s + int j; // iterates through t + char s_i; // ith character of s + char t_j; // jth character of t + int cost; // cost - // Step 1 - n = s.length(); - m = t.length(); - if (n == 0) { - return m; - } - if (m == 0) { - return n; - } - d = new int[n + 1][m + 1]; + // Step 1 + n = s.length(); + m = t.length(); + if (n == 0) { + return m; + } + if (m == 0) { + return n; + } + d = new int[n + 1][m + 1]; - // Step 2 - for (i = 0; i <= n; i++) { - d[i][0] = i; - } - for (j = 0; j <= m; j++) { - d[0][j] = j; - } - // Step 3 - for (i = 1; i <= n; i++) { - s_i = s.charAt(i - 1); - // Step 4 - for (j = 1; j <= m; j++) { - t_j = t.charAt(j - 1); - // Step 5 - if (s_i == t_j) { - cost = 0; - } else { - cost = 1; - } - // Step 6 - d[i][j] = Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, - d[i - 1][j - 1] + cost); - } + // Step 2 + for (i = 0; i <= n; i++) { + d[i][0] = i; + } + for (j = 0; j <= m; j++) { + d[0][j] = j; + } + // Step 3 + for (i = 1; i <= n; i++) { + s_i = s.charAt(i - 1); + // Step 4 + for (j = 1; j <= m; j++) { + t_j = t.charAt(j - 1); + // Step 5 + if (s_i == t_j) { + cost = 0; + } else { + cost = 1; } - // Step 7 - return Integer.valueOf(d[n][m]).doubleValue(); + // Step 6 + d[i][j] = Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, + d[i - 1][j - 1] + cost); + } } + // Step 7 + return Integer.valueOf(d[n][m]).doubleValue(); + } - private int Minimum(int a, int b, int c) { - int min; - min = a; - if (b < min) { - min = b; - } - if (c < min) { - min = c; - } - return 
min; + private int Minimum(int a, int b, int c) { + int min; + min = a; + if (b < min) { + min = b; + } + if (c < min) { + min = c; } + return min; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/StringDistance.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/StringDistance.java index acc544e8..c538a265 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/StringDistance.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/StringDistance.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,6 +19,6 @@ public interface StringDistance { - double distance(String s1, String s2); + double distance(String s1, String s2); } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineFiles.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineFiles.java index ff29b706..1ae33455 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineFiles.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineFiles.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,159 +23,159 @@ import org.dkpro.jwpl.wikimachine.domain.Files; public class DataMachineFiles extends Files { - private final static String INPUT_PAGELINKS = "pagelinks.sql"; - private final static String INPUT_PAGESARTICLES = "pages-articles.xml"; - private final static String INPUT_CATEGORYLINKS = "categorylinks.sql"; - private final static String INPUT_PAGESMETACURRENT = "pages-meta-current.xml"; - - private final static String GENERATED_PAGE = "page.bin"; - private final static String GENERATED_REVISION = "revision.bin"; - private final static String GENERATED_TEXT = "text.bin"; - /* - * discussions.bin is currently unused. 
Discussions are put in pages.bin - */ - private final static String GENERATED_DISCUSSIONS = "discussions.bin"; - - private final static String ARCHIVE_EXTENSION = ".gz"; - - private File dataDirectory = new File("."); - private boolean compressGeneratedFiles = false; - - private File inputPagelinks = null; - private File inputPagesarticles = null; - private File inputCategorylinks = null; - private File inputPagesMetaCurrent = null; - - public DataMachineFiles(ILogger logger) { - super(logger); - outputDirectory = setOutputDirectory(dataDirectory); - } - - public DataMachineFiles(DataMachineFiles files) { - super(files); - this.dataDirectory = files.dataDirectory; - this.compressGeneratedFiles = files.compressGeneratedFiles; - this.inputPagelinks = files.inputPagelinks; - this.inputPagesarticles = files.inputPagesarticles; - this.inputCategorylinks = files.inputCategorylinks; - this.inputPagesMetaCurrent = files.inputPagesMetaCurrent; - } - - private File setOutputDirectory(File parentDirectory) { - File result = new File(parentDirectory.getAbsolutePath() - + File.separator + OUTPUT_DIRECTORY); - - return result; - } - - public void setDataDirectory(String newDataDirectory) { - File inputDataDirectory = new File(newDataDirectory); - if (inputDataDirectory.isDirectory()) { - this.dataDirectory = inputDataDirectory; - this.outputDirectory = setOutputDirectory(dataDirectory); - } else { - logger.log(dataDirectory - + " is not a directory. Continue read from " - + this.dataDirectory.getAbsolutePath()); - } - - } - - public boolean checkDatamachineSourceFiles() { - File[] filesInDataDirectory = dataDirectory.listFiles(); - if (filesInDataDirectory.length > 2) { - for (File currentFile : filesInDataDirectory) { - - //TODO improve file check. 
Only accept files that come in a supported compression format - String currentFileName = currentFile.getName(); - if (currentFileName.contains(INPUT_PAGESARTICLES)) { - inputPagesarticles = currentFile; - } else if (currentFileName.contains(INPUT_PAGELINKS)) { - inputPagelinks = currentFile; - } else if (currentFileName.contains(INPUT_CATEGORYLINKS)) { - inputCategorylinks = currentFile; - } else if (currentFileName.contains(INPUT_PAGESMETACURRENT)) { - inputPagesMetaCurrent = currentFile; - } - } - } - // either inputPagesarticles or inputPagesMetaCurrent have to be placed - // in the input directory - return !((inputPagesarticles == null && inputPagesMetaCurrent == null ) - || inputPagelinks == null || inputCategorylinks == null); - } - - public String getGeneratedPage() { - return getGeneratedPath(GENERATED_PAGE); - } - - public String getGeneratedRevision() { - return getGeneratedPath(GENERATED_REVISION); - } - - public String getGeneratedText() { - return getGeneratedPath(GENERATED_TEXT); - } - - public String getGeneratedDiscussions() { - return getGeneratedPath(GENERATED_DISCUSSIONS); - } - - public String getInputPageLinks() { - return (inputPagelinks != null) ? inputPagelinks.getAbsolutePath() - : null; - } - - public String getInputPagesArticles() { - return (inputPagesarticles != null) ? inputPagesarticles - .getAbsolutePath() : null; - } - - public String getInputCategoryLinks() { - return (inputCategorylinks != null) ? inputCategorylinks - .getAbsolutePath() : null; - } - - public String getInputPagesMetaCurrent() { - return (inputPagesMetaCurrent != null) ? 
inputPagesMetaCurrent - .getAbsolutePath() : null; - } - - - private String getGeneratedPath(String fileName) { - String path = dataDirectory.getAbsolutePath() + File.separator - + fileName; - if (compressGeneratedFiles) { - path = path.concat(ARCHIVE_EXTENSION); - } - return path; - } - - /** - * @see DataMachineFiles#setCompressGeneratedFiles(boolean) - */ - public boolean isCompressGeneratedFiles() { - return compressGeneratedFiles; - } - - /** - * Set the input parameter to {@code true} it you want to GZip the temporary - * files and save a disk space. <b>Attention:</b> {@code DataInputStream} - * can have problems reading from a compressed file. This can be a reason - * for strange side effects like heap overflow or some other exceptions. <br> - * For UKP-Developers: you can save much more disk space if you'll parse the - * page-articles XML Dump every time you need it: during processPage(), - * processRevision() and processText(). See TimeMachine solution especially - * the package org.dkpro.jwpl.timemachine.dump.xml - * - * @param compressGeneratedFiles - */ - public void setCompressGeneratedFiles(boolean compressGeneratedFiles) { - this.compressGeneratedFiles = compressGeneratedFiles; - } - - @Override - public boolean checkAll() { - return checkOutputDirectory() && checkDatamachineSourceFiles(); - } + private final static String INPUT_PAGELINKS = "pagelinks.sql"; + private final static String INPUT_PAGESARTICLES = "pages-articles.xml"; + private final static String INPUT_CATEGORYLINKS = "categorylinks.sql"; + private final static String INPUT_PAGESMETACURRENT = "pages-meta-current.xml"; + + private final static String GENERATED_PAGE = "page.bin"; + private final static String GENERATED_REVISION = "revision.bin"; + private final static String GENERATED_TEXT = "text.bin"; + /* + * discussions.bin is currently unused. 
Discussions are put in pages.bin + */ + private final static String GENERATED_DISCUSSIONS = "discussions.bin"; + + private final static String ARCHIVE_EXTENSION = ".gz"; + + private File dataDirectory = new File("."); + private boolean compressGeneratedFiles = false; + + private File inputPagelinks = null; + private File inputPagesarticles = null; + private File inputCategorylinks = null; + private File inputPagesMetaCurrent = null; + + public DataMachineFiles(ILogger logger) { + super(logger); + outputDirectory = setOutputDirectory(dataDirectory); + } + + public DataMachineFiles(DataMachineFiles files) { + super(files); + this.dataDirectory = files.dataDirectory; + this.compressGeneratedFiles = files.compressGeneratedFiles; + this.inputPagelinks = files.inputPagelinks; + this.inputPagesarticles = files.inputPagesarticles; + this.inputCategorylinks = files.inputCategorylinks; + this.inputPagesMetaCurrent = files.inputPagesMetaCurrent; + } + + private File setOutputDirectory(File parentDirectory) { + File result = new File(parentDirectory.getAbsolutePath() + + File.separator + OUTPUT_DIRECTORY); + + return result; + } + + public void setDataDirectory(String newDataDirectory) { + File inputDataDirectory = new File(newDataDirectory); + if (inputDataDirectory.isDirectory()) { + this.dataDirectory = inputDataDirectory; + this.outputDirectory = setOutputDirectory(dataDirectory); + } else { + logger.log(dataDirectory + + " is not a directory. Continue read from " + + this.dataDirectory.getAbsolutePath()); + } + + } + + public boolean checkDatamachineSourceFiles() { + File[] filesInDataDirectory = dataDirectory.listFiles(); + if (filesInDataDirectory.length > 2) { + for (File currentFile : filesInDataDirectory) { + + //TODO improve file check. 
Only accept files that come in a supported compression format + String currentFileName = currentFile.getName(); + if (currentFileName.contains(INPUT_PAGESARTICLES)) { + inputPagesarticles = currentFile; + } else if (currentFileName.contains(INPUT_PAGELINKS)) { + inputPagelinks = currentFile; + } else if (currentFileName.contains(INPUT_CATEGORYLINKS)) { + inputCategorylinks = currentFile; + } else if (currentFileName.contains(INPUT_PAGESMETACURRENT)) { + inputPagesMetaCurrent = currentFile; + } + } + } + // either inputPagesarticles or inputPagesMetaCurrent have to be placed + // in the input directory + return !((inputPagesarticles == null && inputPagesMetaCurrent == null) + || inputPagelinks == null || inputCategorylinks == null); + } + + public String getGeneratedPage() { + return getGeneratedPath(GENERATED_PAGE); + } + + public String getGeneratedRevision() { + return getGeneratedPath(GENERATED_REVISION); + } + + public String getGeneratedText() { + return getGeneratedPath(GENERATED_TEXT); + } + + public String getGeneratedDiscussions() { + return getGeneratedPath(GENERATED_DISCUSSIONS); + } + + public String getInputPageLinks() { + return (inputPagelinks != null) ? inputPagelinks.getAbsolutePath() + : null; + } + + public String getInputPagesArticles() { + return (inputPagesarticles != null) ? inputPagesarticles + .getAbsolutePath() : null; + } + + public String getInputCategoryLinks() { + return (inputCategorylinks != null) ? inputCategorylinks + .getAbsolutePath() : null; + } + + public String getInputPagesMetaCurrent() { + return (inputPagesMetaCurrent != null) ? 
inputPagesMetaCurrent + .getAbsolutePath() : null; + } + + + private String getGeneratedPath(String fileName) { + String path = dataDirectory.getAbsolutePath() + File.separator + + fileName; + if (compressGeneratedFiles) { + path = path.concat(ARCHIVE_EXTENSION); + } + return path; + } + + /** + * @see DataMachineFiles#setCompressGeneratedFiles(boolean) + */ + public boolean isCompressGeneratedFiles() { + return compressGeneratedFiles; + } + + /** + * Set the input parameter to {@code true} it you want to GZip the temporary + * files and save a disk space. <b>Attention:</b> {@code DataInputStream} + * can have problems reading from a compressed file. This can be a reason + * for strange side effects like heap overflow or some other exceptions. <br> + * For UKP-Developers: you can save much more disk space if you'll parse the + * page-articles XML Dump every time you need it: during processPage(), + * processRevision() and processText(). See TimeMachine solution especially + * the package org.dkpro.jwpl.timemachine.dump.xml + * + * @param compressGeneratedFiles + */ + public void setCompressGeneratedFiles(boolean compressGeneratedFiles) { + this.compressGeneratedFiles = compressGeneratedFiles; + } + + @Override + public boolean checkAll() { + return checkOutputDirectory() && checkDatamachineSourceFiles(); + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineGenerator.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineGenerator.java index 16d4e049..e6f1c4b4 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineGenerator.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineGenerator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -37,138 +37,129 @@ * Transforms a database from mediawiki format to JWPL format.<br> * The transformation produces .txt files for the different tables in the JWPL * database. - * - * */ public class DataMachineGenerator extends AbstractSnapshotGenerator { - DataMachineFiles files = null; - IDumpVersion version = null; - - public DataMachineGenerator(IEnvironmentFactory environmentFactory) { - super(environmentFactory); - } - - @Override - public void setFiles(Files files) { - this.files = (DataMachineFiles) files; - } - - @Override - public void start() throws Exception { - version = environmentFactory.getDumpVersion(); - MetaData metaData = MetaData.initWithConfig(configuration); - version.initialize(null); - version.setMetaData(metaData); - version.setFiles(files); - processInputDump(); - } - - private void processInputDump() throws IOException { - - logger.log("parse input dumps..."); - new XML2Binary(decompressor.getInputStream(getPagesArticlesFile()), - files); - - - dumpVersionProcessor.setDumpVersions(new IDumpVersion[] { version }); - - logger.log("processing table page..."); - dumpVersionProcessor.processPage(createPageParser()); - - logger.log("processing table categorylinks..."); - dumpVersionProcessor.processCategorylinks(createCategorylinksParser()); - - logger.log("processing table pagelinks..."); - 
dumpVersionProcessor.processPagelinks(createPagelinksParser()); - - logger.log("processing table revision..."); - dumpVersionProcessor.processRevision(createRevisionParser()); - - logger.log("processing table text..."); - dumpVersionProcessor.processText(createTextParser()); - - logger.log("writing metadata..."); - dumpVersionProcessor.writeMetaData(); - - logger.log("finished"); - } - - /** - * Parse either "pages-articles.xml" or "pages-meta-current.xml". If both - * files exist in the input directory "pages-meta-current.xml" will be - * favored. - * - * @return the input articles dump - */ - private String getPagesArticlesFile() { - String pagesArticlesFile = null; - String parseMessage = null; - - //Use of minimal dump only with articles - if (files.getInputPagesArticles() != null) { - pagesArticlesFile = files.getInputPagesArticles(); - parseMessage = "Discussions are unavailable"; - } - - //Use of dump with discussions - if (files.getInputPagesMetaCurrent() != null) { - pagesArticlesFile = files.getInputPagesMetaCurrent(); - parseMessage = "Discussions are available"; - } - - logger.log(parseMessage); - return pagesArticlesFile; - } - - private PageParser createPageParser() throws IOException { - String pageFile = files.getGeneratedPage(); - - DumpTableInputStream pageTableInputStream = environmentFactory - .getDumpTableInputStream(); - pageTableInputStream.initialize(decompressor.getInputStream(pageFile), - DumpTableEnum.PAGE); - - PageParser pageParser = environmentFactory.getPageParser(); - pageParser.setInputStream(pageTableInputStream); - return pageParser; - } - - private CategorylinksParser createCategorylinksParser() throws IOException { - String categorylinksFile = files.getInputCategoryLinks(); - return new CategorylinksParser(decompressor - .getInputStream(categorylinksFile)); - } - - private PagelinksParser createPagelinksParser() throws IOException { - String pagelinksFile = files.getInputPageLinks(); - return new 
PagelinksParser(decompressor.getInputStream(pagelinksFile)); - } - - private RevisionParser createRevisionParser() throws IOException { - String revisionFile = files.getGeneratedRevision(); - - DumpTableInputStream revisionTableInputStream = environmentFactory - .getDumpTableInputStream(); - revisionTableInputStream.initialize(decompressor - .getInputStream(revisionFile), DumpTableEnum.REVISION); - - RevisionParser revisionParser = environmentFactory.getRevisionParser(); - revisionParser.setInputStream(revisionTableInputStream); - return revisionParser; - } - - private TextParser createTextParser() throws IOException { - String textFile = files.getGeneratedText(); - - DumpTableInputStream textTableInputStream = environmentFactory - .getDumpTableInputStream(); - textTableInputStream.initialize(decompressor.getInputStream(textFile), - DumpTableEnum.TEXT); - - TextParser textParser = environmentFactory.getTextParser(); - textParser.setInputStream(textTableInputStream); - return textParser; - } + DataMachineFiles files = null; + IDumpVersion version = null; + + public DataMachineGenerator(IEnvironmentFactory environmentFactory) { + super(environmentFactory); + } + + @Override + public void setFiles(Files files) { + this.files = (DataMachineFiles) files; + } + + @Override + public void start() throws Exception { + version = environmentFactory.getDumpVersion(); + MetaData metaData = MetaData.initWithConfig(configuration); + version.initialize(null); + version.setMetaData(metaData); + version.setFiles(files); + processInputDump(); + } + + private void processInputDump() throws IOException { + + logger.log("parse input dumps..."); + new XML2Binary(decompressor.getInputStream(getPagesArticlesFile()), + files); + + + dumpVersionProcessor.setDumpVersions(new IDumpVersion[]{version}); + + logger.log("processing table page..."); + dumpVersionProcessor.processPage(createPageParser()); + + logger.log("processing table categorylinks..."); + 
dumpVersionProcessor.processCategorylinks(createCategorylinksParser()); + + logger.log("processing table pagelinks..."); + dumpVersionProcessor.processPagelinks(createPagelinksParser()); + + logger.log("processing table revision..."); + dumpVersionProcessor.processRevision(createRevisionParser()); + + logger.log("processing table text..."); + dumpVersionProcessor.processText(createTextParser()); + + logger.log("writing metadata..."); + dumpVersionProcessor.writeMetaData(); + + logger.log("finished"); + } + + /** + * Parse either "pages-articles.xml" or "pages-meta-current.xml". If both + * files exist in the input directory "pages-meta-current.xml" will be + * favored. + * + * @return the input articles dump + */ + private String getPagesArticlesFile() { + String pagesArticlesFile = null; + String parseMessage = null; + + //Use of minimal dump only with articles + if (files.getInputPagesArticles() != null) { + pagesArticlesFile = files.getInputPagesArticles(); + parseMessage = "Discussions are unavailable"; + } + + //Use of dump with discussions + if (files.getInputPagesMetaCurrent() != null) { + pagesArticlesFile = files.getInputPagesMetaCurrent(); + parseMessage = "Discussions are available"; + } + + logger.log(parseMessage); + return pagesArticlesFile; + } + + private PageParser createPageParser() throws IOException { + String pageFile = files.getGeneratedPage(); + + DumpTableInputStream pageTableInputStream = environmentFactory.getDumpTableInputStream(); + pageTableInputStream.initialize(decompressor.getInputStream(pageFile), DumpTableEnum.PAGE); + + PageParser pageParser = environmentFactory.getPageParser(); + pageParser.setInputStream(pageTableInputStream); + return pageParser; + } + + private CategorylinksParser createCategorylinksParser() throws IOException { + String categorylinksFile = files.getInputCategoryLinks(); + return new CategorylinksParser(decompressor.getInputStream(categorylinksFile)); + } + + private PagelinksParser createPagelinksParser() 
throws IOException { + String pagelinksFile = files.getInputPageLinks(); + return new PagelinksParser(decompressor.getInputStream(pagelinksFile)); + } + + private RevisionParser createRevisionParser() throws IOException { + String revisionFile = files.getGeneratedRevision(); + + DumpTableInputStream revisionTableInputStream = environmentFactory.getDumpTableInputStream(); + revisionTableInputStream.initialize(decompressor.getInputStream(revisionFile), DumpTableEnum.REVISION); + + RevisionParser revisionParser = environmentFactory.getRevisionParser(); + revisionParser.setInputStream(revisionTableInputStream); + return revisionParser; + } + + private TextParser createTextParser() throws IOException { + String textFile = files.getGeneratedText(); + + DumpTableInputStream textTableInputStream = environmentFactory.getDumpTableInputStream(); + textTableInputStream.initialize(decompressor.getInputStream(textFile), DumpTableEnum.TEXT); + + TextParser textParser = environmentFactory.getTextParser(); + textParser.setInputStream(textTableInputStream); + return textParser; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/JWPLDataMachine.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/JWPLDataMachine.java index 7779d4b4..c2ee8b82 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/JWPLDataMachine.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/JWPLDataMachine.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,65 +28,64 @@ */ public class JWPLDataMachine { - private static final int LANG_ARG = 0; - private static final int MAINCATEGORY_ARG = 1; - private static final int DISAMBIGUATION_ARG = 2; - private static final int DATADIR_ARG = 3; + private static final int LANG_ARG = 0; + private static final int MAINCATEGORY_ARG = 1; + private static final int DISAMBIGUATION_ARG = 2; + private static final int DATADIR_ARG = 3; - private static final String USAGE = "Please use\n" - + "\tjava -jar JWPLDataMachine.jar <LANGUAGE> <TOP_CATEGORY_NAME> <DISAMBIGUATION_CATEGORY_NAME> <SOURCE_DIRECTORY>\n\n" - + "The source directory must contain files\n" - + "\tpagelinks.sql\n" - + "\tpages-articles.xml\n" - + "\tcategorylinks.sql\n" - + "GZip or BZip2 compressed archives of above-named files are also allowed.\n" - + "Please set up a decompressor.xml for a usage of other external archive utilities (see documentation for more help).\n"; + private static final String USAGE = "Please use\n" + + "\tjava -jar JWPLDataMachine.jar <LANGUAGE> <TOP_CATEGORY_NAME> <DISAMBIGUATION_CATEGORY_NAME> <SOURCE_DIRECTORY>\n\n" + + "The source directory must contain files\n" + + "\tpagelinks.sql\n" + + "\tpages-articles.xml\n" + + "\tcategorylinks.sql\n" + + "GZip or BZip2 compressed archives of above-named files are also allowed.\n" + + "Please set up a decompressor.xml for a usage of other external archive utilities (see documentation for more help).\n"; - private static final long startTime = System.currentTimeMillis(); + private static final 
long startTime = System.currentTimeMillis(); - private static final IEnvironmentFactory environmentFactory = SpringFactory - .getInstance(); + private static final IEnvironmentFactory environmentFactory = SpringFactory.getInstance(); - private static final ILogger logger = environmentFactory.getLogger(); + private static final ILogger logger = environmentFactory.getLogger(); - public static void main(String[] args) { - if (args.length > 3) { - Configuration config = getConfigFromArgs(args); - DataMachineFiles files = new DataMachineFiles(logger); - files.setDataDirectory(args[DATADIR_ARG]); - if (files.checkAll()) { - try { + public static void main(String[] args) { + if (args.length > 3) { + Configuration config = getConfigFromArgs(args); + DataMachineFiles files = new DataMachineFiles(logger); + files.setDataDirectory(args[DATADIR_ARG]); + if (files.checkAll()) { + try { - ISnapshotGenerator generator = environmentFactory - .getSnapshotGenerator(); - generator.setConfiguration(config); - generator.setFiles(files); - generator.start(); + ISnapshotGenerator generator = environmentFactory + .getSnapshotGenerator(); + generator.setConfiguration(config); + generator.setFiles(files); + generator.start(); - logger.log("End of the application. Working time = " - + (System.currentTimeMillis() + logger.log("End of the application. 
Working time = " + + (System.currentTimeMillis() - startTime) + " ms"); - } catch (Exception e) { - logger.log(e); - } - } else { - logger.log("Not all necessary source files could be found in " - + args[DATADIR_ARG]); - } + } catch (Exception e) { + logger.log(e); + } + } else { + logger.log("Not all necessary source files could be found in " + + args[DATADIR_ARG]); + } - } else { - System.out.println(USAGE); - } + } else { + System.out.println(USAGE); + } - } + } - private static Configuration getConfigFromArgs(String[] args) { - Configuration config = new Configuration(logger); - config.setLanguage(args[LANG_ARG]); - config.setMainCategory(args[MAINCATEGORY_ARG]); - config.setDisambiguationCategory(args[DISAMBIGUATION_ARG]); + private static Configuration getConfigFromArgs(String[] args) { + Configuration config = new Configuration(logger); + config.setLanguage(args[LANG_ARG]); + config.setMainCategory(args[MAINCATEGORY_ARG]); + config.setDisambiguationCategory(args[DISAMBIGUATION_ARG]); - return config; - } + return config; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKGeneric.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKGeneric.java index d7e1d6cb..62470f55 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKGeneric.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKGeneric.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -36,242 +36,216 @@ import org.dkpro.jwpl.wikimachine.util.TxtFileWriter; public class SingleDumpVersionJDKGeneric<KeyType, HashAlgorithm extends IStringHashCode> - extends AbstractDumpVersion { - - private static final String SQL_NULL = "NULL"; - //TODO This constant is used to flag page titles of discussion pages. - // Is also defined in wikipedia.api:WikiConstants.DISCUSSION_PREFIX - // It just doesn't make sense to add a dependency just for the constant - private static final String DISCUSSION_PREFIX = "Discussion:"; - - private Map<Integer, String> pPageIdNameMap; - private Set<Integer> cPageIdNameMap; - private Map<KeyType, Integer> pNamePageIdMap; - private Map<KeyType, Integer> cNamePageIdMap; - private Map<Integer, String> rPageIdNameMap; - private Set<Integer> disambiguations; - private Map<Integer, Integer> textIdPageIdMap; - - IStringHashCode hashAlgorithm; - - @SuppressWarnings("unchecked") - public SingleDumpVersionJDKGeneric(Class<HashAlgorithm> hashAlgorithmClass) - throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException { - - hashAlgorithm = hashAlgorithmClass.getDeclaredConstructor().newInstance(); - @SuppressWarnings("unused") - KeyType hashAlgorithmResult = (KeyType) hashAlgorithm.hashCode("test"); - } - - @Override - public void freeAfterCategoryLinksParsing() { - cPageIdNameMap.clear(); - cNamePageIdMap.clear(); - } - - @Override - public void freeAfterPageLinksParsing() { - // nothing to free - - } - - @Override - public void 
freeAfterPageParsing() { - metaData.setNrOfCategories(cPageIdNameMap.size()); - metaData.setNrOfPages(pPageIdNameMap.keySet().size() - + rPageIdNameMap.keySet().size()); - System.out.println("nrOfCategories: " + metaData.getNrOfCategories()); - System.out.println("nrOfPage: " + metaData.getNrOfPages()); - System.out - .println("nrOfRedirects before testing the validity of the destination:" - + rPageIdNameMap.size()); - - } - - @Override - public void freeAfterRevisonParsing() { - // nothing to free - - } - - @Override - public void freeAfterTextParsing() { - pPageIdNameMap.clear(); - cPageIdNameMap.clear(); - pNamePageIdMap.clear(); - cNamePageIdMap.clear(); - rPageIdNameMap.clear(); - disambiguations.clear(); - textIdPageIdMap.clear(); - } - - @Override - public void initialize(Timestamp timestamp) { - pPageIdNameMap = new HashMap<>(1_000_000); - cPageIdNameMap = new HashSet<>(1_000_000); - pNamePageIdMap = new HashMap<>(1_000_000); - cNamePageIdMap = new HashMap<>(1_000_000); - rPageIdNameMap = new HashMap<>(1_000_000); - disambiguations = new HashSet<>(1_000_000); - textIdPageIdMap = new HashMap<>(1_000_000); - } - - @SuppressWarnings("unchecked") - @Override - public void processCategoryLinksRow(CategorylinksParser clParser) - throws IOException { - String cl_to = clParser.getClTo(); - - if (cl_to != null) { - KeyType clToHash = (KeyType) hashAlgorithm.hashCode(cl_to); - - Integer cl_toValue = cNamePageIdMap.get(clToHash); - - if (cl_toValue != null) { - int cl_from = clParser.getClFrom(); - - if (pPageIdNameMap.containsKey(cl_from)) { - categoryPages.addRow(cl_toValue, cl_from); - pageCategories.addRow(cl_from, cl_toValue); - - if (cl_to.equals(metaData.getDisambiguationCategory())) { - disambiguations.add(cl_from); - metaData.addDisamb(); - } - } else if (cPageIdNameMap.contains(cl_from)) { - categoryOutlinks.addRow(cl_toValue, cl_from); - categoryInlinks.addRow(cl_from, cl_toValue); - } - - } - } - else { - throw new IOException("Parsin error." 
+ CategorylinksParser.class.getName() + - " returned null value in " + this.getClass().getName()); - } - } - - @SuppressWarnings("unchecked") - @Override - public void processPageLinksRow(PagelinksParser plParser) - throws IOException { - int pl_from = plParser.getPlFrom(); - String pl_to = plParser.getPlTo(); - if (pl_to != null) { - KeyType plToHash = (KeyType) hashAlgorithm.hashCode(pl_to); - Integer pl_toValue = pNamePageIdMap.get(plToHash); - // skip redirects if skipPage is enabled - if ((!skipPage || pPageIdNameMap.containsKey(pl_from)) - && pl_toValue != null) { - pageOutlinks.addRow(pl_from, pl_toValue); - pageInlinks.addRow(pl_toValue, pl_from); - } - } - } - - @SuppressWarnings("unchecked") - @Override - public void processPageRow(PageParser pageParser) throws IOException { - int page_namespace = pageParser.getPageNamespace(); - int page_id = pageParser.getPageId(); - String page_title = pageParser.getPageTitle(); - if (page_title != null) { - switch (page_namespace) { - case NS_CATEGORY: { - // skip redirect categories if skipCategory is enabled - if (!(skipCategory && pageParser.getPageIsRedirect())) { - cPageIdNameMap.add(page_id); - cNamePageIdMap.put( - (KeyType) hashAlgorithm.hashCode(page_title), - page_id); - txtFW.addRow(page_id, page_id, page_title); - } - break; - } - - case NS_TALK: { - page_title = DISCUSSION_PREFIX + page_title; - //the NS_MAIN block will also be executed - //for NS_TALK pages ... 
- } - - case NS_MAIN: { - if (pageParser.getPageIsRedirect()) { - rPageIdNameMap.put(page_id, page_title); - } - else { - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put( - (KeyType) hashAlgorithm.hashCode(page_title), - page_id); - } - break; - } - } - } - - } - - @Override - public void processRevisionRow(RevisionParser revisionParser) { - textIdPageIdMap.put(revisionParser.getRevTextId(), revisionParser - .getRevPage()); - - } - - @SuppressWarnings("unchecked") - @Override - public void processTextRow(TextParser textParser) throws IOException { - int text_id = textParser.getOldId(); - if (textIdPageIdMap.containsKey(text_id)) { - - int page_id = textIdPageIdMap.get(text_id); - String page_idValueP = pPageIdNameMap.get(page_id); - if (page_idValueP != null) {// pages - page.addRow(page_id, page_id, page_idValueP, textParser - .getOldText(), formatBoolean(disambiguations - .contains(page_id))); - pageMapLine.addRow(page_id, page_idValueP, page_id, SQL_NULL, - SQL_NULL); - - } else { - String page_idValueR = rPageIdNameMap.get(page_id); - if (page_idValueR != null) {// Redirects - String destination = Redirects - .getRedirectDestination(textParser.getOldText()); - if (destination != null) { - KeyType destinationHash = (KeyType) hashAlgorithm - .hashCode(destination); - Integer destinationValue = pNamePageIdMap - .get(destinationHash); - if (destinationValue != null) { - - pageRedirects.addRow(destinationValue, - page_idValueR); - pageMapLine.addRow(page_id, page_idValueR, - destinationValue, SQL_NULL, SQL_NULL); - metaData.addRedirect(); - } - } - } - } - } - - } - - @Override - public void writeMetaData() throws IOException { - TxtFileWriter outputFile = new TxtFileWriter(versionFiles - .getOutputMetadata()); - // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories - outputFile - .addRow(metaData.getId(), metaData.getLanguage(), metaData - .getDisambiguationCategory(), metaData - 
.getMainCategory(), metaData.getNrOfPages(), metaData - .getNrOfRedirects(), metaData.getNrOfDisambiguations(), - metaData.getNrOfCategories()); - outputFile.flush(); - outputFile.close(); - } + extends AbstractDumpVersion { + + private static final String SQL_NULL = "NULL"; + //TODO This constant is used to flag page titles of discussion pages. + // Is also defined in wikipedia.api:WikiConstants.DISCUSSION_PREFIX + // It just doesn't make sense to add a dependency just for the constant + private static final String DISCUSSION_PREFIX = "Discussion:"; + + private Map<Integer, String> pPageIdNameMap; + private Set<Integer> cPageIdNameMap; + private Map<KeyType, Integer> pNamePageIdMap; + private Map<KeyType, Integer> cNamePageIdMap; + private Map<Integer, String> rPageIdNameMap; + private Set<Integer> disambiguations; + private Map<Integer, Integer> textIdPageIdMap; + + IStringHashCode hashAlgorithm; + + @SuppressWarnings("unchecked") + public SingleDumpVersionJDKGeneric(Class<HashAlgorithm> hashAlgorithmClass) + throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException { + + hashAlgorithm = hashAlgorithmClass.getDeclaredConstructor().newInstance(); + @SuppressWarnings("unused") + KeyType hashAlgorithmResult = (KeyType) hashAlgorithm.hashCode("test"); + } + + @Override + public void freeAfterCategoryLinksParsing() { + cPageIdNameMap.clear(); + cNamePageIdMap.clear(); + } + + @Override + public void freeAfterPageLinksParsing() { + // nothing to free + + } + + @Override + public void freeAfterPageParsing() { + metaData.setNrOfCategories(cPageIdNameMap.size()); + metaData.setNrOfPages(pPageIdNameMap.keySet().size() + rPageIdNameMap.keySet().size()); + System.out.println("nrOfCategories: " + metaData.getNrOfCategories()); + System.out.println("nrOfPage: " + metaData.getNrOfPages()); + System.out.println("nrOfRedirects before testing the validity of the destination:" + rPageIdNameMap.size()); + } + + @Override + public void 
freeAfterRevisonParsing() { + // nothing to free + } + + @Override + public void freeAfterTextParsing() { + pPageIdNameMap.clear(); + cPageIdNameMap.clear(); + pNamePageIdMap.clear(); + cNamePageIdMap.clear(); + rPageIdNameMap.clear(); + disambiguations.clear(); + textIdPageIdMap.clear(); + } + + @Override + public void initialize(Timestamp timestamp) { + pPageIdNameMap = new HashMap<>(1_000_000); + cPageIdNameMap = new HashSet<>(1_000_000); + pNamePageIdMap = new HashMap<>(1_000_000); + cNamePageIdMap = new HashMap<>(1_000_000); + rPageIdNameMap = new HashMap<>(1_000_000); + disambiguations = new HashSet<>(1_000_000); + textIdPageIdMap = new HashMap<>(1_000_000); + } + + @SuppressWarnings("unchecked") + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) + throws IOException { + String cl_to = clParser.getClTo(); + + if (cl_to != null) { + KeyType clToHash = (KeyType) hashAlgorithm.hashCode(cl_to); + + Integer cl_toValue = cNamePageIdMap.get(clToHash); + + if (cl_toValue != null) { + int cl_from = clParser.getClFrom(); + + if (pPageIdNameMap.containsKey(cl_from)) { + categoryPages.addRow(cl_toValue, cl_from); + pageCategories.addRow(cl_from, cl_toValue); + + if (cl_to.equals(metaData.getDisambiguationCategory())) { + disambiguations.add(cl_from); + metaData.addDisamb(); + } + } else if (cPageIdNameMap.contains(cl_from)) { + categoryOutlinks.addRow(cl_toValue, cl_from); + categoryInlinks.addRow(cl_from, cl_toValue); + } + + } + } else { + throw new IOException("Parsin error." 
+ CategorylinksParser.class.getName() + + " returned null value in " + this.getClass().getName()); + } + } + + @SuppressWarnings("unchecked") + @Override + public void processPageLinksRow(PagelinksParser plParser) + throws IOException { + int pl_from = plParser.getPlFrom(); + String pl_to = plParser.getPlTo(); + if (pl_to != null) { + KeyType plToHash = (KeyType) hashAlgorithm.hashCode(pl_to); + Integer pl_toValue = pNamePageIdMap.get(plToHash); + // skip redirects if skipPage is enabled + if ((!skipPage || pPageIdNameMap.containsKey(pl_from)) && pl_toValue != null) { + pageOutlinks.addRow(pl_from, pl_toValue); + pageInlinks.addRow(pl_toValue, pl_from); + } + } + } + + @SuppressWarnings("unchecked") + @Override + public void processPageRow(PageParser pageParser) throws IOException { + int page_namespace = pageParser.getPageNamespace(); + int page_id = pageParser.getPageId(); + String page_title = pageParser.getPageTitle(); + if (page_title != null) { + switch (page_namespace) { + case NS_CATEGORY: { + // skip redirect categories if skipCategory is enabled + if (!(skipCategory && pageParser.getPageIsRedirect())) { + cPageIdNameMap.add(page_id); + cNamePageIdMap.put((KeyType) hashAlgorithm.hashCode(page_title), page_id); + txtFW.addRow(page_id, page_id, page_title); + } + break; + } + + case NS_TALK: { + page_title = DISCUSSION_PREFIX + page_title; + //the NS_MAIN block will also be executed + //for NS_TALK pages ... 
+ } + + case NS_MAIN: { + if (pageParser.getPageIsRedirect()) { + rPageIdNameMap.put(page_id, page_title); + } else { + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put((KeyType) hashAlgorithm.hashCode(page_title), page_id); + } + break; + } + } + } + } + + @Override + public void processRevisionRow(RevisionParser revisionParser) { + textIdPageIdMap.put(revisionParser.getRevTextId(), revisionParser.getRevPage()); + } + + @SuppressWarnings("unchecked") + @Override + public void processTextRow(TextParser textParser) throws IOException { + int text_id = textParser.getOldId(); + if (textIdPageIdMap.containsKey(text_id)) { + + int page_id = textIdPageIdMap.get(text_id); + String page_idValueP = pPageIdNameMap.get(page_id); + if (page_idValueP != null) {// pages + page.addRow(page_id, page_id, page_idValueP, textParser.getOldText(), formatBoolean(disambiguations + .contains(page_id))); + pageMapLine.addRow(page_id, page_idValueP, page_id, SQL_NULL, SQL_NULL); + + } else { + String page_idValueR = rPageIdNameMap.get(page_id); + if (page_idValueR != null) {// Redirects + String destination = Redirects.getRedirectDestination(textParser.getOldText()); + if (destination != null) { + KeyType destinationHash = (KeyType) hashAlgorithm.hashCode(destination); + Integer destinationValue = pNamePageIdMap.get(destinationHash); + if (destinationValue != null) { + + pageRedirects.addRow(destinationValue, page_idValueR); + pageMapLine.addRow(page_id, page_idValueR, destinationValue, SQL_NULL, SQL_NULL); + metaData.addRedirect(); + } + } + } + } + } + + } + + @Override + public void writeMetaData() throws IOException { + TxtFileWriter outputFile = new TxtFileWriter(versionFiles.getOutputMetadata()); + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories + outputFile.addRow(metaData.getId(), metaData.getLanguage(), metaData.getDisambiguationCategory(), metaData + .getMainCategory(), metaData.getNrOfPages(), 
metaData + .getNrOfRedirects(), metaData.getNrOfDisambiguations(), metaData.getNrOfCategories()); + outputFile.flush(); + outputFile.close(); + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKIntKeyFactory.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKIntKeyFactory.java index 7aa9519a..7a88504b 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKIntKeyFactory.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKIntKeyFactory.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -23,15 +23,14 @@ public class SingleDumpVersionJDKIntKeyFactory implements IDumpVersionFactory { - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new SingleDumpVersionJDKGeneric<Integer, StringHashCodeJDK>( - StringHashCodeJDK.class); - } catch (Exception e) { - dumpVersion = null; - } - return dumpVersion; - } + @Override + public IDumpVersion getDumpVersion() { + IDumpVersion dumpVersion; + try { + dumpVersion = new SingleDumpVersionJDKGeneric<Integer, StringHashCodeJDK>(StringHashCodeJDK.class); + } catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKLongKeyFactory.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKLongKeyFactory.java index 025b9803..542aeba5 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKLongKeyFactory.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKLongKeyFactory.java @@ -27,8 +27,7 @@ public class SingleDumpVersionJDKLongKeyFactory implements IDumpVersionFactory { public IDumpVersion getDumpVersion() { IDumpVersion dumpVersion; try { - dumpVersion = new SingleDumpVersionJDKGeneric<Long, StringHashCodeJBoss>( - StringHashCodeJBoss.class); + dumpVersion = new SingleDumpVersionJDKGeneric<Long, StringHashCodeJBoss>(StringHashCodeJBoss.class); } catch (Exception e) { dumpVersion = null; } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKStringKeyFactory.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKStringKeyFactory.java index e226c7fa..a54efb0b 100644 --- 
a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKStringKeyFactory.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKStringKeyFactory.java @@ -28,8 +28,7 @@ public class SingleDumpVersionJDKStringKeyFactory implements public IDumpVersion getDumpVersion() { IDumpVersion dumpVersion; try { - dumpVersion = new SingleDumpVersionJDKGeneric<String, StringHashCodeDisabled>( - StringHashCodeDisabled.class); + dumpVersion = new SingleDumpVersionJDKGeneric<String, StringHashCodeDisabled>(StringHashCodeDisabled.class); } catch (Exception e) { dumpVersion = null; } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java index 89e349fb..27c29383 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -43,373 +43,353 @@ * The transformation produces .txt files for the different tables<br> * in the JWPL database.<br> * <br> - * + * <p> * Adopted to IDumpVersion by Galkin - * - * - * */ public class SingleDumpVersionOriginal implements IDumpVersion { - // metadata - private String language; - private String mainCategory; - private String disambiguationsCategory; - - // statistics - private int nrOfDisambiguations = 0; - private int nrOfPages = 0; - private int nrOfCategories = 0; - private int nrOfRedirects = 0; - - private Map<Integer, String> pPageIdNameMap;// maps page id's of pages to - // their names - private Map<Integer, String> cPageIdNameMap;// maps page id's of categories - // to their names - private Map<String, Integer> pNamePageIdMap;// maps names of pages to their - // page id's. - private Map<String, Integer> cNamePageIdMap;// maps names of categories to - // their page id's. - private Map<Integer, String> rPageIdNameMap;// maps page id's of redirects - // to their names. - private IntSet disambiguations; // caches the page id's of - // disambiguation pages. - private Int2IntOpenHashMap textIdPageIdMap;// maps text id's to the page - - // id's. - - // galkin: moved from local variables to fields - private TxtFileWriter txtFW; - private TxtFileWriter pageCategories; - private TxtFileWriter categoryPages; - private TxtFileWriter categoryInlinks; - private TxtFileWriter categoryOutlinks; - private TxtFileWriter pageInlinks; - private TxtFileWriter pageOutlinks; - private TxtFileWriter page; - private TxtFileWriter pageMapLine; - private TxtFileWriter pageRedirects; - private String outputDir; - - // galkin: added - - private ILogger logger; - private boolean skipPage = true; - private boolean skipCategory = true; - - /** - * Returns the String value of the bit 1 if the given boolean is true<br> - * and an empty String otherwise. This the way bit values are written<br> - * in .txt dump files. 
- * - * @param b - * @return - */ - private String formatBoolean(boolean b) { - return b ? new String(new byte[] { 1 }) : ""; - } - - @Override - public void exportAfterCategoryLinksParsing() throws IOException { - pageCategories.export(); - categoryPages.export(); - categoryInlinks.export(); - categoryOutlinks.export(); - } - - @Override - public void exportAfterPageLinksParsing() throws IOException { - pageInlinks.export(); - pageOutlinks.export(); - } - - @Override - public void exportAfterPageParsing() throws IOException { - txtFW.export(); - - nrOfCategories = cPageIdNameMap.keySet().size(); - nrOfPages = pPageIdNameMap.keySet().size() - + rPageIdNameMap.keySet().size(); - } - - @Override - public void exportAfterRevisionParsing() throws IOException { - } - - @Override - public void exportAfterTextParsing() throws IOException { - page.export(); - pageRedirects.export(); - pageMapLine.export(); - } - - @Override - public void flushByTextParsing() throws IOException { - page.flush(); - pageRedirects.flush(); - pageMapLine.flush(); - } - - @Override - public void freeAfterCategoryLinksParsing() { - - } - - @Override - public void freeAfterPageLinksParsing() { - - } - - @Override - public void freeAfterPageParsing() { - - } - - @Override - public void freeAfterRevisonParsing() { - } - - @Override - public void freeAfterTextParsing() { - page.export(); - pageRedirects.export(); - pageMapLine.export(); - } - - @Override - public void initCategoryLinksParsing() throws IOException { - pageCategories = new TxtFileWriter(outputDir + File.separator - + "page_categories.txt"); - categoryPages = new TxtFileWriter(outputDir + File.separator - + "category_pages.txt"); - categoryInlinks = new TxtFileWriter(outputDir + File.separator - + "category_inlinks.txt"); - categoryOutlinks = new TxtFileWriter(outputDir + File.separator - + "category_outlinks.txt"); - - } - - @Override - public void initPageLinksParsing() throws IOException { - - pageInlinks = new 
TxtFileWriter(outputDir + File.separator - + "page_inlinks.txt"); - pageOutlinks = new TxtFileWriter(outputDir + File.separator - + "page_outlinks.txt"); - - } - - @Override - public void initPageParsing() throws IOException { - txtFW = new TxtFileWriter(outputDir + File.separator + "Category.txt"); - - } - - @Override - public void initRevisionParsion() { - - } - - @Override - public void initTextParsing() throws IOException { - page = new TxtFileWriter(outputDir + File.separator + "Page.txt"); - pageMapLine = new TxtFileWriter(outputDir + File.separator - + "PageMapLine.txt"); - pageRedirects = new TxtFileWriter(outputDir + File.separator - + "page_redirects.txt"); - - } - - @Override - public void initialize(Timestamp timestamp) { - this.pPageIdNameMap = new HashMap<>(); - this.cPageIdNameMap = new HashMap<>(); - this.pNamePageIdMap = new HashMap<>(); - this.cNamePageIdMap = new HashMap<>(); - this.rPageIdNameMap = new HashMap<>(); - this.disambiguations = new IntArraySet(); - this.textIdPageIdMap = new Int2IntOpenHashMap(); - - } - - @Override - public void processCategoryLinksRow(CategorylinksParser clParser) - throws IOException { - - int cl_from; - String cl_to; - - cl_from = clParser.getClFrom(); - cl_to = clParser.getClTo(); - if (!cNamePageIdMap.containsKey(cl_to)) { - // discard links with non-registered targets - return; - } - // if the link source is a page then write the link in - // category_pages and - // page_categories - if (pPageIdNameMap.containsKey(cl_from)) { - categoryPages.addRow(cNamePageIdMap.get(cl_to), cl_from); - pageCategories.addRow(cl_from, cNamePageIdMap.get(cl_to)); - if (cl_to.equals(disambiguationsCategory)) { - disambiguations.add(cl_from); - nrOfDisambiguations++; - } - } else { - // if the link source is a category than write the link in - // category_inlinks and category_outlinks - if (cPageIdNameMap.containsKey(cl_from)) { - categoryOutlinks.addRow(cNamePageIdMap.get(cl_to), cl_from); - categoryInlinks.addRow(cl_from, 
cNamePageIdMap.get(cl_to)); - } - } - - } - - @Override - public void processPageLinksRow(PagelinksParser plParser) - throws IOException { - - int pl_from; - String pl_to; - - pl_from = plParser.getPlFrom(); - pl_to = plParser.getPlTo(); - // skip redirects or page with other namespace than 0 - - if (skipPage && !pPageIdNameMap.containsKey(pl_from) - || !pNamePageIdMap.containsKey(pl_to)) { - return; - } - - pageOutlinks.addRow(pl_from, pNamePageIdMap.get(pl_to)); - pageInlinks.addRow(pNamePageIdMap.get(pl_to), pl_from); - } - - @Override - public void processPageRow(PageParser pageParser) throws IOException { - - int page_id; - int page_namespace; - String page_title; - - page_namespace = pageParser.getPageNamespace(); - // handle categories - if (page_namespace == 14) { - if (skipCategory) { - if (pageParser.getPageIsRedirect()) - // skip categories that are redirects - return; - } - // retrieve page id and page title - page_id = pageParser.getPageId(); - page_title = pageParser.getPageTitle(); - if (page_title.equals(disambiguationsCategory)) { - logger.log("Disambiguations Category found: " + page_title); - } - if (page_title.equals(mainCategory)) { - logger.log("Main Category found: " + page_title); - } - // cache the retrieved values - cPageIdNameMap.put(page_id, page_title); - cNamePageIdMap.put(page_title, page_id); - // write a new row in the table Category. 
- // Note that we also consider the page_id as id - txtFW.addRow(page_id, page_id, page_title); - return; - } - // handle pages - if (page_namespace == 0) { - // retrieve page id and title - page_id = pageParser.getPageId(); - page_title = pageParser.getPageTitle(); - // distinguish redirects - if (pageParser.getPageIsRedirect()) { - rPageIdNameMap.put(page_id, page_title); - } else { - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put(page_title, page_id); - } - } - - } - - @Override - public void processRevisionRow(RevisionParser revisionParser) { - textIdPageIdMap.put(revisionParser.getRevTextId(), revisionParser - .getRevPage()); - } - - @Override - public void processTextRow(TextParser textParser) throws IOException { - - String destination; - int text_id; - int page_id; - - text_id = textParser.getOldId(); - if (!textIdPageIdMap.containsKey(text_id)) - return; - page_id = textIdPageIdMap.get(text_id); - if (pPageIdNameMap.containsKey(page_id)) {// pages - page.addRow(page_id, page_id, pPageIdNameMap.get(page_id), - textParser.getOldText(), formatBoolean(disambiguations - .contains(page_id))); - pageMapLine.addRow(page_id, pPageIdNameMap.get(page_id), page_id, - "NULL", "NULL"); - return; - } - if (rPageIdNameMap.containsKey(page_id)) {// Redirects - destination = Redirects.getRedirectDestination(textParser - .getOldText()); - if (!pNamePageIdMap.containsKey(destination)) - return; - pageRedirects.addRow(pNamePageIdMap.get(destination), - rPageIdNameMap.get(page_id)); - pageMapLine.addRow(page_id, rPageIdNameMap.get(page_id), - pNamePageIdMap.get(destination), "NULL", "NULL"); - nrOfRedirects++; - } - - } - - @Override - public void setFiles(Files versionFiles) { - // galkin: only output directory will be used, other file names will be - // taken from original source code - outputDir = versionFiles.getOutputDirectory().getAbsolutePath(); - } - - @Override - public void setLogger(ILogger logger) { - this.logger = logger; - } - - @Override - public 
void setMetaData(MetaData commonMetaData) { - this.language = commonMetaData.getLanguage(); - this.mainCategory = commonMetaData.getMainCategory(); - this.disambiguationsCategory = commonMetaData - .getDisambiguationCategory(); - } - - @Override - public void writeMetaData() throws IOException { - TxtFileWriter metaData = new TxtFileWriter(outputDir + File.separator + "MetaData.txt"); - // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories - metaData.addRow("null", language, disambiguationsCategory, - mainCategory, nrOfPages, nrOfRedirects, nrOfDisambiguations, nrOfCategories); - metaData.export(); - } - - @Override - public void setCategoryRedirectsSkip(boolean skipCategory) { - this.skipCategory = skipCategory; - } - - @Override - public void setPageRedirectsSkip(boolean skipPage) { - this.skipPage = skipPage; - } + // metadata + private String language; + private String mainCategory; + private String disambiguationsCategory; + + // statistics + private int nrOfDisambiguations = 0; + private int nrOfPages = 0; + private int nrOfCategories = 0; + private int nrOfRedirects = 0; + + private Map<Integer, String> pPageIdNameMap;// maps page id's of pages to + // their names + private Map<Integer, String> cPageIdNameMap;// maps page id's of categories + // to their names + private Map<String, Integer> pNamePageIdMap;// maps names of pages to their + // page id's. + private Map<String, Integer> cNamePageIdMap;// maps names of categories to + // their page id's. + private Map<Integer, String> rPageIdNameMap;// maps page id's of redirects + // to their names. + private IntSet disambiguations; // caches the page id's of + // disambiguation pages. + private Int2IntOpenHashMap textIdPageIdMap;// maps text id's to the page + + // id's. 
+ + // galkin: moved from local variables to fields + private TxtFileWriter txtFW; + private TxtFileWriter pageCategories; + private TxtFileWriter categoryPages; + private TxtFileWriter categoryInlinks; + private TxtFileWriter categoryOutlinks; + private TxtFileWriter pageInlinks; + private TxtFileWriter pageOutlinks; + private TxtFileWriter page; + private TxtFileWriter pageMapLine; + private TxtFileWriter pageRedirects; + private String outputDir; + + // galkin: added + + private ILogger logger; + private boolean skipPage = true; + private boolean skipCategory = true; + + /** + * Returns the String value of the bit 1 if the given boolean is true<br> + * and an empty String otherwise. This the way bit values are written<br> + * in .txt dump files. + * + * @param b + * @return + */ + private String formatBoolean(boolean b) { + return b ? new String(new byte[]{1}) : ""; + } + + @Override + public void exportAfterCategoryLinksParsing() throws IOException { + pageCategories.export(); + categoryPages.export(); + categoryInlinks.export(); + categoryOutlinks.export(); + } + + @Override + public void exportAfterPageLinksParsing() throws IOException { + pageInlinks.export(); + pageOutlinks.export(); + } + + @Override + public void exportAfterPageParsing() throws IOException { + txtFW.export(); + + nrOfCategories = cPageIdNameMap.keySet().size(); + nrOfPages = pPageIdNameMap.keySet().size() + + rPageIdNameMap.keySet().size(); + } + + @Override + public void exportAfterRevisionParsing() throws IOException { + } + + @Override + public void exportAfterTextParsing() throws IOException { + page.export(); + pageRedirects.export(); + pageMapLine.export(); + } + + @Override + public void flushByTextParsing() throws IOException { + page.flush(); + pageRedirects.flush(); + pageMapLine.flush(); + } + + @Override + public void freeAfterCategoryLinksParsing() { + + } + + @Override + public void freeAfterPageLinksParsing() { + + } + + @Override + public void freeAfterPageParsing() { + + 
} + + @Override + public void freeAfterRevisonParsing() { + } + + @Override + public void freeAfterTextParsing() { + page.export(); + pageRedirects.export(); + pageMapLine.export(); + } + + @Override + public void initCategoryLinksParsing() throws IOException { + pageCategories = new TxtFileWriter(outputDir + File.separator + "page_categories.txt"); + categoryPages = new TxtFileWriter(outputDir + File.separator + "category_pages.txt"); + categoryInlinks = new TxtFileWriter(outputDir + File.separator + "category_inlinks.txt"); + categoryOutlinks = new TxtFileWriter(outputDir + File.separator + "category_outlinks.txt"); + } + + @Override + public void initPageLinksParsing() throws IOException { + pageInlinks = new TxtFileWriter(outputDir + File.separator + "page_inlinks.txt"); + pageOutlinks = new TxtFileWriter(outputDir + File.separator + "page_outlinks.txt"); + } + + @Override + public void initPageParsing() throws IOException { + txtFW = new TxtFileWriter(outputDir + File.separator + "Category.txt"); + + } + + @Override + public void initRevisionParsion() { + + } + + @Override + public void initTextParsing() throws IOException { + page = new TxtFileWriter(outputDir + File.separator + "Page.txt"); + pageMapLine = new TxtFileWriter(outputDir + File.separator + "PageMapLine.txt"); + pageRedirects = new TxtFileWriter(outputDir + File.separator + "page_redirects.txt"); + + } + + @Override + public void initialize(Timestamp timestamp) { + this.pPageIdNameMap = new HashMap<>(); + this.cPageIdNameMap = new HashMap<>(); + this.pNamePageIdMap = new HashMap<>(); + this.cNamePageIdMap = new HashMap<>(); + this.rPageIdNameMap = new HashMap<>(); + this.disambiguations = new IntArraySet(); + this.textIdPageIdMap = new Int2IntOpenHashMap(); + + } + + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException { + + int cl_from; + String cl_to; + + cl_from = clParser.getClFrom(); + cl_to = clParser.getClTo(); + if 
(!cNamePageIdMap.containsKey(cl_to)) { + // discard links with non-registered targets + return; + } + // if the link source is a page then write the link in + // category_pages and + // page_categories + if (pPageIdNameMap.containsKey(cl_from)) { + categoryPages.addRow(cNamePageIdMap.get(cl_to), cl_from); + pageCategories.addRow(cl_from, cNamePageIdMap.get(cl_to)); + if (cl_to.equals(disambiguationsCategory)) { + disambiguations.add(cl_from); + nrOfDisambiguations++; + } + } else { + // if the link source is a category than write the link in + // category_inlinks and category_outlinks + if (cPageIdNameMap.containsKey(cl_from)) { + categoryOutlinks.addRow(cNamePageIdMap.get(cl_to), cl_from); + categoryInlinks.addRow(cl_from, cNamePageIdMap.get(cl_to)); + } + } + + } + + @Override + public void processPageLinksRow(PagelinksParser plParser) throws IOException { + + int pl_from; + String pl_to; + + pl_from = plParser.getPlFrom(); + pl_to = plParser.getPlTo(); + // skip redirects or page with other namespace than 0 + + if (skipPage && !pPageIdNameMap.containsKey(pl_from) || !pNamePageIdMap.containsKey(pl_to)) { + return; + } + + pageOutlinks.addRow(pl_from, pNamePageIdMap.get(pl_to)); + pageInlinks.addRow(pNamePageIdMap.get(pl_to), pl_from); + } + + @Override + public void processPageRow(PageParser pageParser) throws IOException { + + int page_id; + int page_namespace; + String page_title; + + page_namespace = pageParser.getPageNamespace(); + // handle categories + if (page_namespace == 14) { + if (skipCategory) { + if (pageParser.getPageIsRedirect()) + // skip categories that are redirects + return; + } + // retrieve page id and page title + page_id = pageParser.getPageId(); + page_title = pageParser.getPageTitle(); + if (page_title.equals(disambiguationsCategory)) { + logger.log("Disambiguations Category found: " + page_title); + } + if (page_title.equals(mainCategory)) { + logger.log("Main Category found: " + page_title); + } + // cache the retrieved values + 
cPageIdNameMap.put(page_id, page_title); + cNamePageIdMap.put(page_title, page_id); + // write a new row in the table Category. + // Note that we also consider the page_id as id + txtFW.addRow(page_id, page_id, page_title); + return; + } + // handle pages + if (page_namespace == 0) { + // retrieve page id and title + page_id = pageParser.getPageId(); + page_title = pageParser.getPageTitle(); + // distinguish redirects + if (pageParser.getPageIsRedirect()) { + rPageIdNameMap.put(page_id, page_title); + } else { + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put(page_title, page_id); + } + } + + } + + @Override + public void processRevisionRow(RevisionParser revisionParser) { + textIdPageIdMap.put(revisionParser.getRevTextId(), revisionParser + .getRevPage()); + } + + @Override + public void processTextRow(TextParser textParser) throws IOException { + + String destination; + int text_id; + int page_id; + + text_id = textParser.getOldId(); + if (!textIdPageIdMap.containsKey(text_id)) + return; + page_id = textIdPageIdMap.get(text_id); + if (pPageIdNameMap.containsKey(page_id)) {// pages + page.addRow(page_id, page_id, pPageIdNameMap.get(page_id), + textParser.getOldText(), formatBoolean(disambiguations.contains(page_id))); + pageMapLine.addRow(page_id, pPageIdNameMap.get(page_id), page_id, "NULL", "NULL"); + return; + } + if (rPageIdNameMap.containsKey(page_id)) {// Redirects + destination = Redirects.getRedirectDestination(textParser.getOldText()); + if (!pNamePageIdMap.containsKey(destination)) + return; + pageRedirects.addRow(pNamePageIdMap.get(destination), rPageIdNameMap.get(page_id)); + pageMapLine.addRow(page_id, rPageIdNameMap.get(page_id), + pNamePageIdMap.get(destination), "NULL", "NULL"); + nrOfRedirects++; + } + + } + + @Override + public void setFiles(Files versionFiles) { + // galkin: only output directory will be used, other file names will be + // taken from original source code + outputDir = 
versionFiles.getOutputDirectory().getAbsolutePath(); + } + + @Override + public void setLogger(ILogger logger) { + this.logger = logger; + } + + @Override + public void setMetaData(MetaData commonMetaData) { + this.language = commonMetaData.getLanguage(); + this.mainCategory = commonMetaData.getMainCategory(); + this.disambiguationsCategory = commonMetaData + .getDisambiguationCategory(); + } + + @Override + public void writeMetaData() throws IOException { + try(TxtFileWriter metaData = new TxtFileWriter(outputDir + File.separator + "MetaData.txt")) { + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories + metaData.addRow("null", language, disambiguationsCategory, + mainCategory, nrOfPages, nrOfRedirects, nrOfDisambiguations, nrOfCategories); + metaData.export(); + } + } + + @Override + public void setCategoryRedirectsSkip(boolean skipCategory) { + this.skipCategory = skipCategory; + } + + @Override + public void setPageRedirectsSkip(boolean skipPage) { + this.skipPage = skipPage; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/BinaryDumpTableInputStream.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/BinaryDumpTableInputStream.java index d006f110..9d356e80 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/BinaryDumpTableInputStream.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/BinaryDumpTableInputStream.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,18 +25,17 @@ public class BinaryDumpTableInputStream extends DumpTableInputStream { - protected InputStream inputStream = null; + protected InputStream inputStream = null; - @Override - public void initialize(InputStream inputStream, DumpTableEnum table) - throws IOException { - // just read from the stream without any data manipulations - this.inputStream = inputStream; - } + @Override + public void initialize(InputStream inputStream, DumpTableEnum table) throws IOException { + // just read from the stream without any data manipulations + this.inputStream = inputStream; + } - @Override - public int read() throws IOException { - return inputStream.read(); - } + @Override + public int read() throws IOException { + return inputStream.read(); + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/DataMachineRevisionParser.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/DataMachineRevisionParser.java index 790132d4..8132b13e 100755 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/DataMachineRevisionParser.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/DataMachineRevisionParser.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. 
The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,14 +24,15 @@ public class DataMachineRevisionParser extends RevisionParser { - public boolean next() throws IOException { - boolean hasNext = true; - try { - revPage = stream.readInt(); - revTextId = stream.readInt(); - } catch (EOFException e) { - hasNext = false; - } - return hasNext; - } + @Override + public boolean next() throws IOException { + boolean hasNext = true; + try { + revPage = stream.readInt(); + revTextId = stream.readInt(); + } catch (EOFException e) { + hasNext = false; + } + return hasNext; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleBinaryDumpWriter.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleBinaryDumpWriter.java index 6526c604..50453b86 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleBinaryDumpWriter.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleBinaryDumpWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -37,111 +37,111 @@ public class SimpleBinaryDumpWriter implements DumpWriter { - private UTFDataOutputStream pageFile; - private UTFDataOutputStream revisionFile; - private UTFDataOutputStream textFile; - private final DataMachineFiles files; - - private Page currentPage; - private Revision lastRevision; - - public SimpleBinaryDumpWriter(DataMachineFiles files) throws IOException { - this.files = files; - if (this.files.isCompressGeneratedFiles()) { - createCompressed(); - } else { - createUncompressed(); - } - } - - protected void createUncompressed() throws IOException { - pageFile = openUTFDataOutputStream(files.getGeneratedPage(), false); - revisionFile = openUTFDataOutputStream(files.getGeneratedRevision(), false); - textFile = openUTFDataOutputStream(files.getGeneratedText(), false); - } - - protected void createCompressed() throws IOException { - pageFile = openUTFDataOutputStream(files.getGeneratedPage(), true); - revisionFile = openUTFDataOutputStream(files.getGeneratedRevision(), true); - textFile = openUTFDataOutputStream(files.getGeneratedText(), true); - } - - private UTFDataOutputStream openUTFDataOutputStream(final String filePath, final boolean compressed) throws IOException { - UTFDataOutputStream utfDataOutputStream; - if(compressed) { - utfDataOutputStream = new UTFDataOutputStream(new GZIPOutputStream(openFileStreamAndRegisterDeletion(filePath))); - } else { - utfDataOutputStream = new UTFDataOutputStream(openFileStreamAndRegisterDeletion(filePath)); - } - return utfDataOutputStream; - } 
- - private BufferedOutputStream openFileStreamAndRegisterDeletion(final String filePath) throws IOException { - Path binaryOutputFilePath = Paths.get(filePath); - // JavaDoc says: - // "truncate and overwrite an existing file, or create the file if it doesn't initially exist" - OutputStream fileOutputStream = Files.newOutputStream(binaryOutputFilePath); - - // Register a delete hook on JVM shutdown for this path - DeleteFilesAtShutdown.register(binaryOutputFilePath); - - // Create a buffered version for this - return new BufferedOutputStream(fileOutputStream); - } - - @Override - public void close() throws IOException { - pageFile.close(); - revisionFile.close(); - textFile.close(); - } - - @Override - public void writeEndPage() throws IOException { - if (lastRevision != null) { - updatePage(currentPage, lastRevision); - } - currentPage = null; - lastRevision = null; - } - - @Override - public void writeEndWiki() throws IOException { - pageFile.flush(); - revisionFile.flush(); - textFile.flush(); - } - - @Override - public void writeRevision(Revision revision) throws IOException { - lastRevision = revision; - - revisionFile.writeInt(currentPage.Id); - revisionFile.writeInt(revision.Id); - - textFile.writeInt(revision.Id); - textFile.writeUTFAsArray(SQLEscape.escape(revision.Text)); - } - - @Override - public void writeSiteinfo(Siteinfo info) throws IOException { - } - - @Override - public void writeStartPage(Page page) throws IOException { - currentPage = page; - lastRevision = null; - } - - @Override - public void writeStartWiki() throws IOException { - } - - private void updatePage(Page page, Revision revision) throws IOException { - pageFile.writeInt(page.Id); - pageFile.writeInt(page.Title.Namespace); - pageFile.writeUTFAsArray(SQLEscape.escape(SQLEscape.titleFormat(page.Title.Text))); - // pageFile.writeBoolean(revision.isRedirect()); - pageFile.writeBoolean(Redirects.isRedirect(revision.Text)); - } + private UTFDataOutputStream pageFile; + private 
UTFDataOutputStream revisionFile; + private UTFDataOutputStream textFile; + private final DataMachineFiles files; + + private Page currentPage; + private Revision lastRevision; + + public SimpleBinaryDumpWriter(DataMachineFiles files) throws IOException { + this.files = files; + if (this.files.isCompressGeneratedFiles()) { + createCompressed(); + } else { + createUncompressed(); + } + } + + protected void createUncompressed() throws IOException { + pageFile = openUTFDataOutputStream(files.getGeneratedPage(), false); + revisionFile = openUTFDataOutputStream(files.getGeneratedRevision(), false); + textFile = openUTFDataOutputStream(files.getGeneratedText(), false); + } + + protected void createCompressed() throws IOException { + pageFile = openUTFDataOutputStream(files.getGeneratedPage(), true); + revisionFile = openUTFDataOutputStream(files.getGeneratedRevision(), true); + textFile = openUTFDataOutputStream(files.getGeneratedText(), true); + } + + private UTFDataOutputStream openUTFDataOutputStream(final String filePath, final boolean compressed) throws IOException { + UTFDataOutputStream utfDataOutputStream; + if (compressed) { + utfDataOutputStream = new UTFDataOutputStream(new GZIPOutputStream(openFileStreamAndRegisterDeletion(filePath))); + } else { + utfDataOutputStream = new UTFDataOutputStream(openFileStreamAndRegisterDeletion(filePath)); + } + return utfDataOutputStream; + } + + private BufferedOutputStream openFileStreamAndRegisterDeletion(final String filePath) throws IOException { + Path binaryOutputFilePath = Paths.get(filePath); + // JavaDoc says: + // "truncate and overwrite an existing file, or create the file if it doesn't initially exist" + OutputStream fileOutputStream = Files.newOutputStream(binaryOutputFilePath); + + // Register a delete hook on JVM shutdown for this path + DeleteFilesAtShutdown.register(binaryOutputFilePath); + + // Create a buffered version for this + return new BufferedOutputStream(fileOutputStream); + } + + @Override + public 
void close() throws IOException { + pageFile.close(); + revisionFile.close(); + textFile.close(); + } + + @Override + public void writeEndPage() throws IOException { + if (lastRevision != null) { + updatePage(currentPage, lastRevision); + } + currentPage = null; + lastRevision = null; + } + + @Override + public void writeEndWiki() throws IOException { + pageFile.flush(); + revisionFile.flush(); + textFile.flush(); + } + + @Override + public void writeRevision(Revision revision) throws IOException { + lastRevision = revision; + + revisionFile.writeInt(currentPage.Id); + revisionFile.writeInt(revision.Id); + + textFile.writeInt(revision.Id); + textFile.writeUTFAsArray(SQLEscape.escape(revision.Text)); + } + + @Override + public void writeSiteinfo(Siteinfo info) throws IOException { + } + + @Override + public void writeStartPage(Page page) throws IOException { + currentPage = page; + lastRevision = null; + } + + @Override + public void writeStartWiki() throws IOException { + } + + private void updatePage(Page page, Revision revision) throws IOException { + pageFile.writeInt(page.Id); + pageFile.writeInt(page.Title.Namespace); + pageFile.writeUTFAsArray(SQLEscape.escape(SQLEscape.titleFormat(page.Title.Text))); + // pageFile.writeBoolean(revision.isRedirect()); + pageFile.writeBoolean(Redirects.isRedirect(revision.Text)); + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleXmlDumpReader.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleXmlDumpReader.java index 97dd9232..15eba9a7 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleXmlDumpReader.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleXmlDumpReader.java @@ -32,37 +32,35 @@ /** * This class is a specified variant of XmlDumpReader. 
Please see its source for more * information about a functionality and a license.<br> - * - * */ public class SimpleXmlDumpReader extends AbstractXmlDumpReader { - public SimpleXmlDumpReader(InputStream inputStream, DumpWriter writer) { - super(inputStream, writer); + public SimpleXmlDumpReader(InputStream inputStream, DumpWriter writer) { + super(inputStream, writer); - } + } - @Override - protected void setupStartElements() { - startElements.put(REVISION, REVISION); - startElements.put(CONTRIBUTOR, CONTRIBUTOR); - startElements.put(PAGE, PAGE); - startElements.put(SITEINFO, SITEINFO); - startElements.put(NAMESPACES, NAMESPACES); - startElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupStartElements() { + startElements.put(REVISION, REVISION); + startElements.put(CONTRIBUTOR, CONTRIBUTOR); + startElements.put(PAGE, PAGE); + startElements.put(SITEINFO, SITEINFO); + startElements.put(NAMESPACES, NAMESPACES); + startElements.put(NAMESPACE, NAMESPACE); + } - @Override - protected void setupEndElements() { - endElements.put(REVISION, REVISION); - endElements.put(TEXT, TEXT); - endElements.put(CONTRIBUTOR, CONTRIBUTOR); - endElements.put(ID, ID); - endElements.put(PAGE, PAGE); - endElements.put(TITLE, TITLE); - endElements.put(SITEINFO, SITEINFO); - endElements.put(NAMESPACES, NAMESPACES); - endElements.put(NAMESPACE, NAMESPACE); + @Override + protected void setupEndElements() { + endElements.put(REVISION, REVISION); + endElements.put(TEXT, TEXT); + endElements.put(CONTRIBUTOR, CONTRIBUTOR); + endElements.put(ID, ID); + endElements.put(PAGE, PAGE); + endElements.put(TITLE, TITLE); + endElements.put(SITEINFO, SITEINFO); + endElements.put(NAMESPACES, NAMESPACES); + endElements.put(NAMESPACE, NAMESPACE); - } + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/XML2Binary.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/XML2Binary.java index 6b8cc476..f683d314 100644 --- 
a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/XML2Binary.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/XML2Binary.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,34 +25,28 @@ import org.dkpro.jwpl.mwdumper.importer.XmlDumpReader; /** - * * Use org.mediawiki.importer engine to parse the XML-Dump (only useful fields) * and store it to binary file. Compression of the output files is possible. 
- * - * - * */ public class XML2Binary { - /** - * Enable the main and category pages as well as discussions - */ - private static final String ENABLED_NAMESPACES = "NS_MAIN,NS_TALK,NS_CATEGORY"; + + /* + * Enable the main and category pages as well as discussions + */ + private static final String ENABLED_NAMESPACES = "NS_MAIN,NS_TALK,NS_CATEGORY"; - private static final boolean USE_MODIFED_PARSER = true; + private static final boolean USE_MODIFIED_PARSER = true; - public XML2Binary(InputStream iStream, DataMachineFiles files) - throws IOException { - if (USE_MODIFED_PARSER) { - // modified parser, skips faulty tags - new SimpleXmlDumpReader(iStream, new NamespaceFilter( - new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)) - .readDump(); - } else { - // original MWDumper parser, very sensible to not closed tags - new XmlDumpReader(iStream, new NamespaceFilter( - new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)) - .readDump(); - } - } + public XML2Binary(InputStream iStream, DataMachineFiles files) throws IOException { + if (USE_MODIFIED_PARSER) { + // modified parser, skips faulty tags + new SimpleXmlDumpReader(iStream, new NamespaceFilter( + new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)).readDump(); + } else { + // original MWDumper parser, very sensible to not closed tags + new XmlDumpReader(iStream, new NamespaceFilter( + new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)).readDump(); + } + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/file/DeleteFilesAtShutdown.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/file/DeleteFilesAtShutdown.java index 276160cc..043c3ba9 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/file/DeleteFilesAtShutdown.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/file/DeleteFilesAtShutdown.java @@ -34,39 +34,40 @@ * <a href="https://stackoverflow.com/a/42389029">https://stackoverflow.com/a/42389029</a> 
*/ public final class DeleteFilesAtShutdown { - private static Set<Path> paths = new LinkedHashSet<>(); + private static Set<Path> paths = new LinkedHashSet<>(); - static { - // registers the call of 'shutdownHook' at JVM shutdown - Runtime.getRuntime().addShutdownHook(new Thread(DeleteFilesAtShutdown::cleanupRegisteredFiles)); - } + static { + // registers the call of 'shutdownHook' at JVM shutdown + Runtime.getRuntime().addShutdownHook(new Thread(DeleteFilesAtShutdown::cleanupRegisteredFiles)); + } - private static void cleanupRegisteredFiles() { - Set<Path> local; - synchronized(DeleteFilesAtShutdown.class){ - local = paths; - paths = null; - } + private static void cleanupRegisteredFiles() { + Set<Path> local; + synchronized (DeleteFilesAtShutdown.class) { + local = paths; + paths = null; + } - List<Path> toBeDeleted = new ArrayList<>(local); - Collections.reverse(toBeDeleted); - for (Path p : toBeDeleted) { - try { - Files.delete(p); - } catch (IOException | RuntimeException e) { - // do nothing - best-effort - } - } + List<Path> toBeDeleted = new ArrayList<>(local); + Collections.reverse(toBeDeleted); + for (Path p : toBeDeleted) { + try { + Files.delete(p); + } catch (IOException | RuntimeException e) { + // do nothing - best-effort + } } + } - /** - * Registers a {@link Path} to be removed at JVM shutdown. - * @param filePath A valid path pointing to a file. - */ - public static synchronized void register(Path filePath) { - if (paths == null) { - throw new IllegalStateException("Shutdown hook is already in progress. Adding paths is not allowed now!"); - } - paths.add(filePath); + /** + * Registers a {@link Path} to be removed at JVM shutdown. + * + * @param filePath A valid path pointing to a file. + */ + public static synchronized void register(Path filePath) { + if (paths == null) { + throw new IllegalStateException("Shutdown hook is already in progress. 
Adding paths is not allowed now!"); } + paths.add(filePath); + } } \ No newline at end of file diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Dumper.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Dumper.java index b74786b7..3fd7b418 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Dumper.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Dumper.java @@ -89,217 +89,215 @@ import org.dkpro.jwpl.mwdumper.importer.XmlDumpWriter; class Dumper { - public static void main(String[] args) throws IOException, ParseException { - InputStream input = null; - OutputWrapper output = null; - DumpWriter sink = null; - MultiWriter writers = new MultiWriter(); - int progressInterval = 1000; - - for (int i = 0; i < args.length; i++) { - String arg = args[i]; - String[] bits = splitArg(arg); - if (bits != null) { - String opt = bits[0], val = bits[1], param = bits[2]; - if (opt.equals("output")) { - if (output != null) { - // Finish constructing the previous output... 
- if (sink == null) - sink = new XmlDumpWriter(output.getFileStream()); - writers.add(sink); - sink = null; - } - output = openOutputFile(val, param); - } else if (opt.equals("format")) { - if (output == null) - output = new OutputWrapper(Tools.openStandardOutput()); - if (sink != null) - throw new IllegalArgumentException("Only one format per output allowed."); - sink = openOutputSink(output, val, param); - } else if (opt.equals("filter")) { - if (sink == null) { - if (output == null) - output = new OutputWrapper(Tools.openStandardOutput()); - sink = new XmlDumpWriter(output.getFileStream()); - } - sink = addFilter(sink, val, param); - } else if (opt.equals("progress")) { - progressInterval = Integer.parseInt(val); - } else if (opt.equals("quiet")) { - progressInterval = 0; - } else { - throw new IllegalArgumentException("Unrecognized option " + opt); - } - } else if (arg.equals("-")) { - if (input != null) - throw new IllegalArgumentException("Input already set; can't set to stdin"); - input = Tools.openStandardInput(); - } else { - if (input != null) - throw new IllegalArgumentException("Input already set; can't set to " + arg); - input = Tools.openInputFile(arg); - } - } - - if (input == null) - input = Tools.openStandardInput(); - if (output == null) - output = new OutputWrapper(Tools.openStandardOutput()); - // Finish stacking the last output sink - if (sink == null) - sink = new XmlDumpWriter(output.getFileStream()); - writers.add(sink); - - DumpWriter outputSink = (progressInterval > 0) - ? 
(DumpWriter)new ProgressFilter(writers, progressInterval) - : (DumpWriter)writers; - - XmlDumpReader reader = new XmlDumpReader(input, outputSink); - reader.readDump(); - } + public static void main(String[] args) throws IOException, ParseException { + InputStream input = null; + OutputWrapper output = null; + DumpWriter sink = null; + MultiWriter writers = new MultiWriter(); + int progressInterval = 1000; - /** - * @param arg string in format "--option=value:parameter" - * @return array of option, value, and parameter, or null if no match - */ - static String[] splitArg(String arg) { - if (!arg.startsWith("--")) - return null; - - String opt; - String val = ""; - String param = ""; - - String[] bits = arg.substring(2).split("=", 2); - opt = bits[0]; - - if (bits.length > 1) { - String[] bits2 = bits[1].split(":", 2); - val = bits2[0]; - if (bits2.length > 1) - param = bits2[1]; - } - - return new String[] {opt, val, param}; - } - - // ---------------- - - static class OutputWrapper { - private OutputStream fileStream = null; - private Connection sqlConnection = null; - - OutputWrapper(OutputStream aFileStream) { - fileStream = aFileStream; - } - - OutputWrapper(Connection anSqlConnection) { - sqlConnection= anSqlConnection; - } - - OutputStream getFileStream() { - if (fileStream != null) - return fileStream; - if (sqlConnection != null) - throw new IllegalArgumentException("Expected file stream, got SQL connection?"); - throw new IllegalArgumentException("Have neither file nor SQL connection. Very confused!"); - } - - SqlStream getSqlStream() throws IOException { - if (fileStream != null) - return new SqlFileStream(fileStream); - if (sqlConnection != null) - return new SqlServerStream(sqlConnection); - throw new IllegalArgumentException("Have neither file nor SQL connection. 
Very confused!"); - } - } - - static OutputWrapper openOutputFile(String dest, String param) throws IOException { - if (dest.equals("stdout")) - return new OutputWrapper(Tools.openStandardOutput()); - else if (dest.equals("file")) - return new OutputWrapper(Tools.createOutputFile(param)); - else if (dest.equals("gzip")) - return new OutputWrapper(new GZIPOutputStream(Tools.createOutputFile(param))); - else if (dest.equals("bzip2")) - return new OutputWrapper(Tools.createBZip2File(param)); - else if (dest.equals("mysql")) - return connectMySql(param); - else if (dest.equals("postgresql")) - return connectPostgres(param); - else - throw new IllegalArgumentException("Destination sink not implemented: " + dest); - } + for (int i = 0; i < args.length; i++) { + String arg = args[i]; + String[] bits = splitArg(arg); + if (bits != null) { + String opt = bits[0], val = bits[1], param = bits[2]; + if (opt.equals("output")) { + if (output != null) { + // Finish constructing the previous output... + if (sink == null) + sink = new XmlDumpWriter(output.getFileStream()); + writers.add(sink); + sink = null; + } + output = openOutputFile(val, param); + } else if (opt.equals("format")) { + if (output == null) + output = new OutputWrapper(Tools.openStandardOutput()); + if (sink != null) + throw new IllegalArgumentException("Only one format per output allowed."); + sink = openOutputSink(output, val, param); + } else if (opt.equals("filter")) { + if (sink == null) { + if (output == null) + output = new OutputWrapper(Tools.openStandardOutput()); + sink = new XmlDumpWriter(output.getFileStream()); + } + sink = addFilter(sink, val, param); + } else if (opt.equals("progress")) { + progressInterval = Integer.parseInt(val); + } else if (opt.equals("quiet")) { + progressInterval = 0; + } else { + throw new IllegalArgumentException("Unrecognized option " + opt); + } + } else if (arg.equals("-")) { + if (input != null) + throw new IllegalArgumentException("Input already set; can't set to 
stdin"); + input = Tools.openStandardInput(); + } else { + if (input != null) + throw new IllegalArgumentException("Input already set; can't set to " + arg); + input = Tools.openInputFile(arg); + } + } - private static OutputWrapper connectMySql(String param) throws IOException { - try { - Class.forName("com.mysql.jdbc.Driver"); - Connection conn = DriverManager.getConnection("jdbc:mysql:" + param); - return new OutputWrapper(conn); - } catch (Exception e) { - throw (IOException) new IOException(e.getMessage()).initCause(e); - } - } - - private static OutputWrapper connectPostgres(String param) throws IOException { - try { - Class.forName("org.postgresql.Driver"); - Connection conn = DriverManager.getConnection("jdbc:postgresql:" + param); - return new OutputWrapper(conn); - } catch (Exception e) { - throw new IOException(e.toString()); - } - } + if (input == null) + input = Tools.openStandardInput(); + if (output == null) + output = new OutputWrapper(Tools.openStandardOutput()); + // Finish stacking the last output sink + if (sink == null) + sink = new XmlDumpWriter(output.getFileStream()); + writers.add(sink); - static DumpWriter openOutputSink(OutputWrapper output, String format, String param) throws IOException { - if (format.equals("xml")) - return new XmlDumpWriter(output.getFileStream()); - else if (format.equals("sphinx")) - return new SphinxWriter(output.getFileStream()); - else if (format.equals("mysql") || format.equals("pgsql") || format.equals("sql")) { - SqlStream sqlStream = output.getSqlStream(); - SqlWriter ret; + DumpWriter outputSink = (progressInterval > 0) ? 
new ProgressFilter(writers, progressInterval) : writers; - SqlWriter.Traits tr; - if (format.equals("pgsql")) - tr = new SqlWriter.PostgresTraits(); - else - tr = new SqlWriter.MySQLTraits(); + XmlDumpReader reader = new XmlDumpReader(input, outputSink); + reader.readDump(); + } - if (param.equals("1.4")) - ret = new SqlWriter14(tr, sqlStream); - else if (param.equals("1.5")) - ret = new SqlWriter15(tr, sqlStream); - else - throw new IllegalArgumentException("SQL version not known: " + param); + /** + * @param arg string in format "--option=value:parameter" + * @return array of option, value, and parameter, or null if no match + */ + static String[] splitArg(String arg) { + if (!arg.startsWith("--")) + return null; - return ret; - } else - throw new IllegalArgumentException("Output format not known: " + format); - } - - // ---------------- - - static DumpWriter addFilter(DumpWriter sink, String filter, String param) throws IOException, ParseException { - if (filter.equals("latest")) - return new LatestFilter(sink); - else if (filter.equals("namespace")) - return new NamespaceFilter(sink, param); - else if (filter.equals("notalk")) - return new NotalkFilter(sink); - else if (filter.equals("titlematch")) - return new TitleMatchFilter(sink, param); - else if (filter.equals("list")) - return new ListFilter(sink, param); - else if (filter.equals("exactlist")) - return new ExactListFilter(sink, param); - else if (filter.equals("revlist")) - return new RevisionListFilter(sink, param); - else if (filter.equals("before")) - return new BeforeTimeStampFilter(sink, param); - else if (filter.equals("after")) - return new AfterTimeStampFilter(sink, param); - else - throw new IllegalArgumentException("Filter unknown: " + filter); - } + String opt; + String val = ""; + String param = ""; + + String[] bits = arg.substring(2).split("=", 2); + opt = bits[0]; + + if (bits.length > 1) { + String[] bits2 = bits[1].split(":", 2); + val = bits2[0]; + if (bits2.length > 1) + param = 
bits2[1]; + } + + return new String[]{opt, val, param}; + } + + // ---------------- + + static class OutputWrapper { + private OutputStream fileStream = null; + private Connection sqlConnection = null; + + OutputWrapper(OutputStream aFileStream) { + fileStream = aFileStream; + } + + OutputWrapper(Connection anSqlConnection) { + sqlConnection = anSqlConnection; + } + + OutputStream getFileStream() { + if (fileStream != null) + return fileStream; + if (sqlConnection != null) + throw new IllegalArgumentException("Expected file stream, got SQL connection?"); + throw new IllegalArgumentException("Have neither file nor SQL connection. Very confused!"); + } + + SqlStream getSqlStream() throws IOException { + if (fileStream != null) + return new SqlFileStream(fileStream); + if (sqlConnection != null) + return new SqlServerStream(sqlConnection); + throw new IllegalArgumentException("Have neither file nor SQL connection. Very confused!"); + } + } + + static OutputWrapper openOutputFile(String dest, String param) throws IOException { + if (dest.equals("stdout")) + return new OutputWrapper(Tools.openStandardOutput()); + else if (dest.equals("file")) + return new OutputWrapper(Tools.createOutputFile(param)); + else if (dest.equals("gzip")) + return new OutputWrapper(new GZIPOutputStream(Tools.createOutputFile(param))); + else if (dest.equals("bzip2")) + return new OutputWrapper(Tools.createBZip2File(param)); + else if (dest.equals("mysql")) + return connectMySql(param); + else if (dest.equals("postgresql")) + return connectPostgres(param); + else + throw new IllegalArgumentException("Destination sink not implemented: " + dest); + } + + private static OutputWrapper connectMySql(String param) throws IOException { + try { + Class.forName("com.mysql.jdbc.Driver"); + Connection conn = DriverManager.getConnection("jdbc:mysql:" + param); + return new OutputWrapper(conn); + } catch (Exception e) { + throw (IOException) new IOException(e.getMessage()).initCause(e); + } + } + + private 
static OutputWrapper connectPostgres(String param) throws IOException { + try { + Class.forName("org.postgresql.Driver"); + Connection conn = DriverManager.getConnection("jdbc:postgresql:" + param); + return new OutputWrapper(conn); + } catch (Exception e) { + throw new IOException(e.toString()); + } + } + + static DumpWriter openOutputSink(OutputWrapper output, String format, String param) throws IOException { + if (format.equals("xml")) + return new XmlDumpWriter(output.getFileStream()); + else if (format.equals("sphinx")) + return new SphinxWriter(output.getFileStream()); + else if (format.equals("mysql") || format.equals("pgsql") || format.equals("sql")) { + SqlStream sqlStream = output.getSqlStream(); + SqlWriter ret; + + SqlWriter.Traits tr; + if (format.equals("pgsql")) + tr = new SqlWriter.PostgresTraits(); + else + tr = new SqlWriter.MySQLTraits(); + + if (param.equals("1.4")) + ret = new SqlWriter14(tr, sqlStream); + else if (param.equals("1.5")) + ret = new SqlWriter15(tr, sqlStream); + else + throw new IllegalArgumentException("SQL version not known: " + param); + + return ret; + } else + throw new IllegalArgumentException("Output format not known: " + format); + } + + // ---------------- + + static DumpWriter addFilter(DumpWriter sink, String filter, String param) throws IOException, ParseException { + if (filter.equals("latest")) + return new LatestFilter(sink); + else if (filter.equals("namespace")) + return new NamespaceFilter(sink, param); + else if (filter.equals("notalk")) + return new NotalkFilter(sink); + else if (filter.equals("titlematch")) + return new TitleMatchFilter(sink, param); + else if (filter.equals("list")) + return new ListFilter(sink, param); + else if (filter.equals("exactlist")) + return new ExactListFilter(sink, param); + else if (filter.equals("revlist")) + return new RevisionListFilter(sink, param); + else if (filter.equals("before")) + return new BeforeTimeStampFilter(sink, param); + else if (filter.equals("after")) + return 
new AfterTimeStampFilter(sink, param); + else + throw new IllegalArgumentException("Filter unknown: " + filter); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/ProgressFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/ProgressFilter.java index 4f2a045f..04ac00bb 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/ProgressFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/ProgressFilter.java @@ -34,60 +34,61 @@ import org.dkpro.jwpl.mwdumper.importer.Revision; public class ProgressFilter extends PageFilter { - int pages = 0; - int revisions = 0; - final int interval; - final MessageFormat format = new MessageFormat("{0} pages ({1}/sec), {2} revs ({3}/sec)"); - final long start = System.currentTimeMillis(); - - public ProgressFilter(DumpWriter sink, int interval) { - super(sink); - this.interval = interval; - if (interval <= 0) - throw new IllegalArgumentException("Reporting interval must be positive."); - } - - public void writeStartPage(Page page) throws IOException { - super.writeStartPage(page); - pages++; - } - - public void writeRevision(Revision rev) throws IOException { - super.writeRevision(rev); - revisions++; - reportProgress(); - } - - /** - * If we didn't just show a progress report on the last revision, - * show the final results. 
- * @throws IOException - */ - public void writeEndWiki() throws IOException { - super.writeEndWiki(); - if (revisions % interval != 0) - showProgress(); - } + int pages = 0; + int revisions = 0; + final int interval; + final MessageFormat format = new MessageFormat("{0} pages ({1}/sec), {2} revs ({3}/sec)"); + final long start = System.currentTimeMillis(); - private void reportProgress() { - if (revisions % interval == 0) - showProgress(); - } - - private void showProgress() { - long delta = System.currentTimeMillis() - start; - sendOutput(format.format(new Object[] { + public ProgressFilter(DumpWriter sink, int interval) { + super(sink); + this.interval = interval; + if (interval <= 0) + throw new IllegalArgumentException("Reporting interval must be positive."); + } + + public void writeStartPage(Page page) throws IOException { + super.writeStartPage(page); + pages++; + } + + public void writeRevision(Revision rev) throws IOException { + super.writeRevision(rev); + revisions++; + reportProgress(); + } + + /** + * If we didn't just show a progress report on the last revision, + * show the final results. + * + * @throws IOException + */ + public void writeEndWiki() throws IOException { + super.writeEndWiki(); + if (revisions % interval != 0) + showProgress(); + } + + private void reportProgress() { + if (revisions % interval == 0) + showProgress(); + } + + private void showProgress() { + long delta = System.currentTimeMillis() - start; + sendOutput(format.format(new Object[]{ pages, rate(delta, pages), revisions, rate(delta, revisions)})); - } - - protected void sendOutput(String text) { - System.err.println(text); - } + } + + protected void sendOutput(String text) { + System.err.println(text); + } - private static Object rate(long delta, int count) { - return (delta > 0.001) - ? (Double) (1000.0 * (double) count / (double) delta) - : (Object)"-"; - } + private static Object rate(long delta, int count) { + return (delta > 0.001) + ? 
(Double) (1000.0 * (double) count / (double) delta) + : (Object) "-"; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java index c768c1fe..ace8ea6f 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -31,57 +31,55 @@ import java.util.zip.GZIPInputStream; public class Tools { - static final int IN_BUF_SZ = 1024 * 1024; - private static final int OUT_BUF_SZ = 1024 * 1024; + static final int IN_BUF_SZ = 1024 * 1024; + private static final int OUT_BUF_SZ = 1024 * 1024; - public static InputStream openInputFile(String arg) throws IOException { - if (arg.equals("-")) { - return openStandardInput(); - } - InputStream infile = new BufferedInputStream(new FileInputStream(arg), IN_BUF_SZ); - if (arg.endsWith(".gz")) { - return new GZIPInputStream(infile); - } - else if (arg.endsWith(".bz2")) { - return openBZip2Stream(infile); - } - else { - return infile; - } - } + public static InputStream openInputFile(String arg) throws IOException { + if (arg.equals("-")) { + return openStandardInput(); + } + InputStream infile = new BufferedInputStream(new FileInputStream(arg), IN_BUF_SZ); + if (arg.endsWith(".gz")) { + return new GZIPInputStream(infile); + } else if (arg.endsWith(".bz2")) { + return openBZip2Stream(infile); + } else { + return infile; + } + } - static InputStream openStandardInput() throws IOException { - return new BufferedInputStream(System.in, IN_BUF_SZ); - } + static InputStream openStandardInput() throws IOException { + return new BufferedInputStream(System.in, IN_BUF_SZ); + } - static InputStream openBZip2Stream(InputStream infile) throws IOException { - int first = infile.read(); - int second = infile.read(); - if (first != 'B' || second != 'Z') { - throw new IOException("Didn't find BZ file signature in .bz2 file"); - } - return new BZip2CompressorInputStream(infile); - } + static InputStream openBZip2Stream(InputStream infile) throws IOException { + int first = infile.read(); + int second = infile.read(); + if (first != 'B' || second != 'Z') { + throw new IOException("Didn't find BZ file signature in .bz2 file"); + } + return new BZip2CompressorInputStream(infile); + } - static OutputStream openStandardOutput() { - return new 
BufferedOutputStream(System.out, OUT_BUF_SZ); - } + static OutputStream openStandardOutput() { + return new BufferedOutputStream(System.out, OUT_BUF_SZ); + } - static OutputStream createBZip2File(String param) throws IOException { - OutputStream outfile = createOutputFile(param); - // bzip2 expects a two-byte 'BZ' signature header - outfile.write('B'); - outfile.write('Z'); - return new BZip2CompressorOutputStream(outfile); - } + static OutputStream createBZip2File(String param) throws IOException { + OutputStream outfile = createOutputFile(param); + // bzip2 expects a two-byte 'BZ' signature header + outfile.write('B'); + outfile.write('Z'); + return new BZip2CompressorOutputStream(outfile); + } - static OutputStream createOutputFile(String param) throws IOException { - File file = new File(param); - file.createNewFile(); - return new BufferedOutputStream(new FileOutputStream(file), OUT_BUF_SZ); - } + static OutputStream createOutputFile(String param) throws IOException { + File file = new File(param); + file.createNewFile(); + return new BufferedOutputStream(new FileOutputStream(file), OUT_BUF_SZ); + } - // ---------------- + // ---------------- } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/AfterTimeStampFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/AfterTimeStampFilter.java index 3f9f2261..258221fe 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/AfterTimeStampFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/AfterTimeStampFilter.java @@ -29,13 +29,13 @@ public class AfterTimeStampFilter extends TimeStampFilter { - public AfterTimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException { - super(sink, timeStamp); - } + public AfterTimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException { + super(sink, timeStamp); + } - public void writeRevision(Revision revision) throws IOException { - if 
(revision.Timestamp.after(super.filterTimeStamp)) { - super.writeRevision(revision); - } - } + public void writeRevision(Revision revision) throws IOException { + if (revision.Timestamp.after(super.filterTimeStamp)) { + super.writeRevision(revision); + } + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/BeforeTimeStampFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/BeforeTimeStampFilter.java index dbaead12..65af0bbe 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/BeforeTimeStampFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/BeforeTimeStampFilter.java @@ -29,13 +29,13 @@ public class BeforeTimeStampFilter extends TimeStampFilter { - public BeforeTimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException { - super(sink, timeStamp); - } + public BeforeTimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException { + super(sink, timeStamp); + } - public void writeRevision(Revision revision) throws IOException { - if (revision.Timestamp.before(super.filterTimeStamp)) { - super.writeRevision(revision); - } - } + public void writeRevision(Revision revision) throws IOException { + if (revision.Timestamp.before(super.filterTimeStamp)) { + super.writeRevision(revision); + } + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Buffer.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Buffer.java index 93d59ff3..e9cb0396 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Buffer.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Buffer.java @@ -29,36 +29,37 @@ public final class Buffer { - private Buffer() {} + private Buffer() { + } - private static final IdentityHashMap<Thread, char[]> BUFFERS = new IdentityHashMap<>(); + private static final IdentityHashMap<Thread, char[]> BUFFERS = new 
IdentityHashMap<>(); - private static Thread lastThread; - private static char[] lastBuffer; + private static Thread lastThread; + private static char[] lastBuffer; - public static synchronized char[] get(int capacity) { - final Thread thread = Thread.currentThread(); - char[] buffer; + public static synchronized char[] get(int capacity) { + final Thread thread = Thread.currentThread(); + char[] buffer; - if (lastThread == thread) { - buffer = lastBuffer; - } else { - lastThread = thread; - buffer = lastBuffer = BUFFERS.get(thread); - } + if (lastThread == thread) { + buffer = lastBuffer; + } else { + lastThread = thread; + buffer = lastBuffer = BUFFERS.get(thread); + } - if (buffer == null) { - buffer = lastBuffer = new char[capacity]; - BUFFERS.put(thread, buffer); - } else if (buffer.length < capacity) { - int newsize = buffer.length * 2; - if (newsize < capacity) - newsize = capacity; + if (buffer == null) { + buffer = lastBuffer = new char[capacity]; + BUFFERS.put(thread, buffer); + } else if (buffer.length < capacity) { + int newsize = buffer.length * 2; + if (newsize < capacity) + newsize = capacity; - buffer = lastBuffer = new char[newsize]; - BUFFERS.put(thread, buffer); - } + buffer = lastBuffer = new char[newsize]; + BUFFERS.put(thread, buffer); + } - return buffer; - } + return buffer; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Contributor.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Contributor.java index 45229136..3706b3cd 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Contributor.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Contributor.java @@ -26,17 +26,17 @@ package org.dkpro.jwpl.mwdumper.importer; public class Contributor { - public String Username; - public int Id; - public boolean isIP = false; + public String Username; + public int Id; + public boolean isIP = false; + + public Contributor() { + this(null, 
0); + } + + public Contributor(String username, int id) { + Username = username; + Id = id; + } - public Contributor() { - this(null, 0); - } - - public Contributor(String username, int id) { - Username = username; - Id = id; - } - } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/DumpWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/DumpWriter.java index ebb3c85d..0185d888 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/DumpWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/DumpWriter.java @@ -28,16 +28,18 @@ import java.io.IOException; public interface DumpWriter { - void close() throws IOException; - - void writeStartWiki() throws IOException; - void writeEndWiki() throws IOException; - - void writeSiteinfo(Siteinfo info) throws IOException; - - void writeStartPage(Page page) throws IOException; - void writeEndPage() throws IOException; - - void writeRevision(Revision revision) throws IOException; - //void WriteUpload(Upload upload) throws IOException; // for the future + void close() throws IOException; + + void writeStartWiki() throws IOException; + + void writeEndWiki() throws IOException; + + void writeSiteinfo(Siteinfo info) throws IOException; + + void writeStartPage(Page page) throws IOException; + + void writeEndPage() throws IOException; + + void writeRevision(Revision revision) throws IOException; + //void WriteUpload(Upload upload) throws IOException; // for the future } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ExactListFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ExactListFilter.java index 7aeaac78..fa6088d7 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ExactListFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ExactListFilter.java @@ -28,11 +28,11 @@ import java.io.IOException; 
public class ExactListFilter extends ListFilter { - public ExactListFilter(DumpWriter sink, String sourceFileName) throws IOException { - super(sink, sourceFileName); - } - - protected boolean pass(Page page) { - return list.containsKey(page.Title.toString()); - } + public ExactListFilter(DumpWriter sink, String sourceFileName) throws IOException { + super(sink, sourceFileName); + } + + protected boolean pass(Page page) { + return list.containsKey(page.Title.toString()); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/LatestFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/LatestFilter.java index 26ea2c92..c94946e6 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/LatestFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/LatestFilter.java @@ -28,42 +28,42 @@ import java.io.IOException; public class LatestFilter implements DumpWriter { - final DumpWriter sink; - Revision lastRevision; - - public LatestFilter(DumpWriter sink) { - this.sink = sink; - } - - public void close() throws IOException { - sink.close(); - } - - public void writeStartWiki() throws IOException { - sink.writeStartWiki(); - } - - public void writeEndWiki() throws IOException { - sink.writeEndWiki(); - } - - public void writeSiteinfo(Siteinfo info) throws IOException { - sink.writeSiteinfo(info); - } - - public void writeStartPage(Page page) throws IOException { - sink.writeStartPage(page); - } - - public void writeEndPage() throws IOException { - if (lastRevision != null) { - sink.writeRevision(lastRevision); - lastRevision = null; - } - sink.writeEndPage(); - } - - public void writeRevision(Revision revision) { - lastRevision = revision; - } + final DumpWriter sink; + Revision lastRevision; + + public LatestFilter(DumpWriter sink) { + this.sink = sink; + } + + public void close() throws IOException { + sink.close(); + } + + public void writeStartWiki() throws 
IOException { + sink.writeStartWiki(); + } + + public void writeEndWiki() throws IOException { + sink.writeEndWiki(); + } + + public void writeSiteinfo(Siteinfo info) throws IOException { + sink.writeSiteinfo(info); + } + + public void writeStartPage(Page page) throws IOException { + sink.writeStartPage(page); + } + + public void writeEndPage() throws IOException { + if (lastRevision != null) { + sink.writeRevision(lastRevision); + lastRevision = null; + } + sink.writeEndPage(); + } + + public void writeRevision(Revision revision) { + lastRevision = revision; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ListFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ListFilter.java index e215cce1..8701ea0a 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ListFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ListFilter.java @@ -35,31 +35,31 @@ import java.util.Map; public class ListFilter extends PageFilter { - protected final Map<String, String> list; - - public ListFilter(DumpWriter sink, String sourceFileName) throws IOException { - super(sink); - list = new HashMap<>(); - BufferedReader input = new BufferedReader(new InputStreamReader(new BufferedInputStream( - new FileInputStream(sourceFileName)), StandardCharsets.UTF_8)); - String line = input.readLine(); - while (line != null) { - if (!line.startsWith("#")) { - String title = line.trim(); - title = title.replace("_", " "); - if (title.startsWith(":")) - title = line.substring(1); - - if (title.length() > 0) - list.put(title, title); - } - line = input.readLine(); - } - input.close(); - } - - protected boolean pass(Page page) { - return list.containsKey(page.Title.subjectPage().toString()) - || list.containsKey(page.Title.talkPage().toString()); - } + protected final Map<String, String> list; + + public ListFilter(DumpWriter sink, String sourceFileName) throws IOException { + 
super(sink); + list = new HashMap<>(); + BufferedReader input = new BufferedReader(new InputStreamReader(new BufferedInputStream( + new FileInputStream(sourceFileName)), StandardCharsets.UTF_8)); + String line = input.readLine(); + while (line != null) { + if (!line.startsWith("#")) { + String title = line.trim(); + title = title.replace("_", " "); + if (title.startsWith(":")) + title = line.substring(1); + + if (title.length() > 0) + list.put(title, title); + } + line = input.readLine(); + } + input.close(); + } + + protected boolean pass(Page page) { + return list.containsKey(page.Title.subjectPage().toString()) + || list.containsKey(page.Title.talkPage().toString()); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/MultiWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/MultiWriter.java index d2d11560..cd93c524 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/MultiWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/MultiWriter.java @@ -30,62 +30,62 @@ import java.util.List; public class MultiWriter implements DumpWriter { - private final List<DumpWriter> sinks; - - public MultiWriter() { - sinks = new ArrayList<>(); - } - - public void close() throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.close(); - } - } - - public void writeStartWiki() throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeStartWiki(); - } - } - - public void writeEndWiki() throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeEndWiki(); - } - } - - public void writeSiteinfo(Siteinfo info) throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeSiteinfo(info); - } - } - - public void writeStartPage(Page page) throws IOException { - for (int i = 
0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeStartPage(page); - } - } - - public void writeEndPage() throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeEndPage(); - } - } - - public void writeRevision(Revision revision) throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeRevision(revision); - } - } - - public void add(DumpWriter sink) { - sinks.add(sink); - } + private final List<DumpWriter> sinks; + + public MultiWriter() { + sinks = new ArrayList<>(); + } + + public void close() throws IOException { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.close(); + } + } + + public void writeStartWiki() throws IOException { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeStartWiki(); + } + } + + public void writeEndWiki() throws IOException { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeEndWiki(); + } + } + + public void writeSiteinfo(Siteinfo info) throws IOException { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeSiteinfo(info); + } + } + + public void writeStartPage(Page page) throws IOException { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeStartPage(page); + } + } + + public void writeEndPage() throws IOException { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeEndPage(); + } + } + + public void writeRevision(Revision revision) throws IOException { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeRevision(revision); + } + } + + public void add(DumpWriter sink) { + sinks.add(sink); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceFilter.java 
b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceFilter.java index 1ce2b01a..9bb0a053 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceFilter.java @@ -29,52 +29,52 @@ import java.util.Map; public class NamespaceFilter extends PageFilter { - final boolean invert; - final Map<Integer, String> matches; - - public NamespaceFilter(DumpWriter sink, String configString) { - super(sink); - - invert = configString.startsWith("!"); - if (invert) - configString = configString.substring(1); - matches = new HashMap<>(); - - String[] namespaceKeys = { - "NS_MAIN", - "NS_TALK", - "NS_USER", - "NS_USER_TALK", - "NS_PROJECT", - "NS_PROJECT_TALK", - "NS_IMAGE", - "NS_IMAGE_TALK", - "NS_MEDIAWIKI", - "NS_MEDIAWIKI_TALK", - "NS_TEMPLATE", - "NS_TEMPLATE_TALK", - "NS_HELP", - "NS_HELP_TALK", - "NS_CATEGORY", - "NS_CATEGORY_TALK" }; - - String[] itemList = configString.trim().split(","); - for (int i = 0; i < itemList.length; i++) { - String keyString = itemList[i]; - String trimmed = keyString.trim(); - try { - int key = Integer.parseInt(trimmed); - matches.put(key, trimmed); - } catch (NumberFormatException e) { - for (int key = 0; key < namespaceKeys.length; key++) { - if (trimmed.equalsIgnoreCase(namespaceKeys[key])) - matches.put(key, trimmed); - } - } - } - } - - protected boolean pass(Page page) { - return invert ^ matches.containsKey(page.Title.Namespace); - } + final boolean invert; + final Map<Integer, String> matches; + + public NamespaceFilter(DumpWriter sink, String configString) { + super(sink); + + invert = configString.startsWith("!"); + if (invert) + configString = configString.substring(1); + matches = new HashMap<>(); + + String[] namespaceKeys = { + "NS_MAIN", + "NS_TALK", + "NS_USER", + "NS_USER_TALK", + "NS_PROJECT", + "NS_PROJECT_TALK", + "NS_IMAGE", + "NS_IMAGE_TALK", + "NS_MEDIAWIKI", + 
"NS_MEDIAWIKI_TALK", + "NS_TEMPLATE", + "NS_TEMPLATE_TALK", + "NS_HELP", + "NS_HELP_TALK", + "NS_CATEGORY", + "NS_CATEGORY_TALK"}; + + String[] itemList = configString.trim().split(","); + for (int i = 0; i < itemList.length; i++) { + String keyString = itemList[i]; + String trimmed = keyString.trim(); + try { + int key = Integer.parseInt(trimmed); + matches.put(key, trimmed); + } catch (NumberFormatException e) { + for (int key = 0; key < namespaceKeys.length; key++) { + if (trimmed.equalsIgnoreCase(namespaceKeys[key])) + matches.put(key, trimmed); + } + } + } + } + + protected boolean pass(Page page) { + return invert ^ matches.containsKey(page.Title.Namespace); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceSet.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceSet.java index ee65aacb..ef3f1c95 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceSet.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceSet.java @@ -31,43 +31,43 @@ import java.util.Map; public class NamespaceSet { - private final Map<String, Integer> byname; - private final Map<Integer, String> bynumber; - - public NamespaceSet() { - byname = new HashMap<>(); - bynumber = new LinkedHashMap<>(); - } - - public void add(Integer index, String prefix) { - byname.put(prefix, index); - bynumber.put(index, prefix); - } - - public boolean hasPrefix(String prefix) { - return byname.containsKey(prefix); - } - - public boolean hasIndex(Integer index) { - return bynumber.containsKey(index); - } - - public String getPrefix(Integer index) { - return bynumber.get(index); - } - - public Integer getIndex(String prefix) { - return byname.get(prefix); - } - - public String getColonPrefix(Integer index) { - String prefix = getPrefix(index); - if (index != 0) - return prefix.concat(":"); - return prefix; - } - - public Iterator<Map.Entry<Integer, String>> 
orderedEntries() { - return bynumber.entrySet().iterator(); - } + private final Map<String, Integer> byname; + private final Map<Integer, String> bynumber; + + public NamespaceSet() { + byname = new HashMap<>(); + bynumber = new LinkedHashMap<>(); + } + + public void add(Integer index, String prefix) { + byname.put(prefix, index); + bynumber.put(index, prefix); + } + + public boolean hasPrefix(String prefix) { + return byname.containsKey(prefix); + } + + public boolean hasIndex(Integer index) { + return bynumber.containsKey(index); + } + + public String getPrefix(Integer index) { + return bynumber.get(index); + } + + public Integer getIndex(String prefix) { + return byname.get(prefix); + } + + public String getColonPrefix(Integer index) { + String prefix = getPrefix(index); + if (index != 0) + return prefix.concat(":"); + return prefix; + } + + public Iterator<Map.Entry<Integer, String>> orderedEntries() { + return bynumber.entrySet().iterator(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NotalkFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NotalkFilter.java index e1101c3f..f8b8bcf3 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NotalkFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NotalkFilter.java @@ -26,11 +26,11 @@ package org.dkpro.jwpl.mwdumper.importer; public class NotalkFilter extends PageFilter { - public NotalkFilter(DumpWriter sink) { - super(sink); - } + public NotalkFilter(DumpWriter sink) { + super(sink); + } - protected boolean pass(Page page) { - return !page.Title.isTalk(); - } + protected boolean pass(Page page) { + return !page.Title.isTalk(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Page.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Page.java index 440b712e..91cdb13d 100644 --- 
a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Page.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Page.java @@ -28,14 +28,14 @@ import java.util.Hashtable; public class Page { - public Title Title; - public int Id; - public final Hashtable<String, Object> DiscussionThreadingInfo; - public String Restrictions; - - public Page() { - // <restrictions> is optional... - Restrictions = ""; - DiscussionThreadingInfo = new Hashtable<>(); - } + public Title Title; + public int Id; + public final Hashtable<String, Object> DiscussionThreadingInfo; + public String Restrictions; + + public Page() { + // <restrictions> is optional... + Restrictions = ""; + DiscussionThreadingInfo = new Hashtable<>(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/PageFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/PageFilter.java index 99315144..26b28d5d 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/PageFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/PageFilter.java @@ -28,46 +28,46 @@ import java.io.IOException; public abstract class PageFilter implements DumpWriter { - final DumpWriter sink; - boolean showThisPage; - - public PageFilter(DumpWriter sink) { - this.sink = sink; - } - - public void close() throws IOException { - sink.close(); - } - - public void writeStartWiki() throws IOException { - sink.writeStartWiki(); - } - - public void writeEndWiki() throws IOException { - sink.writeEndWiki(); - } - - public void writeSiteinfo(Siteinfo info) throws IOException { - sink.writeSiteinfo(info); - } - - public void writeStartPage(Page page) throws IOException { - showThisPage = pass(page); - if (showThisPage) - sink.writeStartPage(page); - } - - public void writeEndPage() throws IOException { - if (showThisPage) - sink.writeEndPage(); - } - - public void writeRevision(Revision revision) throws 
IOException { - if (showThisPage) - sink.writeRevision(revision); - } - - protected boolean pass(Page page) { - return true; - } + final DumpWriter sink; + boolean showThisPage; + + public PageFilter(DumpWriter sink) { + this.sink = sink; + } + + public void close() throws IOException { + sink.close(); + } + + public void writeStartWiki() throws IOException { + sink.writeStartWiki(); + } + + public void writeEndWiki() throws IOException { + sink.writeEndWiki(); + } + + public void writeSiteinfo(Siteinfo info) throws IOException { + sink.writeSiteinfo(info); + } + + public void writeStartPage(Page page) throws IOException { + showThisPage = pass(page); + if (showThisPage) + sink.writeStartPage(page); + } + + public void writeEndPage() throws IOException { + if (showThisPage) + sink.writeEndPage(); + } + + public void writeRevision(Revision revision) throws IOException { + if (showThisPage) + sink.writeRevision(revision); + } + + protected boolean pass(Page page) { + return true; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Revision.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Revision.java index a05986d4..611f9b6e 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Revision.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Revision.java @@ -28,21 +28,21 @@ import java.util.Calendar; public class Revision { - public int Id; - public Calendar Timestamp; - public Contributor Contributor; - public String Comment; - public String Text; - public boolean Minor; - - public boolean isRedirect() { - // FIXME - return Text.startsWith("#REDIRECT ") || Text.startsWith("#redirect "); - } - - public Revision() { - Comment = ""; - Text = ""; - Minor = false; - } + public int Id; + public Calendar Timestamp; + public Contributor Contributor; + public String Comment; + public String Text; + public boolean Minor; + + public boolean isRedirect() { + // FIXME + 
return Text.startsWith("#REDIRECT ") || Text.startsWith("#redirect "); + } + + public Revision() { + Comment = ""; + Text = ""; + Minor = false; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/RevisionListFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/RevisionListFilter.java index f8d1e2b0..a0c6bd08 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/RevisionListFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/RevisionListFilter.java @@ -35,61 +35,61 @@ import java.util.TreeSet; public class RevisionListFilter implements DumpWriter { - final DumpWriter sink; - protected final Set<String> revIds; - protected Page currentPage; - protected boolean pageWritten; - - public RevisionListFilter(DumpWriter sink, String sourceFileName) throws IOException { - this.sink = sink; - revIds = new TreeSet<>(); - BufferedReader input = new BufferedReader(new InputStreamReader(new BufferedInputStream( - new FileInputStream(sourceFileName)), StandardCharsets.UTF_8)); - String line = input.readLine(); - while (line != null) { - line = line.trim(); - if (line.length() > 0 && !line.startsWith("#")) { - revIds.add(line); - } - line = input.readLine(); - } - input.close(); - } - - public void close() throws IOException { - sink.close(); - } - - public void writeStartWiki() throws IOException { - sink.writeStartWiki(); - } - - public void writeEndWiki() throws IOException { - sink.writeEndWiki(); - } - - public void writeSiteinfo(Siteinfo info) throws IOException { - sink.writeSiteinfo(info); - } - - public void writeStartPage(Page page) throws IOException { - currentPage = page; - pageWritten = false; - } - - public void writeEndPage() throws IOException { - if (pageWritten) { - sink.writeEndPage(); - } - } - - public void writeRevision(Revision revision) throws IOException { - if (revIds.contains(Integer.valueOf(revision.Id).toString())) { - if 
(!pageWritten) { - sink.writeStartPage(currentPage); - pageWritten = true; - } - sink.writeRevision(revision); - } - } + final DumpWriter sink; + protected final Set<String> revIds; + protected Page currentPage; + protected boolean pageWritten; + + public RevisionListFilter(DumpWriter sink, String sourceFileName) throws IOException { + this.sink = sink; + revIds = new TreeSet<>(); + BufferedReader input = new BufferedReader(new InputStreamReader(new BufferedInputStream( + new FileInputStream(sourceFileName)), StandardCharsets.UTF_8)); + String line = input.readLine(); + while (line != null) { + line = line.trim(); + if (line.length() > 0 && !line.startsWith("#")) { + revIds.add(line); + } + line = input.readLine(); + } + input.close(); + } + + public void close() throws IOException { + sink.close(); + } + + public void writeStartWiki() throws IOException { + sink.writeStartWiki(); + } + + public void writeEndWiki() throws IOException { + sink.writeEndWiki(); + } + + public void writeSiteinfo(Siteinfo info) throws IOException { + sink.writeSiteinfo(info); + } + + public void writeStartPage(Page page) throws IOException { + currentPage = page; + pageWritten = false; + } + + public void writeEndPage() throws IOException { + if (pageWritten) { + sink.writeEndPage(); + } + } + + public void writeRevision(Revision revision) throws IOException { + if (revIds.contains(Integer.valueOf(revision.Id).toString())) { + if (!pageWritten) { + sink.writeStartPage(currentPage); + pageWritten = true; + } + sink.writeRevision(revision); + } + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Siteinfo.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Siteinfo.java index c4487e1b..846dcddd 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Siteinfo.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Siteinfo.java @@ -26,9 +26,9 @@ package org.dkpro.jwpl.mwdumper.importer; 
public class Siteinfo { - public String Sitename; - public String Base; - public String Generator; - public String Case; - public NamespaceSet Namespaces; + public String Sitename; + public String Base; + public String Generator; + public String Case; + public NamespaceSet Namespaces; } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SphinxWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SphinxWriter.java index fa27c7fc..8d29b875 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SphinxWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SphinxWriter.java @@ -33,62 +33,62 @@ * Generates XML stream suitable for the Sphinx search engine's xmlpipe input. */ public class SphinxWriter implements DumpWriter { - protected final OutputStream stream; - protected final XmlWriter writer; - protected Page _page; - protected Revision _rev; - - public SphinxWriter(OutputStream output) { - stream = output; - writer = new XmlWriter(stream); - } - - public void close() throws IOException { - writer.close(); - } - - public void writeStartWiki() throws IOException { - writer.openXml(); - // No containing element to open - } - - public void writeEndWiki() throws IOException { - // No containing element to close - writer.closeXml(); - } - - public void writeSiteinfo(Siteinfo info) throws IOException { - // Nothing! - } - - public void writeStartPage(Page page) throws IOException { - _page = page; - } - - /** - * FIXME What's the "group" number here do? - * FIXME preprocess the text to strip some formatting? 
- */ - public void writeEndPage() throws IOException { - writer.openElement("document"); - writer.textElement("id", Integer.toString(_page.Id)); - writer.textElement("group", "0"); - writer.textElement("timestamp", formatTimestamp(_rev.Timestamp)); - writer.textElement("title", _page.Title.toString()); - writer.textElement("body", _rev.Text); - writer.closeElement(); - _rev = null; - _page = null; - } - - public void writeRevision(Revision rev) throws IOException { - _rev = rev; - } - - /** - * FIXME double-check that it wants Unix timestamp - */ - static String formatTimestamp(Calendar ts) { - return Long.toString(ts.getTimeInMillis() / 1000L); - } + protected final OutputStream stream; + protected final XmlWriter writer; + protected Page _page; + protected Revision _rev; + + public SphinxWriter(OutputStream output) { + stream = output; + writer = new XmlWriter(stream); + } + + public void close() throws IOException { + writer.close(); + } + + public void writeStartWiki() throws IOException { + writer.openXml(); + // No containing element to open + } + + public void writeEndWiki() throws IOException { + // No containing element to close + writer.closeXml(); + } + + public void writeSiteinfo(Siteinfo info) throws IOException { + // Nothing! + } + + public void writeStartPage(Page page) throws IOException { + _page = page; + } + + /** + * FIXME What's the "group" number here do? + * FIXME preprocess the text to strip some formatting? 
+ */ + public void writeEndPage() throws IOException { + writer.openElement("document"); + writer.textElement("id", Integer.toString(_page.Id)); + writer.textElement("group", "0"); + writer.textElement("timestamp", formatTimestamp(_rev.Timestamp)); + writer.textElement("title", _page.Title.toString()); + writer.textElement("body", _rev.Text); + writer.closeElement(); + _rev = null; + _page = null; + } + + public void writeRevision(Revision rev) throws IOException { + _rev = rev; + } + + /** + * FIXME double-check that it wants Unix timestamp + */ + static String formatTimestamp(Calendar ts) { + return Long.toString(ts.getTimeInMillis() / 1000L); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlFileStream.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlFileStream.java index 4f83b4fe..7037f7be 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlFileStream.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlFileStream.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -23,23 +23,23 @@ import java.nio.charset.StandardCharsets; public class SqlFileStream implements SqlStream { - protected final PrintStream stream; - - public SqlFileStream(OutputStream output) throws IOException { - this.stream = new PrintStream(output, false, StandardCharsets.UTF_8); - } - - public void writeComment(CharSequence sql) { - stream.println(sql.toString()); - } - - public void writeStatement(CharSequence sql) { - stream.print(sql.toString()); - stream.println(';'); - } - - public void close() { - stream.flush(); - stream.close(); - } + protected final PrintStream stream; + + public SqlFileStream(OutputStream output) throws IOException { + this.stream = new PrintStream(output, false, StandardCharsets.UTF_8); + } + + public void writeComment(CharSequence sql) { + stream.println(sql.toString()); + } + + public void writeStatement(CharSequence sql) { + stream.print(sql.toString()); + stream.println(';'); + } + + public void close() { + stream.flush(); + stream.close(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlLiteral.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlLiteral.java index 1ee5d85a..5c15b0e5 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlLiteral.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlLiteral.java @@ -29,13 +29,13 @@ * Quickie wrapper class for including literal SQL expressions. 
*/ public class SqlLiteral { - final String contents; - - public SqlLiteral(String contents) { - this.contents = contents; - } - - public String toString() { - return contents; - } + final String contents; + + public SqlLiteral(String contents) { + this.contents = contents; + } + + public String toString() { + return contents; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlServerStream.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlServerStream.java index 29f5b456..c33e5815 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlServerStream.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlServerStream.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -24,35 +24,35 @@ import java.sql.Statement; public class SqlServerStream implements SqlStream { - private final Connection connection; - - public SqlServerStream(Connection conn) { - connection = conn; // TODO - } - - public void writeComment(CharSequence sql) { - // do nothing - } - - public void writeStatement(CharSequence sql) throws IOException { - Statement statement; - try { - statement = connection.createStatement(); - statement.setEscapeProcessing(false); - statement.execute(sql.toString()); - } catch (SQLException e) { - throw new IOException(e.toString()); - } - } - - public void close() throws IOException { - try { - connection.close(); - } catch (SQLWarning e) { - e.printStackTrace(); - } catch (SQLException e) { - throw new IOException(e.toString()); - } - } + private final Connection connection; + + public SqlServerStream(Connection conn) { + connection = conn; // TODO + } + + public void writeComment(CharSequence sql) { + // do nothing + } + + public void writeStatement(CharSequence sql) throws IOException { + Statement statement; + try { + statement = connection.createStatement(); + statement.setEscapeProcessing(false); + statement.execute(sql.toString()); + } catch (SQLException e) { + throw new IOException(e.toString()); + } + } + + public void close() throws IOException { + try { + connection.close(); + } catch (SQLWarning e) { + e.printStackTrace(); + } catch (SQLException e) { + throw new IOException(e.toString()); + } + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlStream.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlStream.java index ff706282..5c5c4a1e 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlStream.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlStream.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,7 +20,9 @@ import java.io.IOException; public interface SqlStream { - void writeComment(CharSequence sql) throws IOException; - void writeStatement(CharSequence sql) throws IOException; - void close() throws IOException; + void writeComment(CharSequence sql) throws IOException; + + void writeStatement(CharSequence sql) throws IOException; + + void close() throws IOException; } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter.java index 718ce2e5..609f2f16 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter.java @@ -36,323 +36,343 @@ public abstract class SqlWriter implements DumpWriter { - public static abstract class Traits { - public abstract SqlLiteral getCurrentTime(); - public abstract SqlLiteral getRandom(); - public abstract String getTextTable(); - public abstract boolean supportsMultiRowInsert(); - public abstract MessageFormat getTimestampFormatter(); - public String getWikiPrologue() { - return null; - } - public String getWikiEpilogue() { - return null; - } - } - - public static class MySQLTraits extends Traits { - // UTC_TIMESTAMP() is new in MySQL 4.1 or 5.0, so 
using this - // godawful hack found in documentation comments: - public SqlLiteral getCurrentTime() { - return new SqlLiteral("DATE_ADD('1970-01-01', INTERVAL UNIX_TIMESTAMP() SECOND)+0"); - } - public SqlLiteral getRandom() { - return new SqlLiteral("RAND()"); - } - public boolean supportsMultiRowInsert() { - return true; - } - public String getTextTable() { - return "text"; - } - private static final MessageFormat timestampFormatter = new MessageFormat( - "{0,number,0000}{1,number,00}{2,number,00}{3,number,00}{4,number,00}{5,number,00}"); - public MessageFormat getTimestampFormatter() { - return timestampFormatter; - } - } - - public static class PostgresTraits extends Traits { - public SqlLiteral getCurrentTime() { - return new SqlLiteral("current_timestamp AT TIME ZONE 'UTC'"); - } - public SqlLiteral getRandom() { - return new SqlLiteral("RANDOM()"); - } - public boolean supportsMultiRowInsert() { - return false; - } - public String getTextTable() { - return "pagecontent"; - } - private static final MessageFormat timestampFormatter = new MessageFormat( - "{0,number,0000}-{1,number,00}-{2,number,00} {3,number,00}:{4,number,00}:{5,number,00}"); - public MessageFormat getTimestampFormatter() { - return timestampFormatter; - } - public String getWikiPrologue() { - return -"ALTER TABLE revision DISABLE TRIGGER ALL;" + -"ALTER TABLE page DISABLE TRIGGER ALL;"; - } - public String getWikiEpilogue() { - return -"ALTER TABLE revision ENABLE TRIGGER ALL;" + -"ALTER TABLE page ENABLE TRIGGER ALL;"; - } - } - - private final SqlStream stream; - private String tablePrefix = ""; - - protected static final Integer ONE = 1; - protected static final Integer ZERO = 0; - protected final Traits traits; - - public SqlWriter(Traits tr, SqlStream output) { - stream = output; - traits = tr; - } - - public SqlWriter(Traits tr, SqlStream output, String prefix) { - stream = output; - tablePrefix = prefix; - traits = tr; - } - - public void close() throws IOException { - stream.close(); 
- } - - public void writeStartWiki() throws IOException { - stream.writeComment("-- MediaWiki XML dump converted to SQL by mwdumper"); - stream.writeStatement("BEGIN"); - - String prologue = traits.getWikiPrologue(); - if (prologue != null) - stream.writeStatement(prologue); - } - - public void writeEndWiki() throws IOException { - flushInsertBuffers(); - - String epilogue = traits.getWikiEpilogue(); - if (epilogue != null) - stream.writeStatement(epilogue); - stream.writeStatement("COMMIT"); - stream.writeComment("-- DONE"); - } - - public void writeSiteinfo(Siteinfo info) throws IOException { - stream.writeComment(""); - stream.writeComment("-- Site: " + commentSafe(info.Sitename)); - stream.writeComment("-- URL: " + commentSafe(info.Base)); - stream.writeComment("-- Generator: " + commentSafe(info.Generator)); - stream.writeComment("-- Case: " + commentSafe(info.Case)); - stream.writeComment("--"); - stream.writeComment("-- Namespaces:"); - for (Iterator<Map.Entry<Integer, String>> i = info.Namespaces.orderedEntries(); i.hasNext();) { - Map.Entry<Integer, String> e = i.next(); - stream.writeComment("-- " + e.getKey() + ": " + e.getValue()); - } - stream.writeComment(""); - } - - public abstract void writeStartPage(Page page) throws IOException; - - public abstract void writeEndPage() throws IOException; - - public abstract void writeRevision(Revision revision) throws IOException; - - protected String commentSafe(String text) { - return text; - } - - private final Map<CharSequence, StringBuffer> insertBuffers = new HashMap<>(); - private static final int blockSize = 1024 * 512; // default 512k inserts - - protected void bufferInsertRow(String table, Object[][] row) throws IOException { - StringBuffer sql = insertBuffers.get(table); - if (sql != null) { - if (traits.supportsMultiRowInsert() && (sql.length() < blockSize)) { - sql.append(','); - appendInsertValues(sql, row); - return; - } else { - flushInsertBuffer(table); - } - } - sql = new 
StringBuffer(blockSize); - synchronized (sql) { //only for StringBuffer - appendInsertStatement(sql, table, row); - insertBuffers.put(table, sql); - } - } - - protected void flushInsertBuffer(String table) throws IOException { - stream.writeStatement(insertBuffers.get(table)); - insertBuffers.remove(table); - } - - protected void flushInsertBuffers() throws IOException { + public static abstract class Traits { + public abstract SqlLiteral getCurrentTime(); + + public abstract SqlLiteral getRandom(); + + public abstract String getTextTable(); + + public abstract boolean supportsMultiRowInsert(); + + public abstract MessageFormat getTimestampFormatter(); + + public String getWikiPrologue() { + return null; + } + + public String getWikiEpilogue() { + return null; + } + } + + public static class MySQLTraits extends Traits { + // UTC_TIMESTAMP() is new in MySQL 4.1 or 5.0, so using this + // godawful hack found in documentation comments: + public SqlLiteral getCurrentTime() { + return new SqlLiteral("DATE_ADD('1970-01-01', INTERVAL UNIX_TIMESTAMP() SECOND)+0"); + } + + public SqlLiteral getRandom() { + return new SqlLiteral("RAND()"); + } + + public boolean supportsMultiRowInsert() { + return true; + } + + public String getTextTable() { + return "text"; + } + + private static final MessageFormat timestampFormatter = new MessageFormat( + "{0,number,0000}{1,number,00}{2,number,00}{3,number,00}{4,number,00}{5,number,00}"); + + public MessageFormat getTimestampFormatter() { + return timestampFormatter; + } + } + + public static class PostgresTraits extends Traits { + public SqlLiteral getCurrentTime() { + return new SqlLiteral("current_timestamp AT TIME ZONE 'UTC'"); + } + + public SqlLiteral getRandom() { + return new SqlLiteral("RANDOM()"); + } + + public boolean supportsMultiRowInsert() { + return false; + } + + public String getTextTable() { + return "pagecontent"; + } + + private static final MessageFormat timestampFormatter = new MessageFormat( + 
"{0,number,0000}-{1,number,00}-{2,number,00} {3,number,00}:{4,number,00}:{5,number,00}"); + + public MessageFormat getTimestampFormatter() { + return timestampFormatter; + } + + public String getWikiPrologue() { + return + "ALTER TABLE revision DISABLE TRIGGER ALL;" + + "ALTER TABLE page DISABLE TRIGGER ALL;"; + } + + public String getWikiEpilogue() { + return + "ALTER TABLE revision ENABLE TRIGGER ALL;" + + "ALTER TABLE page ENABLE TRIGGER ALL;"; + } + } + + private final SqlStream stream; + private String tablePrefix = ""; + + protected static final Integer ONE = 1; + protected static final Integer ZERO = 0; + protected final Traits traits; + + public SqlWriter(Traits tr, SqlStream output) { + stream = output; + traits = tr; + } + + public SqlWriter(Traits tr, SqlStream output, String prefix) { + stream = output; + tablePrefix = prefix; + traits = tr; + } + + public void close() throws IOException { + stream.close(); + } + + public void writeStartWiki() throws IOException { + stream.writeComment("-- MediaWiki XML dump converted to SQL by mwdumper"); + stream.writeStatement("BEGIN"); + + String prologue = traits.getWikiPrologue(); + if (prologue != null) + stream.writeStatement(prologue); + } + + public void writeEndWiki() throws IOException { + flushInsertBuffers(); + + String epilogue = traits.getWikiEpilogue(); + if (epilogue != null) + stream.writeStatement(epilogue); + stream.writeStatement("COMMIT"); + stream.writeComment("-- DONE"); + } + + public void writeSiteinfo(Siteinfo info) throws IOException { + stream.writeComment(""); + stream.writeComment("-- Site: " + commentSafe(info.Sitename)); + stream.writeComment("-- URL: " + commentSafe(info.Base)); + stream.writeComment("-- Generator: " + commentSafe(info.Generator)); + stream.writeComment("-- Case: " + commentSafe(info.Case)); + stream.writeComment("--"); + stream.writeComment("-- Namespaces:"); + for (Iterator<Map.Entry<Integer, String>> i = info.Namespaces.orderedEntries(); i.hasNext(); ) { + 
Map.Entry<Integer, String> e = i.next(); + stream.writeComment("-- " + e.getKey() + ": " + e.getValue()); + } + stream.writeComment(""); + } + + public abstract void writeStartPage(Page page) throws IOException; + + public abstract void writeEndPage() throws IOException; + + public abstract void writeRevision(Revision revision) throws IOException; + + protected String commentSafe(String text) { + return text; + } + + private final Map<CharSequence, StringBuffer> insertBuffers = new HashMap<>(); + private static final int blockSize = 1024 * 512; // default 512k inserts + + protected void bufferInsertRow(String table, Object[][] row) throws IOException { + StringBuffer sql = insertBuffers.get(table); + if (sql != null) { + if (traits.supportsMultiRowInsert() && (sql.length() < blockSize)) { + sql.append(','); + appendInsertValues(sql, row); + return; + } else { + flushInsertBuffer(table); + } + } + sql = new StringBuffer(blockSize); + synchronized (sql) { //only for StringBuffer + appendInsertStatement(sql, table, row); + insertBuffers.put(table, sql); + } + } + + protected void flushInsertBuffer(String table) throws IOException { + stream.writeStatement(insertBuffers.get(table)); + insertBuffers.remove(table); + } + + protected void flushInsertBuffers() throws IOException { for (StringBuffer stringBuffer : insertBuffers.values()) { stream.writeStatement(stringBuffer); } - insertBuffers.clear(); - } - - protected void insertRow(String table, Object[][] row) throws IOException { - StringBuffer sql = new StringBuffer(65536); - appendInsertStatement(sql, table, row); - stream.writeStatement(sql); - } - - private void appendInsertStatement(StringBuffer sql, String table, Object[][] row) { - sql.append("INSERT INTO "); - sql.append(tablePrefix); - sql.append(table); - sql.append(" ("); - - for (int i = 0; i < row.length; i++) { - String field = (String)row[i][0]; - if (i > 0) - sql.append(','); - sql.append(field); - } - sql.append(") VALUES "); - appendInsertValues(sql, 
row); - } - - private static void appendInsertValues(StringBuffer sql, Object[][] row) { - sql.append('('); - for (int i = 0; i < row.length; i++) { - Object val = row[i][1]; - if (i > 0) - sql.append(','); - sql.append(sqlSafe(val)); - } - sql.append(')'); - } - - protected void updateRow(String table, Object[][] row, String keyField, Object keyValue) throws IOException { - StringBuffer sql = new StringBuffer(65536); - synchronized (sql) { //only for StringBuffer - sql.append("UPDATE "); - sql.append(tablePrefix); - sql.append(table); - sql.append(" SET "); - - for (int i = 0; i < row.length; i++) { - String field = (String)row[i][0]; - Object val = row[i][1]; - if (i > 0) - sql.append(','); - sql.append(field); - sql.append('='); - sql.append(sqlSafe(val)); - } - - sql.append(" WHERE "); - sql.append(keyField); - sql.append('='); - sql.append(sqlSafe(keyValue)); - - stream.writeStatement(sql); - } - } - - protected static String sqlSafe(Object val) { - if (val == null) - return "NULL"; - - String str = val.toString(); - if (val instanceof String) { - return sqlEscape(str); - } else if (val instanceof Integer) { - return str; - } else if (val instanceof Double) { - return str; - } else if (val instanceof SqlLiteral) { - return str; - } else { - throw new IllegalArgumentException("Unknown type in SQL"); - } - } - - protected static String sqlEscape(String str) { - if (str.length() == 0) - return "''"; //TODO "NULL",too ? 
- final int len = str.length(); - StringBuffer sql = new StringBuffer(len * 2); - synchronized (sql) { //only for StringBuffer - sql.append('\''); - for (int i = 0; i < len; i++) { - char c = str.charAt(i); - switch (c) { - case '\u0000': - sql.append('\\').append('0'); - break; - case '\n': - sql.append('\\').append('n'); - break; - case '\r': - sql.append('\\').append('r'); - break; - case '\u001a': - sql.append('\\').append('Z'); - break; - case '"': - case '\'': - case '\\': - sql.append('\\'); - // fall through - default: - sql.append(c); - break; - } - } - sql.append('\''); - return sql.toString(); - } - } - - protected static String titleFormat(String title) { - return title.replace(' ', '_'); - } - - protected String timestampFormat(Calendar time) { - return traits.getTimestampFormatter().format(new Object[] { - time.get(Calendar.YEAR), - time.get(Calendar.MONTH) + 1, - time.get(Calendar.DAY_OF_MONTH), - time.get(Calendar.HOUR_OF_DAY), - time.get(Calendar.MINUTE), - time.get(Calendar.SECOND)}); - } - - protected String inverseTimestamp(Calendar time) { - return traits.getTimestampFormatter().format(new Object[] { - 9999 - time.get(Calendar.YEAR), - 99 - time.get(Calendar.MONTH) - 1, - 99 - time.get(Calendar.DAY_OF_MONTH), - 99 - time.get(Calendar.HOUR_OF_DAY), - 99 - time.get(Calendar.MINUTE), - 99 - time.get(Calendar.SECOND)}); - } - - private static final TimeZone utc = TimeZone.getTimeZone("UTC"); - protected static GregorianCalendar now() { - return new GregorianCalendar(utc); - } - - final int commitInterval = 1000; // Commit a transaction every n pages - int pageCount = 0; - protected void checkpoint() throws IOException { - pageCount++; - if (pageCount % commitInterval == 0) { - flushInsertBuffers(); - stream.writeStatement("COMMIT"); - stream.writeStatement("BEGIN"); - } - } + insertBuffers.clear(); + } + + protected void insertRow(String table, Object[][] row) throws IOException { + StringBuffer sql = new StringBuffer(65536); + 
appendInsertStatement(sql, table, row); + stream.writeStatement(sql); + } + + private void appendInsertStatement(StringBuffer sql, String table, Object[][] row) { + sql.append("INSERT INTO "); + sql.append(tablePrefix); + sql.append(table); + sql.append(" ("); + + for (int i = 0; i < row.length; i++) { + String field = (String) row[i][0]; + if (i > 0) + sql.append(','); + sql.append(field); + } + sql.append(") VALUES "); + appendInsertValues(sql, row); + } + + private static void appendInsertValues(StringBuffer sql, Object[][] row) { + sql.append('('); + for (int i = 0; i < row.length; i++) { + Object val = row[i][1]; + if (i > 0) + sql.append(','); + sql.append(sqlSafe(val)); + } + sql.append(')'); + } + + protected void updateRow(String table, Object[][] row, String keyField, Object keyValue) throws IOException { + StringBuffer sql = new StringBuffer(65536); + synchronized (sql) { //only for StringBuffer + sql.append("UPDATE "); + sql.append(tablePrefix); + sql.append(table); + sql.append(" SET "); + + for (int i = 0; i < row.length; i++) { + String field = (String) row[i][0]; + Object val = row[i][1]; + if (i > 0) + sql.append(','); + sql.append(field); + sql.append('='); + sql.append(sqlSafe(val)); + } + + sql.append(" WHERE "); + sql.append(keyField); + sql.append('='); + sql.append(sqlSafe(keyValue)); + + stream.writeStatement(sql); + } + } + + protected static String sqlSafe(Object val) { + if (val == null) + return "NULL"; + + String str = val.toString(); + if (val instanceof String) { + return sqlEscape(str); + } else if (val instanceof Integer) { + return str; + } else if (val instanceof Double) { + return str; + } else if (val instanceof SqlLiteral) { + return str; + } else { + throw new IllegalArgumentException("Unknown type in SQL"); + } + } + + protected static String sqlEscape(String str) { + if (str.length() == 0) + return "''"; //TODO "NULL",too ? 
+ final int len = str.length(); + StringBuffer sql = new StringBuffer(len * 2); + synchronized (sql) { //only for StringBuffer + sql.append('\''); + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + switch (c) { + case '\u0000': + sql.append('\\').append('0'); + break; + case '\n': + sql.append('\\').append('n'); + break; + case '\r': + sql.append('\\').append('r'); + break; + case '\u001a': + sql.append('\\').append('Z'); + break; + case '"': + case '\'': + case '\\': + sql.append('\\'); + // fall through + default: + sql.append(c); + break; + } + } + sql.append('\''); + return sql.toString(); + } + } + + protected static String titleFormat(String title) { + return title.replace(' ', '_'); + } + + protected String timestampFormat(Calendar time) { + return traits.getTimestampFormatter().format(new Object[]{ + time.get(Calendar.YEAR), + time.get(Calendar.MONTH) + 1, + time.get(Calendar.DAY_OF_MONTH), + time.get(Calendar.HOUR_OF_DAY), + time.get(Calendar.MINUTE), + time.get(Calendar.SECOND)}); + } + + protected String inverseTimestamp(Calendar time) { + return traits.getTimestampFormatter().format(new Object[]{ + 9999 - time.get(Calendar.YEAR), + 99 - time.get(Calendar.MONTH) - 1, + 99 - time.get(Calendar.DAY_OF_MONTH), + 99 - time.get(Calendar.HOUR_OF_DAY), + 99 - time.get(Calendar.MINUTE), + 99 - time.get(Calendar.SECOND)}); + } + + private static final TimeZone utc = TimeZone.getTimeZone("UTC"); + + protected static GregorianCalendar now() { + return new GregorianCalendar(utc); + } + + final int commitInterval = 1000; // Commit a transaction every n pages + int pageCount = 0; + + protected void checkpoint() throws IOException { + pageCount++; + if (pageCount % commitInterval == 0) { + flushInsertBuffers(); + stream.writeStatement("COMMIT"); + stream.writeStatement("BEGIN"); + } + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter14.java 
b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter14.java index 2af04d6d..c854e1ce 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter14.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter14.java @@ -28,67 +28,67 @@ import java.io.IOException; public class SqlWriter14 extends SqlWriter { - private Page currentPage; - private Revision lastRevision; - - public SqlWriter14(SqlWriter.Traits tr, SqlStream output) { - super(tr, output); - } - - public SqlWriter14(SqlWriter.Traits tr, SqlStream output, String prefix) { - super(tr, output, prefix); - } - - public void writeStartPage(Page page) { - currentPage = page; - lastRevision = null; - } - - public void writeEndPage() throws IOException { - if (lastRevision != null) - writeCurRevision(currentPage, lastRevision); - currentPage = null; - lastRevision = null; - } - - public void writeRevision(Revision revision) throws IOException { - if (lastRevision != null) - writeOldRevision(currentPage, lastRevision); - lastRevision = revision; - } - - private void writeOldRevision(Page page, Revision revision) throws IOException { - bufferInsertRow("old", new Object[][] { - {"old_id", revision.Id}, - {"old_namespace", page.Title.Namespace}, - {"old_title", titleFormat(page.Title.Text)}, - {"old_text", revision.Text == null ? "" : revision.Text}, - {"old_comment", revision.Comment == null ? "" : revision.Comment}, - {"old_user", revision.Contributor.Username == null ? ZERO : revision.Contributor.Id}, - {"old_user_text", revision.Contributor.Username == null ? "" : revision.Contributor.Username}, - {"old_timestamp", timestampFormat(revision.Timestamp)}, - {"old_minor_edit", revision.Minor ? 
ONE : ZERO}, - {"old_flags", "utf-8"}, - {"inverse_timestamp", inverseTimestamp(revision.Timestamp)}}); - } - - private void writeCurRevision(Page page, Revision revision) throws IOException { - bufferInsertRow("cur", new Object[][] { - {"cur_id", page.Id}, - {"cur_namespace", page.Title.Namespace}, - {"cur_title", titleFormat(page.Title.Text)}, - {"cur_text", revision.Text == null ? "" : revision.Text}, - {"cur_comment", revision.Comment == null ? "" : revision.Comment}, - {"cur_user", revision.Contributor.Username == null ? ZERO : Integer.valueOf(revision.Contributor.Id)}, - {"cur_user_text", revision.Contributor.Username == null ? "" : revision.Contributor.Username}, - {"cur_timestamp", timestampFormat(revision.Timestamp)}, - {"cur_restrictions", page.Restrictions}, - {"cur_counter", ZERO}, - {"cur_is_redirect", revision.isRedirect() ? ONE : ZERO}, - {"cur_minor_edit", revision.Minor ? ONE : ZERO}, - {"cur_random", traits.getRandom()}, - {"cur_touched", traits.getCurrentTime()}, - {"inverse_timestamp", inverseTimestamp(revision.Timestamp)}}); - checkpoint(); - } + private Page currentPage; + private Revision lastRevision; + + public SqlWriter14(SqlWriter.Traits tr, SqlStream output) { + super(tr, output); + } + + public SqlWriter14(SqlWriter.Traits tr, SqlStream output, String prefix) { + super(tr, output, prefix); + } + + public void writeStartPage(Page page) { + currentPage = page; + lastRevision = null; + } + + public void writeEndPage() throws IOException { + if (lastRevision != null) + writeCurRevision(currentPage, lastRevision); + currentPage = null; + lastRevision = null; + } + + public void writeRevision(Revision revision) throws IOException { + if (lastRevision != null) + writeOldRevision(currentPage, lastRevision); + lastRevision = revision; + } + + private void writeOldRevision(Page page, Revision revision) throws IOException { + bufferInsertRow("old", new Object[][]{ + {"old_id", revision.Id}, + {"old_namespace", page.Title.Namespace}, + 
{"old_title", titleFormat(page.Title.Text)}, + {"old_text", revision.Text == null ? "" : revision.Text}, + {"old_comment", revision.Comment == null ? "" : revision.Comment}, + {"old_user", revision.Contributor.Username == null ? ZERO : revision.Contributor.Id}, + {"old_user_text", revision.Contributor.Username == null ? "" : revision.Contributor.Username}, + {"old_timestamp", timestampFormat(revision.Timestamp)}, + {"old_minor_edit", revision.Minor ? ONE : ZERO}, + {"old_flags", "utf-8"}, + {"inverse_timestamp", inverseTimestamp(revision.Timestamp)}}); + } + + private void writeCurRevision(Page page, Revision revision) throws IOException { + bufferInsertRow("cur", new Object[][]{ + {"cur_id", page.Id}, + {"cur_namespace", page.Title.Namespace}, + {"cur_title", titleFormat(page.Title.Text)}, + {"cur_text", revision.Text == null ? "" : revision.Text}, + {"cur_comment", revision.Comment == null ? "" : revision.Comment}, + {"cur_user", revision.Contributor.Username == null ? ZERO : Integer.valueOf(revision.Contributor.Id)}, + {"cur_user_text", revision.Contributor.Username == null ? "" : revision.Contributor.Username}, + {"cur_timestamp", timestampFormat(revision.Timestamp)}, + {"cur_restrictions", page.Restrictions}, + {"cur_counter", ZERO}, + {"cur_is_redirect", revision.isRedirect() ? ONE : ZERO}, + {"cur_minor_edit", revision.Minor ? 
ONE : ZERO}, + {"cur_random", traits.getRandom()}, + {"cur_touched", traits.getCurrentTime()}, + {"inverse_timestamp", inverseTimestamp(revision.Timestamp)}}); + checkpoint(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter15.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter15.java index 7cab23b2..9e797ba1 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter15.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter15.java @@ -30,101 +30,101 @@ import java.io.IOException; public class SqlWriter15 extends SqlWriter { - private Page currentPage; - private Revision lastRevision; - - public SqlWriter15(SqlWriter.Traits tr, SqlStream output) { - super(tr, output); - } - - public SqlWriter15(SqlWriter.Traits tr, SqlStream output, String prefix) { - super(tr, output, prefix); - } - - public void writeEndWiki() throws IOException { - flushInsertBuffers(); - super.writeEndWiki(); - } - - public void writeStartPage(Page page) { - currentPage = page; - lastRevision = null; - } - - public void writeEndPage() throws IOException { - if (lastRevision != null) { - updatePage(currentPage, lastRevision); - } - currentPage = null; - lastRevision = null; - } - - static final int DELETED_TEXT = 1; - static final int DELETED_COMMENT = 2; - static final int DELETED_USER = 4; - static final int DELETED_RESTRICTED = 8; - - public void writeRevision(Revision revision) throws IOException { - bufferInsertRow(traits.getTextTable(), new Object[][] { - {"old_id", revision.Id}, - {"old_text", revision.Text == null ? 
"" : revision.Text}, - {"old_flags", "utf-8"}}); - - int rev_deleted = 0; - if (revision.Contributor.Username==null) rev_deleted |= DELETED_USER; - if (revision.Comment==null) rev_deleted |= DELETED_COMMENT; - if (revision.Text==null) rev_deleted |= DELETED_TEXT; - - bufferInsertRow("revision", new Object[][] { - {"rev_id", revision.Id}, - {"rev_page", currentPage.Id}, - {"rev_text_id", revision.Id}, - {"rev_comment", revision.Comment == null ? "" : revision.Comment}, - {"rev_user", revision.Contributor.Username == null ? ZERO : Integer.valueOf(revision.Contributor.Id)}, - {"rev_user_text", revision.Contributor.Username == null ? "" : revision.Contributor.Username}, - {"rev_timestamp", timestampFormat(revision.Timestamp)}, - {"rev_minor_edit", revision.Minor ? ONE : ZERO}, - {"rev_deleted", rev_deleted==0 ? ZERO : Integer.valueOf(rev_deleted) }}); - - lastRevision = revision; - } - - private static int lengthUtf8(String s) { - final int slen = s.length(); - final char[] buf = Buffer.get(slen); - s.getChars(0, slen, buf, 0); - int len = 0; - for (int i = 0; i < slen; i++) { - char c = buf[i]; - if (c < 0x80) - len++; - else if (c < 0x800) - len+=2; - else if (c < 0xD800 || c >= 0xE000) - len+=3; - else { - // Surrogate pairs are assumed to be valid. - len+=4; - i++; - } - } - return len; - } - - private void updatePage(Page page, Revision revision) throws IOException { - bufferInsertRow("page", new Object[][] { - {"page_id", page.Id}, - {"page_namespace", page.Title.Namespace}, - {"page_title", titleFormat(page.Title.Text)}, - {"page_restrictions", page.Restrictions}, - {"page_counter", ZERO}, - {"page_is_redirect", revision.isRedirect() ? 
ONE : ZERO}, - {"page_is_new", ZERO}, - {"page_random", traits.getRandom()}, - {"page_touched", traits.getCurrentTime()}, - {"page_latest", revision.Id}, - {"page_len", lengthUtf8(revision.Text)}}); - checkpoint(); - } + private Page currentPage; + private Revision lastRevision; + + public SqlWriter15(SqlWriter.Traits tr, SqlStream output) { + super(tr, output); + } + + public SqlWriter15(SqlWriter.Traits tr, SqlStream output, String prefix) { + super(tr, output, prefix); + } + + public void writeEndWiki() throws IOException { + flushInsertBuffers(); + super.writeEndWiki(); + } + + public void writeStartPage(Page page) { + currentPage = page; + lastRevision = null; + } + + public void writeEndPage() throws IOException { + if (lastRevision != null) { + updatePage(currentPage, lastRevision); + } + currentPage = null; + lastRevision = null; + } + + static final int DELETED_TEXT = 1; + static final int DELETED_COMMENT = 2; + static final int DELETED_USER = 4; + static final int DELETED_RESTRICTED = 8; + + public void writeRevision(Revision revision) throws IOException { + bufferInsertRow(traits.getTextTable(), new Object[][]{ + {"old_id", revision.Id}, + {"old_text", revision.Text == null ? "" : revision.Text}, + {"old_flags", "utf-8"}}); + + int rev_deleted = 0; + if (revision.Contributor.Username == null) rev_deleted |= DELETED_USER; + if (revision.Comment == null) rev_deleted |= DELETED_COMMENT; + if (revision.Text == null) rev_deleted |= DELETED_TEXT; + + bufferInsertRow("revision", new Object[][]{ + {"rev_id", revision.Id}, + {"rev_page", currentPage.Id}, + {"rev_text_id", revision.Id}, + {"rev_comment", revision.Comment == null ? "" : revision.Comment}, + {"rev_user", revision.Contributor.Username == null ? ZERO : Integer.valueOf(revision.Contributor.Id)}, + {"rev_user_text", revision.Contributor.Username == null ? "" : revision.Contributor.Username}, + {"rev_timestamp", timestampFormat(revision.Timestamp)}, + {"rev_minor_edit", revision.Minor ? 
ONE : ZERO}, + {"rev_deleted", rev_deleted == 0 ? ZERO : Integer.valueOf(rev_deleted)}}); + + lastRevision = revision; + } + + private static int lengthUtf8(String s) { + final int slen = s.length(); + final char[] buf = Buffer.get(slen); + s.getChars(0, slen, buf, 0); + int len = 0; + for (int i = 0; i < slen; i++) { + char c = buf[i]; + if (c < 0x80) + len++; + else if (c < 0x800) + len += 2; + else if (c < 0xD800 || c >= 0xE000) + len += 3; + else { + // Surrogate pairs are assumed to be valid. + len += 4; + i++; + } + } + return len; + } + + private void updatePage(Page page, Revision revision) throws IOException { + bufferInsertRow("page", new Object[][]{ + {"page_id", page.Id}, + {"page_namespace", page.Title.Namespace}, + {"page_title", titleFormat(page.Title.Text)}, + {"page_restrictions", page.Restrictions}, + {"page_counter", ZERO}, + {"page_is_redirect", revision.isRedirect() ? ONE : ZERO}, + {"page_is_new", ZERO}, + {"page_random", traits.getRandom()}, + {"page_touched", traits.getCurrentTime()}, + {"page_latest", revision.Id}, + {"page_len", lengthUtf8(revision.Text)}}); + checkpoint(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TimeStampFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TimeStampFilter.java index e7947927..c080734f 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TimeStampFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TimeStampFilter.java @@ -30,49 +30,49 @@ import java.util.Calendar; public class TimeStampFilter implements DumpWriter { - final DumpWriter sink; - protected final Calendar filterTimeStamp; - protected Page currentPage; - protected boolean pageWritten; + final DumpWriter sink; + protected final Calendar filterTimeStamp; + protected Page currentPage; + protected boolean pageWritten; - public TimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException { - this.sink = 
sink; - filterTimeStamp = Calendar.getInstance(); - filterTimeStamp.setTime(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").parse(timeStamp)); - } + public TimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException { + this.sink = sink; + filterTimeStamp = Calendar.getInstance(); + filterTimeStamp.setTime(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").parse(timeStamp)); + } - public void close() throws IOException { - sink.close(); - } + public void close() throws IOException { + sink.close(); + } - public void writeStartWiki() throws IOException { - sink.writeStartWiki(); - } + public void writeStartWiki() throws IOException { + sink.writeStartWiki(); + } - public void writeEndWiki() throws IOException { - sink.writeEndWiki(); - } + public void writeEndWiki() throws IOException { + sink.writeEndWiki(); + } - public void writeSiteinfo(Siteinfo info) throws IOException { - sink.writeSiteinfo(info); - } + public void writeSiteinfo(Siteinfo info) throws IOException { + sink.writeSiteinfo(info); + } - public void writeStartPage(Page page) throws IOException { - currentPage = page; - pageWritten = false; - } + public void writeStartPage(Page page) throws IOException { + currentPage = page; + pageWritten = false; + } - public void writeEndPage() throws IOException { - if (pageWritten) { - sink.writeEndPage(); - } - } + public void writeEndPage() throws IOException { + if (pageWritten) { + sink.writeEndPage(); + } + } - public void writeRevision(Revision revision) throws IOException { - if (!pageWritten) { - sink.writeStartPage(currentPage); - pageWritten = true; - } - sink.writeRevision(revision); - } + public void writeRevision(Revision revision) throws IOException { + if (!pageWritten) { + sink.writeStartPage(currentPage); + pageWritten = true; + } + sink.writeRevision(revision); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Title.java 
b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Title.java index f4b7faa0..9caf0c6d 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Title.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Title.java @@ -26,80 +26,80 @@ package org.dkpro.jwpl.mwdumper.importer; public class Title { - public final Integer Namespace; - public final String Text; - - private final NamespaceSet namespaces; - - public Title(Integer namespaceKey, String text, NamespaceSet namespaces) { - this.namespaces = namespaces; - Namespace = namespaceKey; - Text = text; - } - - public Title(String prefixedTitle, NamespaceSet namespaces) { - this.namespaces = namespaces; - int colon = prefixedTitle.indexOf(':'); - if (colon > 0) { - String prefix = prefixedTitle.substring(0, colon); - if (namespaces.hasPrefix(prefix)) { - Namespace = namespaces.getIndex(prefix); - Text = prefixedTitle.substring(colon + 1); - return; - } - } - Namespace = 0; - Text = prefixedTitle; - } - - public static String ValidateTitleChars(String text) { - // FIXME - return text; - } - - public String toString() { - String prefix = namespaces.getPrefix(Namespace); - if (Namespace == 0) - return prefix.concat(Text); - return prefix + ':' + Text; - } - - public boolean isSpecial() { - return Namespace < 0; - } - - public boolean isTalk() { - return !isSpecial() && (Namespace.intValue() % 2 == 1); - } - - public Title talkPage() { - if (isTalk()) - return this; - else if (isSpecial()) - return null; - else - return new Title(Namespace + 1, Text, namespaces); - } - - public Title subjectPage() { - if (isTalk()) - return new Title(Namespace - 1, Text, namespaces); - else - return this; - } - - public int hashCode() { - return Namespace.hashCode() ^ Text.hashCode(); - } - - public boolean equals(Object other) { - if (other == this) - return true; - if (other instanceof Title) { - Title ot = (Title)other; - return Namespace.equals(ot.Namespace) && - 
Text.equals(ot.Text); - } - return false; - } + public final Integer Namespace; + public final String Text; + + private final NamespaceSet namespaces; + + public Title(Integer namespaceKey, String text, NamespaceSet namespaces) { + this.namespaces = namespaces; + Namespace = namespaceKey; + Text = text; + } + + public Title(String prefixedTitle, NamespaceSet namespaces) { + this.namespaces = namespaces; + int colon = prefixedTitle.indexOf(':'); + if (colon > 0) { + String prefix = prefixedTitle.substring(0, colon); + if (namespaces.hasPrefix(prefix)) { + Namespace = namespaces.getIndex(prefix); + Text = prefixedTitle.substring(colon + 1); + return; + } + } + Namespace = 0; + Text = prefixedTitle; + } + + public static String ValidateTitleChars(String text) { + // FIXME + return text; + } + + public String toString() { + String prefix = namespaces.getPrefix(Namespace); + if (Namespace == 0) + return prefix.concat(Text); + return prefix + ':' + Text; + } + + public boolean isSpecial() { + return Namespace < 0; + } + + public boolean isTalk() { + return !isSpecial() && (Namespace.intValue() % 2 == 1); + } + + public Title talkPage() { + if (isTalk()) + return this; + else if (isSpecial()) + return null; + else + return new Title(Namespace + 1, Text, namespaces); + } + + public Title subjectPage() { + if (isTalk()) + return new Title(Namespace - 1, Text, namespaces); + else + return this; + } + + public int hashCode() { + return Namespace.hashCode() ^ Text.hashCode(); + } + + public boolean equals(Object other) { + if (other == this) + return true; + if (other instanceof Title) { + Title ot = (Title) other; + return Namespace.equals(ot.Namespace) && + Text.equals(ot.Text); + } + return false; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TitleMatchFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TitleMatchFilter.java index 07e1f3b8..65f24b02 100644 --- 
a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TitleMatchFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TitleMatchFilter.java @@ -28,14 +28,14 @@ import java.util.regex.Pattern; public class TitleMatchFilter extends PageFilter { - final Pattern regex; - - public TitleMatchFilter(DumpWriter sink, String regexString) { - super(sink); - regex = Pattern.compile(regexString); - } - - protected boolean pass(Page page) { - return regex.matcher(page.Title.toString()).matches(); - } + final Pattern regex; + + public TitleMatchFilter(DumpWriter sink, String regexString) { + super(sink); + regex = Pattern.compile(regexString); + } + + protected boolean pass(Page page) { + return regex.matcher(page.Title.toString()).matches(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpReader.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpReader.java index 638da14a..62dfa8bf 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpReader.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpReader.java @@ -42,364 +42,368 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -public class XmlDumpReader extends DefaultHandler { - InputStream input; - DumpWriter writer; - - private char[] buffer; - private int len; - private boolean hasContent; - private boolean deleted = false; - - Siteinfo siteinfo; - Page page; - boolean pageSent; - Contributor contrib; - Revision rev; - int nskey; - - boolean abortFlag; - - /** - * Initialize a processor for a MediaWiki XML dump stream. - * Events are sent to a single DumpWriter output sink, but you - * can chain multiple output processors with a MultiWriter. - * @param inputStream Stream to read XML from. - * @param writer Output sink to send processed events to. 
- */ - public XmlDumpReader(InputStream inputStream, DumpWriter writer) { - input = inputStream; - this.writer = writer; - buffer = new char[4096]; - len = 0; - hasContent = false; - } - - /** - * Reads through the entire XML dump on the input stream, sending - * events to the DumpWriter as it goes. May throw exceptions on - * invalid input or due to problems with the output. - * @throws IOException - */ - public void readDump() throws IOException { - try { - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, false); - SAXParser parser = factory.newSAXParser(); - - parser.parse(input, this); - } catch (ParserConfigurationException | SAXException e) { - throw (IOException)new IOException(e.getMessage()).initCause(e); - } +public class XmlDumpReader extends DefaultHandler { + InputStream input; + DumpWriter writer; + + private char[] buffer; + private int len; + private boolean hasContent; + private boolean deleted = false; + + Siteinfo siteinfo; + Page page; + boolean pageSent; + Contributor contrib; + Revision rev; + int nskey; + + boolean abortFlag; + + /** + * Initialize a processor for a MediaWiki XML dump stream. + * Events are sent to a single DumpWriter output sink, but you + * can chain multiple output processors with a MultiWriter. + * + * @param inputStream Stream to read XML from. + * @param writer Output sink to send processed events to. + */ + public XmlDumpReader(InputStream inputStream, DumpWriter writer) { + input = inputStream; + this.writer = writer; + buffer = new char[4096]; + len = 0; + hasContent = false; + } + + /** + * Reads through the entire XML dump on the input stream, sending + * events to the DumpWriter as it goes. May throw exceptions on + * invalid input or due to problems with the output. 
+ * + * @throws IOException + */ + public void readDump() throws IOException { + try { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, false); + SAXParser parser = factory.newSAXParser(); + + parser.parse(input, this); + } catch (ParserConfigurationException | SAXException e) { + throw (IOException) new IOException(e.getMessage()).initCause(e); + } writer.close(); - } - - /** - * Request that the dump processing be aborted. - * At the next element, an exception will be thrown to stop the XML parser. - * FIXME Is setting a bool thread-safe? It should be atomic... - */ - public void abort() { - abortFlag = true; - } - - // -------------------------- - // SAX handler interface methods: - - private static final Map<String, String> startElements = new HashMap<>(64); - private static final Map<String, String> endElements = new HashMap<>(64); - static { - startElements.put("revision","revision"); - startElements.put("contributor","contributor"); - startElements.put("page","page"); - startElements.put("mediawiki", "mediawiki"); - startElements.put("siteinfo","siteinfo"); - startElements.put("namespaces","namespaces"); - startElements.put("namespace","namespace"); - - endElements.put("ThreadSubject","ThreadSubject"); - endElements.put("ThreadParent","ThreadParent"); - endElements.put("ThreadAncestor","ThreadAncestor"); - endElements.put("ThreadPage","ThreadPage"); - endElements.put("ThreadID","ThreadID"); - endElements.put("ThreadSummaryPage","ThreadSummaryPage"); - endElements.put("ThreadAuthor","ThreadAuthor"); - endElements.put("ThreadEditStatus","ThreadEditStatus"); - endElements.put("ThreadType","ThreadType"); - endElements.put("base","base"); - endElements.put("case","case"); - endElements.put("comment","comment"); - endElements.put("contributor","contributor"); - endElements.put("generator","generator"); - endElements.put("id","id"); - endElements.put("ip","ip"); - endElements.put("mediawiki", 
"mediawiki"); - endElements.put("minor","minor"); - endElements.put("namespaces","namespaces"); - endElements.put("namespace","namespace"); - endElements.put("page","page"); - endElements.put("restrictions","restrictions"); - endElements.put("revision","revision"); - endElements.put("siteinfo","siteinfo"); - endElements.put("sitename","sitename"); - endElements.put("text","text"); - endElements.put("timestamp","timestamp"); - endElements.put("title","title"); - endElements.put("username","username"); - } - - public void startElement(String uri, String localname, String qName, Attributes attributes) throws SAXException { - // Clear the buffer for character data; we'll initialize it - // if and when character data arrives -- at that point we - // have a length. - len = 0; - hasContent = false; - - if (abortFlag) - throw new SAXException("XmlDumpReader set abort flag."); - - // check for deleted="deleted", and set deleted flag for the current element. - String d = attributes.getValue("deleted"); - deleted = (d!=null && d.equals("deleted")); - - try { - qName = startElements.get(qName); - if (qName == null) - return; - // frequent tags: - if (qName == "revision") openRevision(); - else if (qName == "contributor") openContributor(); - else if (qName == "page") openPage(); - // rare tags: - else if (qName == "mediawiki") openMediaWiki(); - else if (qName == "siteinfo") openSiteinfo(); - else if (qName == "namespaces") openNamespaces(); - else if (qName == "namespace") openNamespace(attributes); - } catch (IOException e) { - throw new SAXException(e); - } - } - - public void characters(char[] ch, int start, int length) { - if (buffer.length < len + length) { - int maxlen = buffer.length * 2; - if (maxlen < len + length) - maxlen = len + length; - char[] tmp = new char[maxlen]; - System.arraycopy(buffer, 0, tmp, 0, len); - buffer = tmp; - } - System.arraycopy(ch, start, buffer, len, length); - len += length; - hasContent = true; - } - - public void endElement(String uri, 
String localname, String qName) throws SAXException { - try { - qName = endElements.get(qName); - if (qName == null) - return; - // frequent tags: - if (qName == "id") readId(); - else if (qName == "revision") closeRevision(); - else if (qName == "timestamp") readTimestamp(); - else if (qName == "text") readText(); - else if (qName == "contributor") closeContributor(); - else if (qName == "username") readUsername(); - else if (qName == "ip") readIp(); - else if (qName == "comment") readComment(); - else if (qName == "minor") readMinor(); - else if (qName == "page") closePage(); - else if (qName == "title") readTitle(); - else if (qName == "restrictions") readRestrictions(); - // rare tags: - else if (qName.startsWith("Thread")) threadAttribute(qName); - else if (qName == "mediawiki") closeMediaWiki(); - else if (qName == "siteinfo") closeSiteinfo(); - else if (qName == "sitename") readSitename(); - else if (qName == "base") readBase(); - else if (qName == "generator") readGenerator(); - else if (qName == "case") readCase(); - else if (qName == "namespaces") closeNamespaces(); - else if (qName == "namespace") closeNamespace(); + } + + /** + * Request that the dump processing be aborted. + * At the next element, an exception will be thrown to stop the XML parser. + * FIXME Is setting a bool thread-safe? It should be atomic... 
+ */ + public void abort() { + abortFlag = true; + } + + // -------------------------- + // SAX handler interface methods: + + private static final Map<String, String> startElements = new HashMap<>(64); + private static final Map<String, String> endElements = new HashMap<>(64); + + static { + startElements.put("revision", "revision"); + startElements.put("contributor", "contributor"); + startElements.put("page", "page"); + startElements.put("mediawiki", "mediawiki"); + startElements.put("siteinfo", "siteinfo"); + startElements.put("namespaces", "namespaces"); + startElements.put("namespace", "namespace"); + + endElements.put("ThreadSubject", "ThreadSubject"); + endElements.put("ThreadParent", "ThreadParent"); + endElements.put("ThreadAncestor", "ThreadAncestor"); + endElements.put("ThreadPage", "ThreadPage"); + endElements.put("ThreadID", "ThreadID"); + endElements.put("ThreadSummaryPage", "ThreadSummaryPage"); + endElements.put("ThreadAuthor", "ThreadAuthor"); + endElements.put("ThreadEditStatus", "ThreadEditStatus"); + endElements.put("ThreadType", "ThreadType"); + endElements.put("base", "base"); + endElements.put("case", "case"); + endElements.put("comment", "comment"); + endElements.put("contributor", "contributor"); + endElements.put("generator", "generator"); + endElements.put("id", "id"); + endElements.put("ip", "ip"); + endElements.put("mediawiki", "mediawiki"); + endElements.put("minor", "minor"); + endElements.put("namespaces", "namespaces"); + endElements.put("namespace", "namespace"); + endElements.put("page", "page"); + endElements.put("restrictions", "restrictions"); + endElements.put("revision", "revision"); + endElements.put("siteinfo", "siteinfo"); + endElements.put("sitename", "sitename"); + endElements.put("text", "text"); + endElements.put("timestamp", "timestamp"); + endElements.put("title", "title"); + endElements.put("username", "username"); + } + + public void startElement(String uri, String localname, String qName, Attributes attributes) 
throws SAXException { + // Clear the buffer for character data; we'll initialize it + // if and when character data arrives -- at that point we + // have a length. + len = 0; + hasContent = false; + + if (abortFlag) + throw new SAXException("XmlDumpReader set abort flag."); + + // check for deleted="deleted", and set deleted flag for the current element. + String d = attributes.getValue("deleted"); + deleted = (d != null && d.equals("deleted")); + + try { + qName = startElements.get(qName); + if (qName == null) + return; + // frequent tags: + if (qName == "revision") openRevision(); + else if (qName == "contributor") openContributor(); + else if (qName == "page") openPage(); + // rare tags: + else if (qName == "mediawiki") openMediaWiki(); + else if (qName == "siteinfo") openSiteinfo(); + else if (qName == "namespaces") openNamespaces(); + else if (qName == "namespace") openNamespace(attributes); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void characters(char[] ch, int start, int length) { + if (buffer.length < len + length) { + int maxlen = buffer.length * 2; + if (maxlen < len + length) + maxlen = len + length; + char[] tmp = new char[maxlen]; + System.arraycopy(buffer, 0, tmp, 0, len); + buffer = tmp; + } + System.arraycopy(ch, start, buffer, len, length); + len += length; + hasContent = true; + } + + public void endElement(String uri, String localname, String qName) throws SAXException { + try { + qName = endElements.get(qName); + if (qName == null) + return; + // frequent tags: + if (qName == "id") readId(); + else if (qName == "revision") closeRevision(); + else if (qName == "timestamp") readTimestamp(); + else if (qName == "text") readText(); + else if (qName == "contributor") closeContributor(); + else if (qName == "username") readUsername(); + else if (qName == "ip") readIp(); + else if (qName == "comment") readComment(); + else if (qName == "minor") readMinor(); + else if (qName == "page") closePage(); + else if (qName == 
"title") readTitle(); + else if (qName == "restrictions") readRestrictions(); + // rare tags: + else if (qName.startsWith("Thread")) threadAttribute(qName); + else if (qName == "mediawiki") closeMediaWiki(); + else if (qName == "siteinfo") closeSiteinfo(); + else if (qName == "sitename") readSitename(); + else if (qName == "base") readBase(); + else if (qName == "generator") readGenerator(); + else if (qName == "case") readCase(); + else if (qName == "namespaces") closeNamespaces(); + else if (qName == "namespace") closeNamespace(); // else throw(SAXException)new SAXException("Unrecognised "+qName+"(substring "+qName.length()+qName.substring(0,6)+")"); - } catch (IOException e) { - throw (SAXException)new SAXException(e.getMessage()).initCause(e); - } - } - - // ---------- - - void threadAttribute(String attrib) throws IOException { - if(attrib.equals("ThreadPage")) // parse title - page.DiscussionThreadingInfo.put(attrib, new Title(bufferContents(), siteinfo.Namespaces)); - else - page.DiscussionThreadingInfo.put(attrib, bufferContents()); - } - - void openMediaWiki() throws IOException { - siteinfo = null; - writer.writeStartWiki(); - } - - void closeMediaWiki() throws IOException { - writer.writeEndWiki(); - siteinfo = null; - } - - // ------------------ - - void openSiteinfo() { - siteinfo = new Siteinfo(); - } - - void closeSiteinfo() throws IOException { - writer.writeSiteinfo(siteinfo); - } - - private String bufferContentsOrNull() { - if (!hasContent) return null; - else return bufferContents(); - } - - private String bufferContents() { - return len == 0 ? 
"" : new String(buffer, 0, len); - } - - void readSitename() { - siteinfo.Sitename = bufferContents(); - } - - void readBase() { - siteinfo.Base = bufferContents(); - } - - void readGenerator() { - siteinfo.Generator = bufferContents(); - } - - void readCase() { - siteinfo.Case = bufferContents(); - } - - void openNamespaces() { - siteinfo.Namespaces = new NamespaceSet(); - } - - void openNamespace(Attributes attribs) { - nskey = Integer.parseInt(attribs.getValue("key")); - } - - void closeNamespace() { - siteinfo.Namespaces.add(nskey, bufferContents()); - } - - void closeNamespaces() { - // NOP - } - - // ----------- - - void openPage() { - page = new Page(); - pageSent = false; - } - - void closePage() throws IOException { - if (pageSent) - writer.writeEndPage(); - page = null; - } - - void readTitle() { - page.Title = new Title(bufferContents(), siteinfo.Namespaces); - } - - void readId() { - int id = Integer.parseInt(bufferContents()); - if (contrib != null) - contrib.Id = id; - else if (rev != null) - rev.Id = id; - else if (page != null) - page.Id = id; - else - throw new IllegalArgumentException("Unexpected <id> outside a <page>, <revision>, or <contributor>"); - } - - void readRestrictions() { - page.Restrictions = bufferContents(); - } - - // ------ - - void openRevision() throws IOException { - if (!pageSent) { - writer.writeStartPage(page); - pageSent = true; - } - - rev = new Revision(); - } - - void closeRevision() throws IOException { - writer.writeRevision(rev); - rev = null; - } - - void readTimestamp() { - rev.Timestamp = parseUTCTimestamp(bufferContents()); - } - - void readComment() { - rev.Comment = bufferContentsOrNull(); - if (rev.Comment==null && !deleted) rev.Comment = ""; //NOTE: null means deleted/supressed - } - - void readMinor() { - rev.Minor = true; - } - - void readText() { - rev.Text = bufferContentsOrNull(); - if (rev.Text==null && !deleted) rev.Text = ""; //NOTE: null means deleted/supressed - } - - // ----------- - void 
openContributor() { - //XXX: record deleted flag?! as it is, any empty <contributor> tag counts as "deleted" - contrib = new Contributor(); - } - - void closeContributor() { - //NOTE: if the contributor was supressed, nither username nor id have been set in the Contributor object - rev.Contributor = contrib; - contrib = null; - } - - - void readUsername() { - contrib.Username = bufferContentsOrNull(); - } - - void readIp() { - contrib.Username = bufferContents(); - contrib.isIP = true; - } - - private static final TimeZone utc = TimeZone.getTimeZone("UTC"); - private static Calendar parseUTCTimestamp(String text) { - // 2003-10-26T04:50:47Z - // We're doing this manually for now, though DateFormatter might work... - String trimmed = text.trim(); - GregorianCalendar ts = new GregorianCalendar(utc); - ts.set( - Integer.parseInt(trimmed.substring(0,0+4)), // year - Integer.parseInt(trimmed.substring(5,5+2)) - 1, // month is 0-based! - Integer.parseInt(trimmed.substring(8,8+2)), // day - Integer.parseInt(trimmed.substring(11,11+2)), // hour - Integer.parseInt(trimmed.substring(14,14+2)), // minute - Integer.parseInt(trimmed.substring(17,17+2))); // second - return ts; - } + } catch (IOException e) { + throw (SAXException) new SAXException(e.getMessage()).initCause(e); + } + } + + // ---------- + + void threadAttribute(String attrib) throws IOException { + if (attrib.equals("ThreadPage")) // parse title + page.DiscussionThreadingInfo.put(attrib, new Title(bufferContents(), siteinfo.Namespaces)); + else + page.DiscussionThreadingInfo.put(attrib, bufferContents()); + } + + void openMediaWiki() throws IOException { + siteinfo = null; + writer.writeStartWiki(); + } + + void closeMediaWiki() throws IOException { + writer.writeEndWiki(); + siteinfo = null; + } + + // ------------------ + + void openSiteinfo() { + siteinfo = new Siteinfo(); + } + + void closeSiteinfo() throws IOException { + writer.writeSiteinfo(siteinfo); + } + + private String bufferContentsOrNull() { + if 
(!hasContent) return null; + else return bufferContents(); + } + + private String bufferContents() { + return len == 0 ? "" : new String(buffer, 0, len); + } + + void readSitename() { + siteinfo.Sitename = bufferContents(); + } + + void readBase() { + siteinfo.Base = bufferContents(); + } + + void readGenerator() { + siteinfo.Generator = bufferContents(); + } + + void readCase() { + siteinfo.Case = bufferContents(); + } + + void openNamespaces() { + siteinfo.Namespaces = new NamespaceSet(); + } + + void openNamespace(Attributes attribs) { + nskey = Integer.parseInt(attribs.getValue("key")); + } + + void closeNamespace() { + siteinfo.Namespaces.add(nskey, bufferContents()); + } + + void closeNamespaces() { + // NOP + } + + // ----------- + + void openPage() { + page = new Page(); + pageSent = false; + } + + void closePage() throws IOException { + if (pageSent) + writer.writeEndPage(); + page = null; + } + + void readTitle() { + page.Title = new Title(bufferContents(), siteinfo.Namespaces); + } + + void readId() { + int id = Integer.parseInt(bufferContents()); + if (contrib != null) + contrib.Id = id; + else if (rev != null) + rev.Id = id; + else if (page != null) + page.Id = id; + else + throw new IllegalArgumentException("Unexpected <id> outside a <page>, <revision>, or <contributor>"); + } + + void readRestrictions() { + page.Restrictions = bufferContents(); + } + + // ------ + + void openRevision() throws IOException { + if (!pageSent) { + writer.writeStartPage(page); + pageSent = true; + } + + rev = new Revision(); + } + + void closeRevision() throws IOException { + writer.writeRevision(rev); + rev = null; + } + + void readTimestamp() { + rev.Timestamp = parseUTCTimestamp(bufferContents()); + } + + void readComment() { + rev.Comment = bufferContentsOrNull(); + if (rev.Comment == null && !deleted) rev.Comment = ""; //NOTE: null means deleted/suppressed + } + + void readMinor() { + rev.Minor = true; + } + + void readText() { + rev.Text = bufferContentsOrNull(); +
if (rev.Text == null && !deleted) rev.Text = ""; //NOTE: null means deleted/suppressed + } + + // ----------- + void openContributor() { + //XXX: record deleted flag?! as it is, any empty <contributor> tag counts as "deleted" + contrib = new Contributor(); + } + + void closeContributor() { + //NOTE: if the contributor was suppressed, neither username nor id have been set in the Contributor object + rev.Contributor = contrib; + contrib = null; + } + + + void readUsername() { + contrib.Username = bufferContentsOrNull(); + } + + void readIp() { + contrib.Username = bufferContents(); + contrib.isIP = true; + } + + private static final TimeZone utc = TimeZone.getTimeZone("UTC"); + + private static Calendar parseUTCTimestamp(String text) { + // 2003-10-26T04:50:47Z + // We're doing this manually for now, though DateFormatter might work... + String trimmed = text.trim(); + GregorianCalendar ts = new GregorianCalendar(utc); + ts.set( + Integer.parseInt(trimmed.substring(0, 0 + 4)), // year + Integer.parseInt(trimmed.substring(5, 5 + 2)) - 1, // month is 0-based!
+ Integer.parseInt(trimmed.substring(8, 8 + 2)), // day + Integer.parseInt(trimmed.substring(11, 11 + 2)), // hour + Integer.parseInt(trimmed.substring(14, 14 + 2)), // minute + Integer.parseInt(trimmed.substring(17, 17 + 2))); // second + return ts; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpWriter.java index ccf2fb45..af1d7e74 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpWriter.java @@ -35,125 +35,124 @@ import java.util.TimeZone; public class XmlDumpWriter implements DumpWriter { - protected OutputStream stream; - protected XmlWriter writer; - - protected static final String version = "0.3"; - protected static final String ns = "http://www.mediawiki.org/xml/export-" + version + "/"; - protected static final String schema = "http://www.mediawiki.org/xml/export-" + version + ".xsd"; - protected static final DateFormat dateFormat = new SimpleDateFormat("yyyy'-'MM'-'dd'T'HH':'mm':'ss'Z'"); - static { - dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); - } - - public XmlDumpWriter(OutputStream output) { - stream = output; - writer = new XmlWriter(stream); - } - - public void close() throws IOException { - writer.close(); - } - - public void writeStartWiki() throws IOException { - writer.openXml(); - writer.openElement("mediawiki", new String[][] { - {"xmlns", ns}, - {"xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"}, - {"xsi:schemaLocation", ns + " " + schema}, - {"version", version}, - {"xml:lang", "en"}}); - // TODO: store and keep the xml:lang - } - - public void writeEndWiki() throws IOException { - writer.closeElement(); - writer.closeXml(); - } - - public void writeSiteinfo(Siteinfo info) throws IOException { - XmlWriter writer = this.writer; - writer.openElement("siteinfo"); - 
writer.textElement("sitename", info.Sitename); - writer.textElement("base", info.Base); - writer.textElement("generator", info.Generator); - writer.textElement("case", info.Case); - - writer.openElement("namespaces"); - for (Iterator<Map.Entry<Integer, String>> i = info.Namespaces.orderedEntries(); i.hasNext();) { - Map.Entry<Integer, String> e = i.next(); - writer.textElement("namespace", e.getValue(), new String[][] { - {"key", e.getKey().toString()}}); - } - writer.closeElement(); - - writer.closeElement(); - } - - public void writeStartPage(Page page) throws IOException { - XmlWriter writer = this.writer; - writer.openElement("page"); - writer.textElement("title", page.Title.toString()); - if (page.Id != 0) - writer.textElement("id", Integer.toString(page.Id)); - if (page.Restrictions != null && page.Restrictions.length() != 0) - writer.textElement("restrictions", page.Restrictions); - } - - public void writeEndPage() throws IOException { - writer.closeElement(); - } - - public void writeRevision(Revision rev) throws IOException { - XmlWriter writer = this.writer; - writer.openElement("revision"); - if (rev.Id != 0) - writer.textElement("id", Integer.toString(rev.Id)); - - writer.textElement("timestamp", formatTimestamp(rev.Timestamp)); - - writeContributor(rev.Contributor); - - if (rev.Minor) { - writer.emptyElement("minor"); - } - - if (rev.Comment == null) { - writer.emptyElement("comment", deletedAttrib); - } - else if (rev.Comment.length() != 0) { - writer.textElement("comment", rev.Comment); - } - - writer.textElement("text", rev.Text, - rev.Text==null ? 
new String[][] {{"xml:space", "preserve"}, {"deleted", "deleted"}} - : new String[][] {{"xml:space", "preserve"}} - ); - - writer.closeElement(); - } - - static final String[][] deletedAttrib = new String[][] { {"deleted", "deleted"} }; - - static String formatTimestamp(Calendar ts) { - return dateFormat.format(ts.getTime()); - } - - void writeContributor(Contributor contrib) throws IOException { - XmlWriter writer = this.writer; - - if (contrib.Username==null) { - writer.emptyElement("contributor", deletedAttrib); - } - else { - writer.openElement("contributor"); - if (contrib.isIP) { - writer.textElement("ip", contrib.Username); - } else { - writer.textElement("username", contrib.Username); - writer.textElement("id", Integer.toString(contrib.Id)); - } - writer.closeElement(); - } - } + protected OutputStream stream; + protected XmlWriter writer; + + protected static final String version = "0.3"; + protected static final String ns = "http://www.mediawiki.org/xml/export-" + version + "/"; + protected static final String schema = "http://www.mediawiki.org/xml/export-" + version + ".xsd"; + protected static final DateFormat dateFormat = new SimpleDateFormat("yyyy'-'MM'-'dd'T'HH':'mm':'ss'Z'"); + + static { + dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + } + + public XmlDumpWriter(OutputStream output) { + stream = output; + writer = new XmlWriter(stream); + } + + public void close() throws IOException { + writer.close(); + } + + public void writeStartWiki() throws IOException { + writer.openXml(); + writer.openElement("mediawiki", new String[][]{ + {"xmlns", ns}, + {"xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"}, + {"xsi:schemaLocation", ns + " " + schema}, + {"version", version}, + {"xml:lang", "en"}}); + // TODO: store and keep the xml:lang + } + + public void writeEndWiki() throws IOException { + writer.closeElement(); + writer.closeXml(); + } + + public void writeSiteinfo(Siteinfo info) throws IOException { + XmlWriter writer = this.writer; + 
writer.openElement("siteinfo"); + writer.textElement("sitename", info.Sitename); + writer.textElement("base", info.Base); + writer.textElement("generator", info.Generator); + writer.textElement("case", info.Case); + + writer.openElement("namespaces"); + for (Iterator<Map.Entry<Integer, String>> i = info.Namespaces.orderedEntries(); i.hasNext(); ) { + Map.Entry<Integer, String> e = i.next(); + writer.textElement("namespace", e.getValue(), new String[][]{ + {"key", e.getKey().toString()}}); + } + writer.closeElement(); + + writer.closeElement(); + } + + public void writeStartPage(Page page) throws IOException { + XmlWriter writer = this.writer; + writer.openElement("page"); + writer.textElement("title", page.Title.toString()); + if (page.Id != 0) + writer.textElement("id", Integer.toString(page.Id)); + if (page.Restrictions != null && page.Restrictions.length() != 0) + writer.textElement("restrictions", page.Restrictions); + } + + public void writeEndPage() throws IOException { + writer.closeElement(); + } + + public void writeRevision(Revision rev) throws IOException { + XmlWriter writer = this.writer; + writer.openElement("revision"); + if (rev.Id != 0) + writer.textElement("id", Integer.toString(rev.Id)); + + writer.textElement("timestamp", formatTimestamp(rev.Timestamp)); + + writeContributor(rev.Contributor); + + if (rev.Minor) { + writer.emptyElement("minor"); + } + + if (rev.Comment == null) { + writer.emptyElement("comment", deletedAttrib); + } else if (rev.Comment.length() != 0) { + writer.textElement("comment", rev.Comment); + } + + writer.textElement("text", rev.Text, + rev.Text == null ? 
new String[][]{{"xml:space", "preserve"}, {"deleted", "deleted"}} + : new String[][]{{"xml:space", "preserve"}} + ); + + writer.closeElement(); + } + + static final String[][] deletedAttrib = new String[][]{{"deleted", "deleted"}}; + + static String formatTimestamp(Calendar ts) { + return dateFormat.format(ts.getTime()); + } + + void writeContributor(Contributor contrib) throws IOException { + XmlWriter writer = this.writer; + + if (contrib.Username == null) { + writer.emptyElement("contributor", deletedAttrib); + } else { + writer.openElement("contributor"); + if (contrib.isIP) { + writer.textElement("ip", contrib.Username); + } else { + writer.textElement("username", contrib.Username); + writer.textElement("id", Integer.toString(contrib.Id)); + } + writer.closeElement(); + } + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlWriter.java index 5b5aa667..765c6c93 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -31,178 +31,178 @@ * so prefixes and xmlns attributes must be managed manually. */ public class XmlWriter { - private final String encoding; - private final List<String> stack; - private final BufferedWriter writer; - - public XmlWriter(OutputStream stream) { - encoding = "utf-8"; - stack = new ArrayList<>(); - writer = new BufferedWriter(new OutputStreamWriter(stream, StandardCharsets.UTF_8)); - } - - /** - * @throws IOException Thrown if IO errors occurred. - */ - public void close() throws IOException { - writer.flush(); - writer.close(); - } - - /** - * Write the <?xml?> header. - * - * @throws IOException Thrown if IO errors occurred. - */ - public void openXml() throws IOException { - writeRaw("<?xml version=\"1.0\" encoding=\"" + encoding + "\" ?>\n"); - } - - /** - * In theory, we might close out open elements or such. - */ - public void closeXml() { - } - - - /** - * Write an empty element, such as <el/>, on a standalone line. - * Takes an optional dictionary of attributes. - * - * @throws IOException Thrown if IO errors occurred. - */ - public void emptyElement(String element) throws IOException { - emptyElement(element, null); - } - - public void emptyElement(String element, String[][] attributes) throws IOException { - startElement(element, attributes, "/>\n"); - deIndent(); - } - - /** - * Write an element open tag, such as <el/>, on a standalone line. - * Takes an optional dictionary of attributes. - * - * @throws IOException Thrown if IO errors occurred. - */ - public void openElement(String element) throws IOException { - openElement(element, null); - } - - public void openElement(String element, String[][] attributes) throws IOException { - startElement(element, attributes, ">\n"); - } - - /** - * Write an element close tag, such as <el/>, on a standalone line. - * If indent=False is passed, indentation will not be added. - * - * @throws IOException Thrown if IO errors occurred. 
- */ - public void closeElement() throws IOException { - closeElement(true); - } - - public void closeElement(boolean indent) throws IOException { - String[] bits = deIndent(); - String element = bits[0]; - String space = bits[1]; - if (indent) - writeRaw(space + "</" + element + ">\n"); - else - writeRaw("</" + element + ">\n"); - } - - /** - * Write an element with a text node included, such as <el/>foo<el/>, - * on a standalone line. If the text is empty, an empty element will - * be output as <el/>. Takes an optional list of tuples with attribute - * names and values. - * - * @throws IOException Thrown if IO errors occurred. - */ - public void textElement(String element, String text) throws IOException { - textElement(element, text, null); - } - - public void textElement(String element, String text, String[][] attributes) throws IOException { - if (text==null || text.length() == 0) { - emptyElement(element, attributes); - } else { - startElement(element, attributes, ">"); - writeEscaped(text); - closeElement(false); - } - } - - void startElement(String element, String[][] attributes, String terminator) throws IOException { - writeRaw(indent(element)); - writeRaw('<'); - writeRaw(element); - if (attributes != null) { - for (int i = 0; i < attributes.length; i++) { - writeRaw(' '); - writeRaw(attributes[i][0]); - writeRaw("=\""); - writeEscaped(attributes[i][1]); - writeRaw('"'); - } - } - writeRaw(terminator); - } - - /** - * Send an encoded Unicode string to the output stream. - * - * @throws IOException Thrown if IO errors occurred. 
- * */ - void writeRaw(String data) throws IOException { - writer.write(data); - } - - void writeRaw(char c) throws IOException { - writer.write(c); - } - - void writeEscaped(String data) throws IOException { - int end = data.length(); - for (int i = 0; i < end; i++) { - char c = data.charAt(i); - switch (c) { - case '&': - writer.write("&"); - break; - case '<': - writer.write("<"); - break; - case '>': - writer.write(">"); - break; - case '"': - writer.write("""); - break; - default: - writer.write(c); - } - } - } - - private String indent(String element) { - int level = stack.size(); - stack.add(element); - return spaces(level); - } - - private String[] deIndent() { - String element = stack.remove(stack.size() - 1); - String space = spaces(stack.size()); - return new String[] {element, space}; - } - - private String spaces(int level) { - StringBuilder buffer = new StringBuilder(); + private final String encoding; + private final List<String> stack; + private final BufferedWriter writer; + + public XmlWriter(OutputStream stream) { + encoding = "utf-8"; + stack = new ArrayList<>(); + writer = new BufferedWriter(new OutputStreamWriter(stream, StandardCharsets.UTF_8)); + } + + /** + * @throws IOException Thrown if IO errors occurred. + */ + public void close() throws IOException { + writer.flush(); + writer.close(); + } + + /** + * Write the <?xml?> header. + * + * @throws IOException Thrown if IO errors occurred. + */ + public void openXml() throws IOException { + writeRaw("<?xml version=\"1.0\" encoding=\"" + encoding + "\" ?>\n"); + } + + /** + * In theory, we might close out open elements or such. + */ + public void closeXml() { + } + + + /** + * Write an empty element, such as <el/>, on a standalone line. + * Takes an optional dictionary of attributes. + * + * @throws IOException Thrown if IO errors occurred. 
+ */ + public void emptyElement(String element) throws IOException { + emptyElement(element, null); + } + + public void emptyElement(String element, String[][] attributes) throws IOException { + startElement(element, attributes, "/>\n"); + deIndent(); + } + + /** + * Write an element open tag, such as <el/>, on a standalone line. + * Takes an optional dictionary of attributes. + * + * @throws IOException Thrown if IO errors occurred. + */ + public void openElement(String element) throws IOException { + openElement(element, null); + } + + public void openElement(String element, String[][] attributes) throws IOException { + startElement(element, attributes, ">\n"); + } + + /** + * Write an element close tag, such as <el/>, on a standalone line. + * If indent=False is passed, indentation will not be added. + * + * @throws IOException Thrown if IO errors occurred. + */ + public void closeElement() throws IOException { + closeElement(true); + } + + public void closeElement(boolean indent) throws IOException { + String[] bits = deIndent(); + String element = bits[0]; + String space = bits[1]; + if (indent) + writeRaw(space + "</" + element + ">\n"); + else + writeRaw("</" + element + ">\n"); + } + + /** + * Write an element with a text node included, such as <el/>foo<el/>, + * on a standalone line. If the text is empty, an empty element will + * be output as <el/>. Takes an optional list of tuples with attribute + * names and values. + * + * @throws IOException Thrown if IO errors occurred. 
+ */ + public void textElement(String element, String text) throws IOException { + textElement(element, text, null); + } + + public void textElement(String element, String text, String[][] attributes) throws IOException { + if (text == null || text.length() == 0) { + emptyElement(element, attributes); + } else { + startElement(element, attributes, ">"); + writeEscaped(text); + closeElement(false); + } + } + + void startElement(String element, String[][] attributes, String terminator) throws IOException { + writeRaw(indent(element)); + writeRaw('<'); + writeRaw(element); + if (attributes != null) { + for (int i = 0; i < attributes.length; i++) { + writeRaw(' '); + writeRaw(attributes[i][0]); + writeRaw("=\""); + writeEscaped(attributes[i][1]); + writeRaw('"'); + } + } + writeRaw(terminator); + } + + /** + * Send an encoded Unicode string to the output stream. + * + * @throws IOException Thrown if IO errors occurred. + */ + void writeRaw(String data) throws IOException { + writer.write(data); + } + + void writeRaw(char c) throws IOException { + writer.write(c); + } + + void writeEscaped(String data) throws IOException { + int end = data.length(); + for (int i = 0; i < end; i++) { + char c = data.charAt(i); + switch (c) { + case '&': + writer.write("&"); + break; + case '<': + writer.write("<"); + break; + case '>': + writer.write(">"); + break; + case '"': + writer.write("""); + break; + default: + writer.write(c); + } + } + } + + private String indent(String element) { + int level = stack.size(); + stack.add(element); + return spaces(level); + } + + private String[] deIndent() { + String element = stack.remove(stack.size() - 1); + String space = spaces(stack.size()); + return new String[]{element, space}; + } + + private String spaces(int level) { + StringBuilder buffer = new StringBuilder(); buffer.append(" ".repeat(Math.max(0, level * 2))); - return buffer.toString(); - } + return buffer.toString(); + } } diff --git 
a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Content.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Content.java index c37099dc..168db2c7 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Content.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Content.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -24,99 +24,109 @@ * <br> * Be aware, that all retured Spans refer to the String returned by getText()<br> * this is true for any implementing class!<br> - * */ -public interface Content{ - - enum FormatType { - /** Bold Text */ - BOLD, - /** Italic Text */ - ITALIC, - /** The Content between Math Tags */ - MATH, - /** The Content between NoWiki Tags */ - NOWIKI, - /** The begin and end position of an unknown Tag defined by < and > */ - TAG, - } - - /** - * Returns the Text of the Element - */ +public interface Content { + + enum FormatType { + /** + * Bold Text + */ + BOLD, + /** + * Italic Text + */ + ITALIC, + /** + * The Content between Math Tags + */ + MATH, + /** + * The Content between NoWiki Tags + */ + NOWIKI, + /** + * The begin and end position of an unknown Tag defined by < and > + */ + TAG, + } + + /** + * Returns the Text of the Element + */ String getText(); - - /** - * Content.getText().length() == Content.length() - */ + + /** + * Content.getText().length() == Content.length() + */ int length(); - - /** - * Returns true, if there is no content in the element. - */ + + /** + * Returns true, if there is no content in the element. + */ boolean empty(); - /** - * returns the Format Spans of the Specified Type. - */ + /** + * returns the Format Spans of the Specified Type. + */ List<Span> getFormatSpans(FormatType t); - - /** - * returns the Format Spans of the Specified Type, in the Range from start to end. - */ + + /** + * returns the Format Spans of the Specified Type, in the Range from start to end. + */ List<Span> getFormatSpans(FormatType t, int start, int end); - - /** - * returns the Format Spans of the Specified Type, in the Range of s. - */ + + /** + * returns the Format Spans of the Specified Type, in the Range of s. + */ List<Span> getFormatSpans(FormatType t, Span s); - - /** - * returns the Formats uses in this element. - */ + + /** + * returns the Formats uses in this element. 
+ */ List<FormatType> getFormats(); - - /** - * returns the Formats uses in this element, in the Range from start to end. - */ + + /** + * returns the Formats uses in this element, in the Range from start to end. + */ List<FormatType> getFormats(int start, int end); - /** - * returns the Formats uses in this element, in the Range of s. - */ + + /** + * returns the Formats uses in this element, in the Range of s. + */ List<FormatType> getFormats(Span s); - - /** - * returns all Links of this element. - */ + + /** + * returns all Links of this element. + */ List<Link> getLinks(); - - /** - * returns all Links of this element of the specified type. - */ + + /** + * returns all Links of this element of the specified type. + */ List<Link> getLinks(Link.type t); - - /** - * returns all Links of this element of the specified type, in the Range from start to end. - */ + + /** + * returns all Links of this element of the specified type, in the Range from start to end. + */ List<Link> getLinks(Link.type t, int start, int end); - - /** - * returns all Links of this element of the specified type, in the Range of s - */ + + /** + * returns all Links of this element of the specified type, in the Range of s + */ List<Link> getLinks(Link.type t, Span s); - - /** - * returns all Templates. - */ + + /** + * returns all Templates. + */ List<Template> getTemplates(); - - /** - * returns all Templates, in the Range from start to end. - */ + + /** + * returns all Templates, in the Range from start to end. + */ List<Template> getTemplates(int start, int end); - - /** - * returns all Templates, in the Range of s. - */ + + /** + * returns all Templates, in the Range of s. 
+ */ List<Template> getTemplates(Span s); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentContainer.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentContainer.java index 1e218340..6ecd3877 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentContainer.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentContainer.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,238 +25,237 @@ * ContentElement.class!) in a new Content element.<br> * For a description of the Functions of the Content Interface, take a look at * the Content.class documentation.<br> - * */ public abstract class ContentContainer extends ParsedPageObject implements Content { - - protected List<Content> ccl; - - public boolean empty(){ - return ccl.size() == 0; - } - - public String getText(){ - - StringBuilder result = new StringBuilder(); - for( Content cc: ccl){ - if(cc!=null) result.append( cc.getText()+" " ); - } - - final int temp = result.length()-1; - if( temp >= 0 ) result.deleteCharAt(temp); - - return result.toString(); - } - - /** - * Returns the Text in the Span List in a String...<br> - * all Spans must refer to the text returned by getText(). 
- */ - public String getText(List<Span> sl){ - final String temp = getText(); - StringBuilder result = new StringBuilder(); - for( Span s: sl ) - result.append( s.getText( temp )+' ' ); - result.deleteCharAt( result.length()-1 ); - return result.toString(); - } - - public int length(){ - int length = 0; - - for( Content cc: ccl ) - if( cc!=null ) length += cc.length()+1; - - if( length > 0 ) length--; - - return length; - } - - /** - * Retruns the Number of Content elements in this ContentContainer. - */ - public int size(){ - return ccl.size(); - } - - public List<Span> getFormatSpans(FormatType t){ - List<Span> result = new ArrayList<>(); - int offset = 0; - for( Content c: ccl ){ - for( Span b : c.getFormatSpans(t) ) - result.add( b.clone().adjust( offset )); - - offset += 1 + c.length(); - } - return result; - } - - public List<Span> getFormatSpans(FormatType t, int start, int end ){ - return getFormatSpans(t, new Span( start, end ) ); - } - - public List<Span> getFormatSpans(FormatType t, Span s){ - List<Span> result = new ArrayList<>(); - - Span a = new Span( -1, -1 ); - - for( Content c: ccl ){ - int offset = a.getEnd()+1; - a = new Span( offset, offset+ c.length() ); - - if( a.hits(s) ){ - for( Span b: c.getFormatSpans( t, s.clone().adjust( -offset ) ) ) - result.add( b.clone().adjust( offset ) ); - } - } - return result; - } - - public List<FormatType> getFormats(){ - - boolean bold = false; - boolean italic = false; - boolean tag = false; - boolean math = false; - boolean nowiki = false; - - for( Content c: ccl ){ - - for( FormatType t: c.getFormats()) - switch(t){ - case BOLD: - bold = true; - break; - case ITALIC: - italic = true; - break; - case TAG: - tag = true; - break; - case MATH: - math = true; - break; - case NOWIKI: - nowiki = true; - break; - } - - if( bold && italic && tag && math && nowiki )break; - } - - List<FormatType> result = new ArrayList<>(); - if(bold) result.add(FormatType.BOLD); - if(italic) result.add(FormatType.ITALIC); - if(tag) 
result.add( FormatType.TAG ); - if(math) result.add( FormatType.MATH ); - if(nowiki) result.add( FormatType.NOWIKI ); - return result; - } - - public List<FormatType> getFormats(int start, int end){ - return getFormats(new Span(start, end) ); - } - - public List<FormatType> getFormats(Span s){ - boolean bold = false; - boolean italic = false; - boolean tag = false; - boolean math = false; - boolean nowiki = false; - - Span a = new Span( -1, -1 ); - - for( Content c: ccl ){ - int offset = a.getEnd()+1; - a = new Span( offset, offset+ c.length() ); - - if( a.hits(s) ) - for( FormatType t: c.getFormats( s.clone().adjust( -offset ) ) ) - switch(t){ - case BOLD: - bold = true; - break; - case ITALIC: - italic = true; - break; - case TAG: - tag = true; - break; - case MATH: - math = true; - break; - case NOWIKI: - nowiki = true; - break; - } - - if( bold&&italic )break; - } - - List<FormatType> result = new ArrayList<>(); - if(bold) result.add(FormatType.BOLD); - if(italic) result.add(FormatType.ITALIC); - if(tag) result.add( FormatType.TAG ); - if(math) result.add( FormatType.MATH ); - if(nowiki) result.add( FormatType.NOWIKI ); - return result; - } - - public List<Link> getLinks( Link.type linkType ){ - List<Link> result= new ArrayList<>(); - for( Content c: ccl ) result.addAll( c.getLinks( linkType )); - return result; - } - - public List<Link> getLinks( Link.type linkType, int start, int end){ - return getLinks( linkType, new Span( start, end )); - } - - public List<Link> getLinks( Link.type linkType, Span s){ - List<Link> result = new ArrayList<>(); - - Span a = new Span( -1, -1 ); - - for( Content c: ccl ){ - int offset = a.getEnd()+1; - a = new Span( offset, offset+ c.length() ); - - if( a.hits(s) ) - result.addAll( c.getLinks( linkType, s.clone().adjust( -offset ) ) ); - } - return result; - } - - public List<Link> getLinks(){ - List<Link> result = new ArrayList<>(); - for( Content c: ccl ) - result.addAll( c.getLinks() ); - return result; - } - - public 
List<Template> getTemplates(){ - List<Template> result = new ArrayList<>(); - for( Content cc: ccl ) - result.addAll( cc.getTemplates() ); - return result; - } - - public List<Template> getTemplates(int start, int end){ - return getTemplates( new Span(start, end )); - } - - public List<Template> getTemplates(Span s){ - List<Template> result = new ArrayList<>(); - - Span a = new Span( -1, -1 ); - - for( Content c: ccl ){ - int offset = a.getEnd()+1; - a = new Span( offset, offset+ c.length() ); - - if( a.hits(s) ) - result.addAll( c.getTemplates( s.clone().adjust( -offset ) ) ); - } - return result; - } + + protected List<Content> ccl; + + public boolean empty() { + return ccl.size() == 0; + } + + public String getText() { + + StringBuilder result = new StringBuilder(); + for (Content cc : ccl) { + if (cc != null) result.append(cc.getText() + " "); + } + + final int temp = result.length() - 1; + if (temp >= 0) result.deleteCharAt(temp); + + return result.toString(); + } + + /** + * Returns the Text in the Span List in a String...<br> + * all Spans must refer to the text returned by getText(). + */ + public String getText(List<Span> sl) { + final String temp = getText(); + StringBuilder result = new StringBuilder(); + for (Span s : sl) + result.append(s.getText(temp) + ' '); + result.deleteCharAt(result.length() - 1); + return result.toString(); + } + + public int length() { + int length = 0; + + for (Content cc : ccl) + if (cc != null) length += cc.length() + 1; + + if (length > 0) length--; + + return length; + } + + /** + * Retruns the Number of Content elements in this ContentContainer. 
+ */ + public int size() { + return ccl.size(); + } + + public List<Span> getFormatSpans(FormatType t) { + List<Span> result = new ArrayList<>(); + int offset = 0; + for (Content c : ccl) { + for (Span b : c.getFormatSpans(t)) + result.add(b.clone().adjust(offset)); + + offset += 1 + c.length(); + } + return result; + } + + public List<Span> getFormatSpans(FormatType t, int start, int end) { + return getFormatSpans(t, new Span(start, end)); + } + + public List<Span> getFormatSpans(FormatType t, Span s) { + List<Span> result = new ArrayList<>(); + + Span a = new Span(-1, -1); + + for (Content c : ccl) { + int offset = a.getEnd() + 1; + a = new Span(offset, offset + c.length()); + + if (a.hits(s)) { + for (Span b : c.getFormatSpans(t, s.clone().adjust(-offset))) + result.add(b.clone().adjust(offset)); + } + } + return result; + } + + public List<FormatType> getFormats() { + + boolean bold = false; + boolean italic = false; + boolean tag = false; + boolean math = false; + boolean nowiki = false; + + for (Content c : ccl) { + + for (FormatType t : c.getFormats()) + switch (t) { + case BOLD: + bold = true; + break; + case ITALIC: + italic = true; + break; + case TAG: + tag = true; + break; + case MATH: + math = true; + break; + case NOWIKI: + nowiki = true; + break; + } + + if (bold && italic && tag && math && nowiki) break; + } + + List<FormatType> result = new ArrayList<>(); + if (bold) result.add(FormatType.BOLD); + if (italic) result.add(FormatType.ITALIC); + if (tag) result.add(FormatType.TAG); + if (math) result.add(FormatType.MATH); + if (nowiki) result.add(FormatType.NOWIKI); + return result; + } + + public List<FormatType> getFormats(int start, int end) { + return getFormats(new Span(start, end)); + } + + public List<FormatType> getFormats(Span s) { + boolean bold = false; + boolean italic = false; + boolean tag = false; + boolean math = false; + boolean nowiki = false; + + Span a = new Span(-1, -1); + + for (Content c : ccl) { + int offset = a.getEnd() + 1; + 
a = new Span(offset, offset + c.length()); + + if (a.hits(s)) + for (FormatType t : c.getFormats(s.clone().adjust(-offset))) + switch (t) { + case BOLD: + bold = true; + break; + case ITALIC: + italic = true; + break; + case TAG: + tag = true; + break; + case MATH: + math = true; + break; + case NOWIKI: + nowiki = true; + break; + } + + if (bold && italic) break; + } + + List<FormatType> result = new ArrayList<>(); + if (bold) result.add(FormatType.BOLD); + if (italic) result.add(FormatType.ITALIC); + if (tag) result.add(FormatType.TAG); + if (math) result.add(FormatType.MATH); + if (nowiki) result.add(FormatType.NOWIKI); + return result; + } + + public List<Link> getLinks(Link.type linkType) { + List<Link> result = new ArrayList<>(); + for (Content c : ccl) result.addAll(c.getLinks(linkType)); + return result; + } + + public List<Link> getLinks(Link.type linkType, int start, int end) { + return getLinks(linkType, new Span(start, end)); + } + + public List<Link> getLinks(Link.type linkType, Span s) { + List<Link> result = new ArrayList<>(); + + Span a = new Span(-1, -1); + + for (Content c : ccl) { + int offset = a.getEnd() + 1; + a = new Span(offset, offset + c.length()); + + if (a.hits(s)) + result.addAll(c.getLinks(linkType, s.clone().adjust(-offset))); + } + return result; + } + + public List<Link> getLinks() { + List<Link> result = new ArrayList<>(); + for (Content c : ccl) + result.addAll(c.getLinks()); + return result; + } + + public List<Template> getTemplates() { + List<Template> result = new ArrayList<>(); + for (Content cc : ccl) + result.addAll(cc.getTemplates()); + return result; + } + + public List<Template> getTemplates(int start, int end) { + return getTemplates(new Span(start, end)); + } + + public List<Template> getTemplates(Span s) { + List<Template> result = new ArrayList<>(); + + Span a = new Span(-1, -1); + + for (Content c : ccl) { + int offset = a.getEnd() + 1; + a = new Span(offset, offset + c.length()); + + if (a.hits(s)) + 
result.addAll(c.getTemplates(s.clone().adjust(-offset))); + } + return result; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentElement.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentElement.java index a72f0b58..805378b7 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentElement.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentElement.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,285 +25,290 @@ * for nearly all content containing classes... 
* <p> * Be aware, that all retured Spans refer to the String returned by getText()<br> - * */ public class ContentElement extends ParsedPageObject implements Content { - - private String text; - private List<Span> boldSpans; - private List<Span> italicSpans; - private List<Link> links; - private List<Template> templates; - private List<Span> tags; - private List<Span> mathSpans; - private List<Span> noWikiSpans; - - public ContentElement(){ - text = ""; - links = new ArrayList<>(); - templates = new ArrayList<>(); - boldSpans = new ArrayList<>(); - italicSpans = new ArrayList<>(); - tags = new ArrayList<>(); - mathSpans = new ArrayList<>(); - noWikiSpans = new ArrayList<>(); - } - - /** - * Look at getText() for Details... - */ - public void setText(String text){ - this.text = text; - } - - /** - * Returns the Text on wich all elements of this ContentElement are bases on. - */ - public String getText(){ - return text; - } - - /** - * Returns the Text defined with the Spans in the List divided by a WS - */ - public String getText( List<Span> sl ){ - StringBuilder result = new StringBuilder(); - - for( Span s: sl ){ - result.append( s.getText( text )+' '); - } - int delChar = result.length()-1; - if(delChar>0) result.deleteCharAt( delChar ); - - return result.toString(); - } - - /** - * Retruns the length of the Text. Alternativ you can use getText().length() - */ - public int length(){ - return text.length(); - } - - /** - * Returns true if there is no Content in this ContentElement. - */ - public boolean empty(){ - return - text.length() == 0 && - links.size() == 0 && - templates.size() == 0 && - tags.size() == 0 && - mathSpans.size() == 0; - } - - /** - * Look at getFormatSpans for Details... 
- */ - public void setFormatSpans(FormatType t, List<Span> spans){ - switch( t ){ - case BOLD: boldSpans = spans; - break; - - case ITALIC: italicSpans = spans; - break; - - case TAG: tags = spans; - break; - - case MATH: mathSpans = spans; - break; - - case NOWIKI: noWikiSpans = spans; - break; - } - } - - /** - * Returns all the Spans of the Format type t. - */ - public List<Span> getFormatSpans(FormatType t){ - switch( t ){ - case BOLD: return boldSpans; - case ITALIC: return italicSpans; - case TAG: return tags; - case MATH: return mathSpans; - case NOWIKI: return noWikiSpans; - default: return null; - } - } - - /** - * Returns all the Spans of the Format type t in the Range of start to end - */ - public List<Span> getFormatSpans(FormatType t, int start, int end ){ - return getFormatSpans( t, new Span(start, end)); - } - - /** - * Returns all the Spans of the Format type t in the Range of the Span s - */ - public List<Span> getFormatSpans(FormatType t, Span s){ - List<Span> result = new ArrayList<>(); - for( Span s2: getFormatSpans(t) ) - if( s2.hits(s) )result.add( s2 ); - return result; - } - - /** - * Returns the Formats wich are used in this ContentElement in a List. - */ - public List<FormatType> getFormats(){ - List<FormatType> ftl= new ArrayList<>(); - if( boldSpans.size() != 0 ) ftl.add(FormatType.BOLD); - if( italicSpans.size() != 0) ftl.add(FormatType.ITALIC); - if( tags.size() != 0 ) ftl.add( FormatType.TAG ); - if( mathSpans.size() != 0 ) ftl.add( FormatType.MATH ); - if( noWikiSpans.size() != 0 ) ftl.add( FormatType.NOWIKI ); - return ftl; - } - - /** - * Returns the Formats wich are used in this ContentElement, in the Range from start to end, in a List. - */ - public List<FormatType> getFormats(int start, int end){ - return getFormats(new Span(start, end)); - } - - /** - * Returns the Formats wich are used in this ContentElement, in the Range of the Span s, in a List. 
- */ - public List<FormatType> getFormats(Span s){ - List<FormatType> result= new ArrayList<>(); - for(Span s2: boldSpans) - if( s.hits(s2) ){ - result.add( FormatType.BOLD ); - break; - } - - for(Span s2: italicSpans) - if( s.hits(s2) ){ - result.add( FormatType.ITALIC ); - break; - } - - return result; - } - - /** - * Look at getLinks() for Details... - */ - public void setLinks(List<Link> links){ - this.links = links; - } - - /** - * Retruns a List of the Links of this ContentElement - */ - public List<Link> getLinks(){ - return links; - } - - /** - * Returns a List of the Links of this ContentElement of the Specified Link.type t - */ - public List<Link> getLinks( Link.type t ){ - List<Link> result = new ArrayList<>(); - for( Link l: links ) - if( l.getType()==t )result.add(l); - return result; - } - - /** - * Returns a List of the Links of this ContentElement of the Specified Link.type t in the Range of s - */ - public List<Link> getLinks( Link.type t, Span s){ - List<Link> result = new ArrayList<>(); - for( Link l: links) - if( l.getType()==t && l.getPos().hits(s) ) result.add(l); - return result; - } - - /** - * Returns a List of the Links of this ContentElement of the Specified Link.type t in the Range of start to end - */ - public List<Link> getLinks( Link.type t, int begin, int end){ - return getLinks( t, new Span(begin, end) ); - } - - /** - * Look at getTemplates for Details... - */ - public void setTemplates( List<Template> templates){ - this.templates = templates; - } - - /** - * Returns a List of the Templates of this ContentElement. 
- */ - public List<Template> getTemplates(){ - return templates; - } - - /** - * Returns a List of the Templates of this ContentElement in the Range from start to end - */ - public List<Template> getTemplates(int start, int end){ - return getTemplates( new Span(start, end) ); - } - - /** - * Returns a List of the Templates of this ContentElement in the Range of s - */ - public List<Template> getTemplates(Span s){ - List<Template> result = new ArrayList<>(); - for( Template t: templates) - if( t.getPos().hits( s ) ) result.add( t ); - return result; - } - - /** - * Try and find out ;-) - */ - public String toString(){ - StringBuilder result = new StringBuilder(); - result.append("CE_TEXT: \"" + text + "\"" ); - - result.append("\nCE_BOLD_SPANS: "); - if( boldSpans != null ){ - result.append( boldSpans.size() ); - for( Span s: boldSpans ) result.append("\n\t"+ s+ " : \""+ s.getText(text) + "\""); - } - else result.append("ERROR: boldSpans == null"); - - result.append("\nCE_ITALIC_SPANS: "); - if( italicSpans != null ){ - result.append( italicSpans.size() ); - for( Span s: italicSpans ) result.append("\n\t"+s+" : \""+s.getText(text)+"\""); - } - else result.append("ERROR: italicSpans == null"); - - result.append("\nCE_LINKS: "); - if( links != null ){ - result.append( links.size() ); - for( Link l: links) result.append("\n"+ l ); - } - else result.append("ERROR: links == null"); - - result.append("\nCE_TEMPLATES: "); - if( templates != null ){ - result.append( templates.size() ); - for( Template t: templates) result.append("\n"+ t ); - } - else result.append("ERROR: templates == null"); - - result.append("\nCE_TAGS: "); - if( templates != null ){ - result.append( tags.size() ); - for( Span s: tags) result.append("\n"+ s ); - } - else result.append("ERROR: templates == null"); - - return result.toString(); - } + + private String text; + private List<Span> boldSpans; + private List<Span> italicSpans; + private List<Link> links; + private List<Template> templates; + 
private List<Span> tags; + private List<Span> mathSpans; + private List<Span> noWikiSpans; + + public ContentElement() { + text = ""; + links = new ArrayList<>(); + templates = new ArrayList<>(); + boldSpans = new ArrayList<>(); + italicSpans = new ArrayList<>(); + tags = new ArrayList<>(); + mathSpans = new ArrayList<>(); + noWikiSpans = new ArrayList<>(); + } + + /** + * Look at getText() for Details... + */ + public void setText(String text) { + this.text = text; + } + + /** + * Returns the Text on wich all elements of this ContentElement are bases on. + */ + public String getText() { + return text; + } + + /** + * Returns the Text defined with the Spans in the List divided by a WS + */ + public String getText(List<Span> sl) { + StringBuilder result = new StringBuilder(); + + for (Span s : sl) { + result.append(s.getText(text) + ' '); + } + int delChar = result.length() - 1; + if (delChar > 0) result.deleteCharAt(delChar); + + return result.toString(); + } + + /** + * Retruns the length of the Text. Alternativ you can use getText().length() + */ + public int length() { + return text.length(); + } + + /** + * Returns true if there is no Content in this ContentElement. + */ + public boolean empty() { + return + text.length() == 0 && + links.size() == 0 && + templates.size() == 0 && + tags.size() == 0 && + mathSpans.size() == 0; + } + + /** + * Look at getFormatSpans for Details... + */ + public void setFormatSpans(FormatType t, List<Span> spans) { + switch (t) { + case BOLD: + boldSpans = spans; + break; + + case ITALIC: + italicSpans = spans; + break; + + case TAG: + tags = spans; + break; + + case MATH: + mathSpans = spans; + break; + + case NOWIKI: + noWikiSpans = spans; + break; + } + } + + /** + * Returns all the Spans of the Format type t. 
+ */ + public List<Span> getFormatSpans(FormatType t) { + switch (t) { + case BOLD: + return boldSpans; + case ITALIC: + return italicSpans; + case TAG: + return tags; + case MATH: + return mathSpans; + case NOWIKI: + return noWikiSpans; + default: + return null; + } + } + + /** + * Returns all the Spans of the Format type t in the Range of start to end + */ + public List<Span> getFormatSpans(FormatType t, int start, int end) { + return getFormatSpans(t, new Span(start, end)); + } + + /** + * Returns all the Spans of the Format type t in the Range of the Span s + */ + public List<Span> getFormatSpans(FormatType t, Span s) { + List<Span> result = new ArrayList<>(); + for (Span s2 : getFormatSpans(t)) + if (s2.hits(s)) result.add(s2); + return result; + } + + /** + * Returns the Formats wich are used in this ContentElement in a List. + */ + public List<FormatType> getFormats() { + List<FormatType> ftl = new ArrayList<>(); + if (boldSpans.size() != 0) ftl.add(FormatType.BOLD); + if (italicSpans.size() != 0) ftl.add(FormatType.ITALIC); + if (tags.size() != 0) ftl.add(FormatType.TAG); + if (mathSpans.size() != 0) ftl.add(FormatType.MATH); + if (noWikiSpans.size() != 0) ftl.add(FormatType.NOWIKI); + return ftl; + } + + /** + * Returns the Formats wich are used in this ContentElement, in the Range from start to end, in a List. + */ + public List<FormatType> getFormats(int start, int end) { + return getFormats(new Span(start, end)); + } + + /** + * Returns the Formats wich are used in this ContentElement, in the Range of the Span s, in a List. + */ + public List<FormatType> getFormats(Span s) { + List<FormatType> result = new ArrayList<>(); + for (Span s2 : boldSpans) + if (s.hits(s2)) { + result.add(FormatType.BOLD); + break; + } + + for (Span s2 : italicSpans) + if (s.hits(s2)) { + result.add(FormatType.ITALIC); + break; + } + + return result; + } + + /** + * Look at getLinks() for Details... 
+ */ + public void setLinks(List<Link> links) { + this.links = links; + } + + /** + * Retruns a List of the Links of this ContentElement + */ + public List<Link> getLinks() { + return links; + } + + /** + * Returns a List of the Links of this ContentElement of the Specified Link.type t + */ + public List<Link> getLinks(Link.type t) { + List<Link> result = new ArrayList<>(); + for (Link l : links) + if (l.getType() == t) result.add(l); + return result; + } + + /** + * Returns a List of the Links of this ContentElement of the Specified Link.type t in the Range of s + */ + public List<Link> getLinks(Link.type t, Span s) { + List<Link> result = new ArrayList<>(); + for (Link l : links) + if (l.getType() == t && l.getPos().hits(s)) result.add(l); + return result; + } + + /** + * Returns a List of the Links of this ContentElement of the Specified Link.type t in the Range of start to end + */ + public List<Link> getLinks(Link.type t, int begin, int end) { + return getLinks(t, new Span(begin, end)); + } + + /** + * Look at getTemplates for Details... + */ + public void setTemplates(List<Template> templates) { + this.templates = templates; + } + + /** + * Returns a List of the Templates of this ContentElement. 
+ */ + public List<Template> getTemplates() { + return templates; + } + + /** + * Returns a List of the Templates of this ContentElement in the Range from start to end + */ + public List<Template> getTemplates(int start, int end) { + return getTemplates(new Span(start, end)); + } + + /** + * Returns a List of the Templates of this ContentElement in the Range of s + */ + public List<Template> getTemplates(Span s) { + List<Template> result = new ArrayList<>(); + for (Template t : templates) + if (t.getPos().hits(s)) result.add(t); + return result; + } + + /** + * Try and find out ;-) + */ + public String toString() { + StringBuilder result = new StringBuilder(); + result.append("CE_TEXT: \"" + text + "\""); + + result.append("\nCE_BOLD_SPANS: "); + if (boldSpans != null) { + result.append(boldSpans.size()); + for (Span s : boldSpans) result.append("\n\t" + s + " : \"" + s.getText(text) + "\""); + } else result.append("ERROR: boldSpans == null"); + + result.append("\nCE_ITALIC_SPANS: "); + if (italicSpans != null) { + result.append(italicSpans.size()); + for (Span s : italicSpans) result.append("\n\t" + s + " : \"" + s.getText(text) + "\""); + } else result.append("ERROR: italicSpans == null"); + + result.append("\nCE_LINKS: "); + if (links != null) { + result.append(links.size()); + for (Link l : links) result.append("\n" + l); + } else result.append("ERROR: links == null"); + + result.append("\nCE_TEMPLATES: "); + if (templates != null) { + result.append(templates.size()); + for (Template t : templates) result.append("\n" + t); + } else result.append("ERROR: templates == null"); + + result.append("\nCE_TAGS: "); + if (templates != null) { + result.append(tags.size()); + for (Span s : tags) result.append("\n" + s); + } else result.append("ERROR: templates == null"); + + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/DefinitionList.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/DefinitionList.java index 
ea117247..8674126c 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/DefinitionList.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/DefinitionList.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,90 +23,88 @@ /** * In a definition List exist a Defined Term with Zero or more Definitions. 
*/ -public class DefinitionList extends ContentContainer{ - - private ContentElement definedTerm; - private final List<ContentElement> definitions; - - public DefinitionList(){ - this.ccl = new ArrayList<>(); - this.definedTerm = null; - this.definitions = new ArrayList<>(); - } - - public DefinitionList( ContentElement definedTerm, List<ContentElement> definitions){ - this.ccl = new ArrayList<>(); - this.definedTerm = definedTerm; - this.definitions = definitions; - ccl.add( definedTerm ); - ccl.addAll( definitions ); - } - - /** - * content = definedTerm[+definition]* - */ - public DefinitionList( List<ContentElement> content ){ - this.ccl = new ArrayList<>(content); - this.definitions = new ArrayList<>(); - - if( content.size()>0 ){ - this.definedTerm = content.get(0); - if( content.size()>1){ - this.definitions.addAll(content); - this.definitions.remove(0); - } - } - else this.definedTerm = null; - } - - public String toString(){ - StringBuilder result = new StringBuilder(); - - result.append( "DL_DEFINEDTERM:\n"); - result.append( definedTerm ); - - if( definitions.size() != 0 ){ - result.append( "\nDL_DEFINITIONS:"); - for( ContentElement ce: definitions) result.append( "\n"+ce ); - } - - return result.toString(); - } - - public ContentElement getDefinedTerm(){ - return definedTerm; - } - - public void setDefinedTerm( ContentElement definedTerm ){ - if( definedTerm != null ){ - if( this.definedTerm == null ) ccl.add( 0, definedTerm ); - else ccl.set( 0, definedTerm ); - } - else if( this.definedTerm != null ) ccl.remove( this.definedTerm ); - - this.definedTerm = definedTerm; - } - - public int nrOfDefinitions(){ - return definitions.size(); - } - - public void removeDefinition( ContentElement ce ){ - definitions.remove(ce); - ccl.remove(ce); - } - - public void addDefiniton( ContentElement ce ){ - definitions.add(ce); - ccl.add(ce); - } - - public ContentElement getDefinition(int i){ - if( definitions.size()>i ) return definitions.get(i); - else return null; 
- } - - public List<ContentElement> getDefinitions(){ - return new ArrayList<>(definitions); - } +public class DefinitionList extends ContentContainer { + + private ContentElement definedTerm; + private final List<ContentElement> definitions; + + public DefinitionList() { + this.ccl = new ArrayList<>(); + this.definedTerm = null; + this.definitions = new ArrayList<>(); + } + + public DefinitionList(ContentElement definedTerm, List<ContentElement> definitions) { + this.ccl = new ArrayList<>(); + this.definedTerm = definedTerm; + this.definitions = definitions; + ccl.add(definedTerm); + ccl.addAll(definitions); + } + + /** + * content = definedTerm[+definition]* + */ + public DefinitionList(List<ContentElement> content) { + this.ccl = new ArrayList<>(content); + this.definitions = new ArrayList<>(); + + if (content.size() > 0) { + this.definedTerm = content.get(0); + if (content.size() > 1) { + this.definitions.addAll(content); + this.definitions.remove(0); + } + } else this.definedTerm = null; + } + + public String toString() { + StringBuilder result = new StringBuilder(); + + result.append("DL_DEFINEDTERM:\n"); + result.append(definedTerm); + + if (definitions.size() != 0) { + result.append("\nDL_DEFINITIONS:"); + for (ContentElement ce : definitions) result.append("\n" + ce); + } + + return result.toString(); + } + + public ContentElement getDefinedTerm() { + return definedTerm; + } + + public void setDefinedTerm(ContentElement definedTerm) { + if (definedTerm != null) { + if (this.definedTerm == null) ccl.add(0, definedTerm); + else ccl.set(0, definedTerm); + } else if (this.definedTerm != null) ccl.remove(this.definedTerm); + + this.definedTerm = definedTerm; + } + + public int nrOfDefinitions() { + return definitions.size(); + } + + public void removeDefinition(ContentElement ce) { + definitions.remove(ce); + ccl.remove(ce); + } + + public void addDefiniton(ContentElement ce) { + definitions.add(ce); + ccl.add(ce); + } + + public ContentElement 
getDefinition(int i) { + if (definitions.size() > i) return definitions.get(i); + else return null; + } + + public List<ContentElement> getDefinitions() { + return new ArrayList<>(definitions); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Link.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Link.java index 2502e986..bd5a392a 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Link.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Link.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,113 +20,127 @@ import java.util.ArrayList; import java.util.List; -public class Link extends ParsedPageObject{ - - private Content home_cc; - private final type t; - private final Span pos; - private final String target; - private final List<String> parameters; - - public enum type {EXTERNAL, INTERNAL, AUDIO, VIDEO, IMAGE, UNKNOWN} - - public Link( Content home_cc, Span linkPos, String target, type t, List<String> parameters ){ - this.home_cc = home_cc; - this.pos = linkPos; - this.target = target; - this.t = t; - this.parameters = (parameters==null? new ArrayList<>():parameters); - } - - /** - * Returns the Content Element in wich the Link occures. 
- */ - public Content getHomeElement(){ return home_cc; } - public Link setHomeElement(Content home_cc){ this.home_cc = home_cc; return this; } - - /** - * Returns the Type of the Link. - */ - public type getType(){ return t; } - - /** - * Retruns the Position Span of the Link, wich refers to getHomeElement().getText(). - */ - public Span getPos(){ return pos; } - - /** - * Retruns the Target of the Link. - */ - public String getTarget(){ return target; } - - /** - * Returns a List of Parameters for this Link, in most cases the size of the list will be 0. - */ - public List<String> getParameters(){ return parameters; } - - /** - * Retruns the Link text or link caption. - */ - public String getText(){ - if( home_cc == null ) { - return null; - } - return pos.getText( home_cc.getText() ); - } - - /** - * Returns the Number of Words left and right of the Link, in the Bounds of the - * HomeElement of this Link. - */ - public String getContext(int wordsLeft, int wordsRight){ - final String text = home_cc.getText(); - int temp; - - // get the left start position - int posLeft = pos.getStart(); - temp = posLeft-1; - while( posLeft != 0 && wordsLeft > 0 ){ - while( temp > 0 && text.charAt( temp ) < 48 ) { - temp--; - } - while( temp > 0 && text.charAt( temp ) >= 48 ) { - temp--; - } - posLeft = ( temp>0 ? temp+1 : 0 ); - wordsLeft--; - } - - // get the right end position - int posRight = pos.getEnd(); - temp = posRight; - while( posRight != text.length() && wordsRight > 0 ){ - while( temp < text.length() && text.charAt( temp ) < 48 ) { - temp++; - } - while( temp < text.length() && text.charAt( temp ) >= 48 ) { - temp++; - } - posRight = temp; - wordsRight--; - } - - // retrun a string... 
- return - text.substring(posLeft, pos.getStart() ) + - text.substring(pos.getEnd(), posRight); - } - - @Override - public String toString(){ - StringBuilder result = new StringBuilder(); - result.append( "LI_TYPE: "+t ); - result.append("\nLI_TARGET: \""+ target + "\""); - result.append("\nLI_TEXT: \""+ getText() +"\""); - result.append("\nLI_POSITION: \""+ pos + "\""); - result.append( "\nLI_PARAMETERS: "+parameters.size() ); - for( String s: parameters ) { - result.append("\nLI_PARAMETER: \""+ s +"\"" ); - } - return result.toString(); - } +public class Link extends ParsedPageObject { + + private Content home_cc; + private final type t; + private final Span pos; + private final String target; + private final List<String> parameters; + + public enum type {EXTERNAL, INTERNAL, AUDIO, VIDEO, IMAGE, UNKNOWN} + + public Link(Content home_cc, Span linkPos, String target, type t, List<String> parameters) { + this.home_cc = home_cc; + this.pos = linkPos; + this.target = target; + this.t = t; + this.parameters = (parameters == null ? new ArrayList<>() : parameters); + } + + /** + * Returns the Content Element in wich the Link occures. + */ + public Content getHomeElement() { + return home_cc; + } + + public Link setHomeElement(Content home_cc) { + this.home_cc = home_cc; + return this; + } + + /** + * Returns the Type of the Link. + */ + public type getType() { + return t; + } + + /** + * Retruns the Position Span of the Link, wich refers to getHomeElement().getText(). + */ + public Span getPos() { + return pos; + } + + /** + * Retruns the Target of the Link. + */ + public String getTarget() { + return target; + } + + /** + * Returns a List of Parameters for this Link, in most cases the size of the list will be 0. + */ + public List<String> getParameters() { + return parameters; + } + + /** + * Retruns the Link text or link caption. 
+ */ + public String getText() { + if (home_cc == null) { + return null; + } + return pos.getText(home_cc.getText()); + } + + /** + * Returns the Number of Words left and right of the Link, in the Bounds of the + * HomeElement of this Link. + */ + public String getContext(int wordsLeft, int wordsRight) { + final String text = home_cc.getText(); + int temp; + + // get the left start position + int posLeft = pos.getStart(); + temp = posLeft - 1; + while (posLeft != 0 && wordsLeft > 0) { + while (temp > 0 && text.charAt(temp) < 48) { + temp--; + } + while (temp > 0 && text.charAt(temp) >= 48) { + temp--; + } + posLeft = (temp > 0 ? temp + 1 : 0); + wordsLeft--; + } + + // get the right end position + int posRight = pos.getEnd(); + temp = posRight; + while (posRight != text.length() && wordsRight > 0) { + while (temp < text.length() && text.charAt(temp) < 48) { + temp++; + } + while (temp < text.length() && text.charAt(temp) >= 48) { + temp++; + } + posRight = temp; + wordsRight--; + } + + // retrun a string... 
+ return + text.substring(posLeft, pos.getStart()) + + text.substring(pos.getEnd(), posRight); + } + + @Override + public String toString() { + StringBuilder result = new StringBuilder(); + result.append("LI_TYPE: " + t); + result.append("\nLI_TARGET: \"" + target + "\""); + result.append("\nLI_TEXT: \"" + getText() + "\""); + result.append("\nLI_POSITION: \"" + pos + "\""); + result.append("\nLI_PARAMETERS: " + parameters.size()); + for (String s : parameters) { + result.append("\nLI_PARAMETER: \"" + s + "\""); + } + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/LinkAnchorExtractor.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/LinkAnchorExtractor.java index e1b0b883..9726fd80 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/LinkAnchorExtractor.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/LinkAnchorExtractor.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -29,97 +29,93 @@ import org.dkpro.jwpl.parser.mediawiki.MediaWikiParser; import org.dkpro.jwpl.parser.mediawiki.MediaWikiParserFactory; -public class LinkAnchorExtractor -{ +public class LinkAnchorExtractor { - private final MediaWikiParser parser; + private final MediaWikiParser parser; - public LinkAnchorExtractor(){ - MediaWikiParserFactory pf = new MediaWikiParserFactory(Language.english); - parser = pf.createParser(); - } + public LinkAnchorExtractor() { + MediaWikiParserFactory pf = new MediaWikiParserFactory(Language.english); + parser = pf.createParser(); + } - public LinkAnchorExtractor(Language lang){ - MediaWikiParserFactory pf = new MediaWikiParserFactory(lang); - parser = pf.createParser(); - } + public LinkAnchorExtractor(Language lang) { + MediaWikiParserFactory pf = new MediaWikiParserFactory(lang); + parser = pf.createParser(); + } - public LinkAnchorExtractor(MediaWikiParser parser){ - this.parser=parser; - } + public LinkAnchorExtractor(MediaWikiParser parser) { + this.parser = parser; + } - /** - * Note that this method only returns the anchors that are not equal to the page's title. - * Anchors might contain references to sections in an article in the form of "Page#Section". - * If you need the plain title, e.g. for checking whether the page exists in Wikipedia, the Title object can be used. - * - * @return A set of strings used as anchor texts in links pointing to that page. - * @throws WikiTitleParsingException - */ - public Set<String> getInlinkAnchors(Page page) - throws WikiTitleParsingException - { - Set<String> inAnchors = new HashSet<>(); - for (Page p : page.getInlinks()) { - ParsedPage pp = parser.parse(p.getText()); - if (pp == null) { - return inAnchors; - } - for (Link l : pp.getLinks()) { - String pageTitle = page.getTitle().getPlainTitle(); + /** + * Note that this method only returns the anchors that are not equal to the page's title. 
+ * Anchors might contain references to sections in an article in the form of "Page#Section". + * If you need the plain title, e.g. for checking whether the page exists in Wikipedia, the Title object can be used. + * + * @return A set of strings used as anchor texts in links pointing to that page. + * @throws WikiTitleParsingException + */ + public Set<String> getInlinkAnchors(Page page) + throws WikiTitleParsingException { + Set<String> inAnchors = new HashSet<>(); + for (Page p : page.getInlinks()) { + ParsedPage pp = parser.parse(p.getText()); + if (pp == null) { + return inAnchors; + } + for (Link l : pp.getLinks()) { + String pageTitle = page.getTitle().getPlainTitle(); - String anchorText = l.getText(); - if (l.getTarget().equals(pageTitle) && !anchorText.equals(pageTitle)) { - inAnchors.add(anchorText); - } - } - } - return inAnchors; - } + String anchorText = l.getText(); + if (l.getTarget().equals(pageTitle) && !anchorText.equals(pageTitle)) { + inAnchors.add(anchorText); + } + } + } + return inAnchors; + } - /** - * Note that this method only returns the anchors that are not equal to the title of the page - * they are pointing to. - * Anchors might contain references to sections in an article in the form of "Page#Section". - * If you need the plain title, e.g. for checking whether the page exists in Wikipedia, the Title object can be used. - * - * @return A mapping from the page titles of links in that page to the anchor texts used in the - * links. - * @throws WikiTitleParsingException - */ - public Map<String, Set<String>> getOutlinkAnchors(Page page) - throws WikiTitleParsingException - { - Map<String, Set<String>> outAnchors = new HashMap<>(); - ParsedPage pp = parser.parse(page.getText()); - if (pp == null) { - return outAnchors; - } - for (Link l : pp.getLinks()) { - if (l.getTarget().length() == 0) { - continue; - } + /** + * Note that this method only returns the anchors that are not equal to the title of the page + * they are pointing to. 
+ * Anchors might contain references to sections in an article in the form of "Page#Section". + * If you need the plain title, e.g. for checking whether the page exists in Wikipedia, the Title object can be used. + * + * @return A mapping from the page titles of links in that page to the anchor texts used in the + * links. + * @throws WikiTitleParsingException + */ + public Map<String, Set<String>> getOutlinkAnchors(Page page) + throws WikiTitleParsingException { + Map<String, Set<String>> outAnchors = new HashMap<>(); + ParsedPage pp = parser.parse(page.getText()); + if (pp == null) { + return outAnchors; + } + for (Link l : pp.getLinks()) { + if (l.getTarget().length() == 0) { + continue; + } - String targetTitle = new Title(l.getTarget()).getPlainTitle(); - if (!l.getType().equals(Link.type.EXTERNAL) && !l.getType().equals(Link.type.IMAGE) - && !l.getType().equals(Link.type.AUDIO) && !l.getType().equals(Link.type.VIDEO) - && !targetTitle.contains(":")) // Wikipedia titles only contain colons if they - // are categories or other meta data - { - String anchorText = l.getText(); - if (!anchorText.equals(targetTitle)) { - Set<String> anchors; - if (outAnchors.containsKey(targetTitle)) { - anchors = outAnchors.get(targetTitle); - } - else { - anchors = new HashSet<>(); - } - anchors.add(anchorText); - outAnchors.put(targetTitle, anchors); - } - } - } - return outAnchors; - } + String targetTitle = new Title(l.getTarget()).getPlainTitle(); + if (!l.getType().equals(Link.type.EXTERNAL) && !l.getType().equals(Link.type.IMAGE) + && !l.getType().equals(Link.type.AUDIO) && !l.getType().equals(Link.type.VIDEO) + && !targetTitle.contains(":")) // Wikipedia titles only contain colons if they + // are categories or other meta data + { + String anchorText = l.getText(); + if (!anchorText.equals(targetTitle)) { + Set<String> anchors; + if (outAnchors.containsKey(targetTitle)) { + anchors = outAnchors.get(targetTitle); + } else { + anchors = new HashSet<>(); + } + 
anchors.add(anchorText); + outAnchors.put(targetTitle, anchors); + } + } + } + return outAnchors; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedList.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedList.java index dba4280c..b56b7b40 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedList.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedList.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,12 +20,10 @@ /** * A NestedList can contain ContentElements or other NestedLists, * for this purpose and to avoid a improper use, this interface has been created.<br> - * + * <p> * Now, we got a NestedListContainer wich contains NestedLists<br> * A NestedList can be a NestedListContainer or a NestedListElement. 
- * - * */ public interface NestedList extends Content { - SrcSpan getSrcSpan(); + SrcSpan getSrcSpan(); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListContainer.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListContainer.java index 20373e4b..42c66798 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListContainer.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListContainer.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,52 +22,53 @@ /** * Take a Look a NestedList description first. - * */ -public class NestedListContainer extends ContentContainer implements NestedList{ - - private final List<NestedList> lists; - private final boolean numbered; - - public NestedListContainer(boolean numbered){ - this.ccl = new ArrayList<>(); - this.lists = new ArrayList<>(); - this.numbered = numbered; - } - - /** - * Returns if the NestedList is a numbered or a unnumbered/pointed NestedList - */ - public boolean isNumbered(){ return numbered; } - - /** - * Returns the NestedListContainer or NestedListElement at Positon i. 
- */ - public NestedList getNestedList(int i){ - if( i<lists.size() ) return lists.get(i); - else return null; - } - - public void add( NestedList nl ){ - lists.add(nl); - ccl.add(nl); - } - - public void remove( NestedList nl ){ - lists.remove( nl ); - ccl.remove( nl ); - } - - public List<NestedList> getNestedLists(){ - return new ArrayList<>(lists); - } - - public String toString(){ - StringBuilder result = new StringBuilder(); - result.append("NLS_NUMBERD: "+ numbered); - result.append("\nNLS_CONTENT: false"); - result.append("\nNLS_NESTEDTLISTS: "+lists.size()); - for( NestedList l: lists ) result.append( "\nNLS_NESTEDLIST:\n"+ l); - return result.toString(); - } +public class NestedListContainer extends ContentContainer implements NestedList { + + private final List<NestedList> lists; + private final boolean numbered; + + public NestedListContainer(boolean numbered) { + this.ccl = new ArrayList<>(); + this.lists = new ArrayList<>(); + this.numbered = numbered; + } + + /** + * Returns if the NestedList is a numbered or a unnumbered/pointed NestedList + */ + public boolean isNumbered() { + return numbered; + } + + /** + * Returns the NestedListContainer or NestedListElement at Positon i. 
+ */ + public NestedList getNestedList(int i) { + if (i < lists.size()) return lists.get(i); + else return null; + } + + public void add(NestedList nl) { + lists.add(nl); + ccl.add(nl); + } + + public void remove(NestedList nl) { + lists.remove(nl); + ccl.remove(nl); + } + + public List<NestedList> getNestedLists() { + return new ArrayList<>(lists); + } + + public String toString() { + StringBuilder result = new StringBuilder(); + result.append("NLS_NUMBERD: " + numbered); + result.append("\nNLS_CONTENT: false"); + result.append("\nNLS_NESTEDTLISTS: " + lists.size()); + for (NestedList l : lists) result.append("\nNLS_NESTEDLIST:\n" + l); + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListElement.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListElement.java index ab7cdf93..7ab89a79 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListElement.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListElement.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,9 +19,10 @@ /** * This is a simple ContentElement, wich occures in a NestedList. 
- * */ -public class NestedListElement extends ContentElement implements NestedList{ +public class NestedListElement extends ContentElement implements NestedList { - public String toString(){ return "NLC_IS_CONTENT: true\n"+ super.toString(); } + public String toString() { + return "NLC_IS_CONTENT: true\n" + super.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Paragraph.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Paragraph.java index 5ce2ff63..89694505 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Paragraph.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Paragraph.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,35 +19,34 @@ /** * This is a simple ContentElement extende with a Paragraph Type. 
- * */ public class Paragraph extends ContentElement { - - public enum type {NORMAL, BOXED, INDENTED} - - private type t; - - public Paragraph(){ - super(); - } - - public Paragraph( type t){ - super(); - this.t = t; - } - - public String toString(){ - StringBuilder result = new StringBuilder(); - result.append( super.toString() ); - result.append( System.getProperty("line.separator") + "PA_TYPE: " + t); - return result.toString(); - } - - public void setType( type t ){ - this.t = t; - } - - public type getType(){ - return t; - } + + public enum type {NORMAL, BOXED, INDENTED} + + private type t; + + public Paragraph() { + super(); + } + + public Paragraph(type t) { + super(); + this.t = t; + } + + public String toString() { + StringBuilder result = new StringBuilder(); + result.append(super.toString()); + result.append(System.getProperty("line.separator") + "PA_TYPE: " + t); + return result.toString(); + } + + public void setType(type t) { + this.t = t; + } + + public type getType() { + return t; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPage.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPage.java index 3f3083e1..7663b78d 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPage.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPage.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,138 +25,148 @@ /** * Provides access to structured information about a MediaWiki article page. */ -public class ParsedPage{ - - private String name; - private int pageId; - private SectionContainer superSection; - private ContentElement categories; - private ContentElement languages; - - private int firstParagraphNr; +public class ParsedPage { + + private String name; + private int pageId; + private SectionContainer superSection; + private ContentElement categories; + private ContentElement languages; + + private int firstParagraphNr; // private ContentElement aboutArticle; - - /** - * Constructor for a blank ParsedPage.<br> - * Only needed, if you want to create a Wikipedia article from scratch. - * <p> - * Creating a ParsedPage from a Wikipedia article requires to create a parser object first. - * - */ - public ParsedPage(){ - this.superSection = new SectionContainer(null,0); - } - - /** - * Sets the name of a parsed page. - * @param name A name for the parsed page. - */ - public void setName( String name ){ - this.name = name; - } - - /** - * The name of a parsed page. - * @return The name of a parsed page. - */ - public String getName(){ - return name; - } - - /** - * Set the pageId of a parsed page. - * @param pageId A pageId for the parsed page. - */ - public void setPageId( int pageId ){ - this.pageId = pageId; - } - - /** - * The pageId of a parsed page. - * @return The pageId of a parsed page. - */ - public int getPageId(){ - return pageId; - } - - /** - * Sets the category element of a parsed page. - * @param categories A ContentElement containg the categories of a page. 
- */ - public void setCategoryElement( ContentElement categories ){ - this.categories = categories; - } - - /** - * The ContentElement with links to a page's categories. - * @return The ContentElement with links to a page's categories. - */ - public ContentElement getCategoryElement(){ - return this.categories; - } - - /** - * Returns a list of category Link objects. - * This is a shortcut for writing getCategoryElemement.getLinks(); - * @return A list of category links. - */ - public List<Link> getCategories(){ - if (categories == null) { - return new ArrayList<>(); - } - - return categories.getLinks(); - } - - /** - * Sets the number of the first paragraph. - * @param nr The number of the first paragraph. - */ - public void setFirstParagraphNr(int nr) { - this.firstParagraphNr = nr; - } - - /** - * Returns the number of the first paragraph. - * @return The number of the first paragraph. - */ - public int getFirstParagraphNr(){ - return firstParagraphNr; - } - - /** - * Returns the first paragraph.<br> - * This is a shortcut for getParagraph( getFirstParagraphNr() ). - * It is <b>not</b> the same as getParagraph( 0 ), because the physically first paragraph often contain tables etc. - */ - public Paragraph getFirstParagraph() { - return this.getParagraph(firstParagraphNr); - } - - /** - * Sets the language element of a parsed page. - * @param languages A ContentElement containg the languages of a page. - */ - public void setLanguagesElement( ContentElement languages ){ - this.languages = languages; - } - - /** - * Returns a ContentElement containing the languages that are linked inside the article. - * @return A ContentElement containing the languages that are linked inside the article. - */ - public ContentElement getLanguagesElement(){ - return languages; - } - - /** - * Returns a list of language Link objects. 
- * This is a shortcut for writing getLanguagesElement().getLinks(); - */ - public List<Link> getLanguages(){ - return languages.getLinks(); - } - + + /** + * Constructor for a blank ParsedPage.<br> + * Only needed, if you want to create a Wikipedia article from scratch. + * <p> + * Creating a ParsedPage from a Wikipedia article requires to create a parser object first. + */ + public ParsedPage() { + this.superSection = new SectionContainer(null, 0); + } + + /** + * Sets the name of a parsed page. + * + * @param name A name for the parsed page. + */ + public void setName(String name) { + this.name = name; + } + + /** + * The name of a parsed page. + * + * @return The name of a parsed page. + */ + public String getName() { + return name; + } + + /** + * Set the pageId of a parsed page. + * + * @param pageId A pageId for the parsed page. + */ + public void setPageId(int pageId) { + this.pageId = pageId; + } + + /** + * The pageId of a parsed page. + * + * @return The pageId of a parsed page. + */ + public int getPageId() { + return pageId; + } + + /** + * Sets the category element of a parsed page. + * + * @param categories A ContentElement containg the categories of a page. + */ + public void setCategoryElement(ContentElement categories) { + this.categories = categories; + } + + /** + * The ContentElement with links to a page's categories. + * + * @return The ContentElement with links to a page's categories. + */ + public ContentElement getCategoryElement() { + return this.categories; + } + + /** + * Returns a list of category Link objects. + * This is a shortcut for writing getCategoryElemement.getLinks(); + * + * @return A list of category links. + */ + public List<Link> getCategories() { + if (categories == null) { + return new ArrayList<>(); + } + + return categories.getLinks(); + } + + /** + * Sets the number of the first paragraph. + * + * @param nr The number of the first paragraph. 
+ */ + public void setFirstParagraphNr(int nr) { + this.firstParagraphNr = nr; + } + + /** + * Returns the number of the first paragraph. + * + * @return The number of the first paragraph. + */ + public int getFirstParagraphNr() { + return firstParagraphNr; + } + + /** + * Returns the first paragraph.<br> + * This is a shortcut for getParagraph( getFirstParagraphNr() ). + * It is <b>not</b> the same as getParagraph( 0 ), because the physically first paragraph often contain tables etc. + */ + public Paragraph getFirstParagraph() { + return this.getParagraph(firstParagraphNr); + } + + /** + * Sets the language element of a parsed page. + * + * @param languages A ContentElement containg the languages of a page. + */ + public void setLanguagesElement(ContentElement languages) { + this.languages = languages; + } + + /** + * Returns a ContentElement containing the languages that are linked inside the article. + * + * @return A ContentElement containing the languages that are linked inside the article. + */ + public ContentElement getLanguagesElement() { + return languages; + } + + /** + * Returns a list of language Link objects. + * This is a shortcut for writing getLanguagesElement().getLinks(); + */ + public List<Link> getLanguages() { + return languages.getLinks(); + } + //// I do not think that this should be a core api method, as it is language and template dependend. (TZ) // /** // * Returns a ContentElement with the Content of "Dieser Artikel" Template @@ -173,129 +183,173 @@ public List<Link> getLanguages(){ // } - /** - * Sets the Sections of a ParsedPage. - * @param sections A list of sections. - */ - public void setSections( List<Section> sections ){ - for( Section s: sections ) superSection.addSection(s); - } - - /** - * Set the Sections of the ParsedPage.<br> - * This function is used to upgrade a SectionContainer to a ParsedPage. - * @param s A sectionContainer. 
- */ - public void setSections( SectionContainer s ){ - superSection = s; - } - + /** + * Sets the Sections of a ParsedPage. + * + * @param sections A list of sections. + */ + public void setSections(List<Section> sections) { + for (Section s : sections) superSection.addSection(s); + } + + /** + * Set the Sections of the ParsedPage.<br> + * This function is used to upgrade a SectionContainer to a ParsedPage. + * + * @param s A sectionContainer. + */ + public void setSections(SectionContainer s) { + superSection = s; + } + // TODO What means lowest level? => TZ: I think it means "highest" semantically and "lowest" in numbering (e.g. <h1>). - /** - * Returns the requested Section of the lowest level. - * @param i The number of the section. - * @return The section with number i. - */ - public Section getSection(int i){ - return superSection.getSubSection(i); - } - - /** - * Retruns a list of all Sections of the lowest level. - * @return A list of sections. - */ - public List<Section> getSections(){ - return superSection.getSubSections(); - } - - /* - * Returns pageId and name in a String - */ - public String toString(){ - return "ParsedPage " + pageId + " " + name; - } - - /** - * Returns the number of paragraphs. - * @return The number of paragraphs. - */ - public int nrOfParagraphs(){ return superSection.nrOfParagraphs(); } - - /** - * Returns the paragraph indicated by the parameter i. - * @param i The number of the paragraph to return. - * @return The paragraph with number i. - */ - public Paragraph getParagraph(int i){return superSection.getParagraph(i); } - - /** - * Returns a list of paragraphs. - * @return A list of paragraphs. - */ - public List<Paragraph> getParagraphs(){ return superSection.getParagraphs(); } - - /** - * Returns the number of tables. - * @return The number of tables. - */ - public int nrOfTables(){ return superSection.nrOfTables(); } - - /** - * Returns the table indicated by the parameter i. - * @param i The number of the table to return. 
- * @return The table with number i. - */ - public Table getTable(int i){ return superSection.getTable(i); } - - /** - * Returns a list of tables. - * @return A list of tables. - */ - public List<Table> getTables(){ return superSection.getTables(); } - - /** - * Returns the number of nested lists. - * @return The number of nested lists. - */ - public int nrOfNestedLists(){ return superSection.nrOfNestedLists(); } - - /** - * Returns the nested list indicated by the parameter i. - * @param i The number of the nested list to return. - * @return The nested list with number i. - */ - public NestedList getNestedList(int i){ return superSection.getNestedList(i); } - - /** - * Returns a list of nested lists. - * @return A list of nested lists. - */ - public List<NestedListContainer> getNestedLists(){ return superSection.getNestedLists(); } - - /** - * Returns the number of definition lists. - * @return The number of definition lists. - */ - public int nrOfDefinitionLists(){ return superSection.nrOfDefinitionLists(); } - - /** - * Returns the definition list indicated by the parameter i. - * @param i The number of the definition list to return. - * @return The definition list with number i. - */ - public DefinitionList getDefinitionList(int i){ return superSection.getDefinitionList(i); } - - /** - * Returns a list of definition lists. - * @return A list of definition lists. - */ - public List<DefinitionList> getDefinitionLists(){ return superSection.getDefinitionLists(); } - - /** - * Return the plain text. - * @return The plain text. - */ - public String getText(){ return superSection.getText(); } + + /** + * Returns the requested Section of the lowest level. + * + * @param i The number of the section. + * @return The section with number i. + */ + public Section getSection(int i) { + return superSection.getSubSection(i); + } + + /** + * Retruns a list of all Sections of the lowest level. + * + * @return A list of sections. 
+ */ + public List<Section> getSections() { + return superSection.getSubSections(); + } + + /* + * Returns pageId and name in a String + */ + public String toString() { + return "ParsedPage " + pageId + " " + name; + } + + /** + * Returns the number of paragraphs. + * + * @return The number of paragraphs. + */ + public int nrOfParagraphs() { + return superSection.nrOfParagraphs(); + } + + /** + * Returns the paragraph indicated by the parameter i. + * + * @param i The number of the paragraph to return. + * @return The paragraph with number i. + */ + public Paragraph getParagraph(int i) { + return superSection.getParagraph(i); + } + + /** + * Returns a list of paragraphs. + * + * @return A list of paragraphs. + */ + public List<Paragraph> getParagraphs() { + return superSection.getParagraphs(); + } + + /** + * Returns the number of tables. + * + * @return The number of tables. + */ + public int nrOfTables() { + return superSection.nrOfTables(); + } + + /** + * Returns the table indicated by the parameter i. + * + * @param i The number of the table to return. + * @return The table with number i. + */ + public Table getTable(int i) { + return superSection.getTable(i); + } + + /** + * Returns a list of tables. + * + * @return A list of tables. + */ + public List<Table> getTables() { + return superSection.getTables(); + } + + /** + * Returns the number of nested lists. + * + * @return The number of nested lists. + */ + public int nrOfNestedLists() { + return superSection.nrOfNestedLists(); + } + + /** + * Returns the nested list indicated by the parameter i. + * + * @param i The number of the nested list to return. + * @return The nested list with number i. + */ + public NestedList getNestedList(int i) { + return superSection.getNestedList(i); + } + + /** + * Returns a list of nested lists. + * + * @return A list of nested lists. 
+ */ + public List<NestedListContainer> getNestedLists() { + return superSection.getNestedLists(); + } + + /** + * Returns the number of definition lists. + * + * @return The number of definition lists. + */ + public int nrOfDefinitionLists() { + return superSection.nrOfDefinitionLists(); + } + + /** + * Returns the definition list indicated by the parameter i. + * + * @param i The number of the definition list to return. + * @return The definition list with number i. + */ + public DefinitionList getDefinitionList(int i) { + return superSection.getDefinitionList(i); + } + + /** + * Returns a list of definition lists. + * + * @return A list of definition lists. + */ + public List<DefinitionList> getDefinitionLists() { + return superSection.getDefinitionLists(); + } + + /** + * Return the plain text. + * + * @return The plain text. + */ + public String getText() { + return superSection.getText(); + } //// TODO we should not need that as we could call getText on the span itself. // /** @@ -304,37 +358,49 @@ public String toString(){ // public String getText( List<Span> sl ){ return superSection.getText( sl ); } - /** - * Returns the length of the text in characters. - * @return The length of the text in characters. - */ - public int length(){ return superSection.length(); } + /** + * Returns the length of the text in characters. + * + * @return The length of the text in characters. + */ + public int length() { + return superSection.length(); + } - public List<FormatType> getFormats(){ return superSection.getFormats(); } + public List<FormatType> getFormats() { + return superSection.getFormats(); + } ////I do not know what these are for and they are never used (TZ). 
// public List<FormatType> getFormats(int begin, int end){ return superSection.getFormats(begin,end); } // public List<FormatType> getFormats(Span s){ return superSection.getFormats(s); } - public List<Span> getFormatSpans(FormatType t){ return superSection.getFormatSpans(t); } + public List<Span> getFormatSpans(FormatType t) { + return superSection.getFormatSpans(t); + } ////I do not know what these are for and they are never used (TZ). // public List<Span> getFormatSpans(FormatType t, int start, int end ){ return superSection.getFormatSpans(t, start, end); } // public List<Span> getFormatSpans(FormatType t, Span s){ return superSection.getFormatSpans(t, s); } - - public List<Link> getLinks(){ return superSection.getLinks(); } + + public List<Link> getLinks() { + return superSection.getLinks(); + } ////I do not know what these are for and they are never used (TZ). // public List<Link> getLinks(Link.type t){ return superSection.getLinks(t); } // public List<Link> getLinks(Link.type t, int begin, int end){ return superSection.getLinks(t, begin, end); } // public List<Link> getLinks(Link.type t, Span s){ return superSection.getLinks(t, s); } - /** - * Returns a list of templates that are used in the page. - * @return A list of templates that are used in the page. - */ - public List<Template> getTemplates(){ return superSection.getTemplates(); } + /** + * Returns a list of templates that are used in the page. + * + * @return A list of templates that are used in the page. + */ + public List<Template> getTemplates() { + return superSection.getTemplates(); + } //// I do not know what these are for and they are never used (TZ). 
// public List<Template> getTemplates(int start, int end){ return superSection.getTemplates(start, end); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPageObject.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPageObject.java index d8363061..9c13b3c3 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPageObject.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPageObject.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,19 +21,18 @@ * All clases in parsedpage package, which can be created by a * parser, extending this class. So it is possible for these * classes to refer to a SourceCode. - * */ public abstract class ParsedPageObject { - private SrcSpan srcSpan; + private SrcSpan srcSpan; - /** - * Returns a Span refering to a SourceCode. - */ - public SrcSpan getSrcSpan() { - return srcSpan; - } + /** + * Returns a Span refering to a SourceCode. 
+ */ + public SrcSpan getSrcSpan() { + return srcSpan; + } - public void setSrcSpan(SrcSpan srcSpan) { - this.srcSpan = srcSpan; - } + public void setSrcSpan(SrcSpan srcSpan) { + this.srcSpan = srcSpan; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Section.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Section.java index 8948db76..8514fc00 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Section.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Section.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -32,128 +32,132 @@ * makes the accest to the provieded Structures very simple.<br> * <br> * These structure requirements are implemented as SectionContainer an SectionContent. - * */ public abstract class Section extends ContentContainer { - - private int level; - private ContentElement title; - - public Section(ContentElement title, int level){ - this.ccl = new ArrayList<>(); - this.level = level; - this.title = title; - if( title!=null ) ccl.add( title ); - } - - /** - * Look at getLevel() for Details... - */ - public void setLevel(int level){ this.level = level; } - - /** - * Retruns the hirachical Level of this Section. 
- */ - public int getLevel(){ return level; } - - /** - * Returns getTitleElement().getText() without NullPointerException - */ - public String getTitle(){ - if( title!=null ) - return title.getText(); - else - return null; - } - - /** - * Look at getTitleElement() for Details... - */ - public void setTitleElement( ContentElement title ){ - if( title != null ){ - if( this.title == null ) ccl.add( 0, title ); - else ccl.set( 0, title ); - } - else if( this.title != null ) ccl.remove( this.title ); - - this.title = title; - } - - /** - * Returns a ContentElement representing the content, originally given as - * MediaWiki SourcCode, beween one ore more equality chars at the beginning - * of a line. This is known as Title. - */ - public ContentElement getTitleElement(){ return title; } - - /** - * Return a List with all Content of any Type in Order of appearance. - */ - public abstract List<Content> getContentList(); - - /** - * Returns the Number of Paragraphs in this Section. - */ - public abstract int nrOfParagraphs(); - - /** - * Returns the i-th Paragraph of this Section. - */ - public abstract Paragraph getParagraph(int i); - - /** - * Retuns a List of all Paragraphs of this Section. - */ - public abstract List<Paragraph> getParagraphs(); - - /** - * Returns the Number of Tables of this Section. - */ - public abstract int nrOfTables(); - - /** - * Returns the i-th Table of this Section. - */ - public abstract Table getTable(int i); - - /** - * Returns a List of all Tables of this Section. - */ - public abstract List<Table> getTables(); - - /** - * Returns the Number of NestedLists of this Section. - */ - public abstract int nrOfNestedLists(); - - /** - * Returns the i-th NestedList of this Section as NestedListContainer. - */ - public abstract NestedListContainer getNestedList(int i); - - /** - * Returns a List of all NestedLists of this Section. 
- */ - public abstract List<NestedListContainer> getNestedLists(); - - /** - * Returns the Number of DefinitionLists of this Section. - */ - public abstract int nrOfDefinitionLists(); - - /** - * Returns the i-th Table of this Section. - */ - public abstract DefinitionList getDefinitionList(int i); - - /** - * Returns a List of all DefinitionLists of this Section. - */ - public abstract List<DefinitionList> getDefinitionLists(); - - /** - * Returns a sequence of Chars followed by ZERO. - * For easy handling the result is of the Type String. - */ - public abstract String toString(); + + private int level; + private ContentElement title; + + public Section(ContentElement title, int level) { + this.ccl = new ArrayList<>(); + this.level = level; + this.title = title; + if (title != null) ccl.add(title); + } + + /** + * Look at getLevel() for Details... + */ + public void setLevel(int level) { + this.level = level; + } + + /** + * Retruns the hirachical Level of this Section. + */ + public int getLevel() { + return level; + } + + /** + * Returns getTitleElement().getText() without NullPointerException + */ + public String getTitle() { + if (title != null) + return title.getText(); + else + return null; + } + + /** + * Look at getTitleElement() for Details... + */ + public void setTitleElement(ContentElement title) { + if (title != null) { + if (this.title == null) ccl.add(0, title); + else ccl.set(0, title); + } else if (this.title != null) ccl.remove(this.title); + + this.title = title; + } + + /** + * Returns a ContentElement representing the content, originally given as + * MediaWiki SourcCode, beween one ore more equality chars at the beginning + * of a line. This is known as Title. + */ + public ContentElement getTitleElement() { + return title; + } + + /** + * Return a List with all Content of any Type in Order of appearance. + */ + public abstract List<Content> getContentList(); + + /** + * Returns the Number of Paragraphs in this Section. 
+ */ + public abstract int nrOfParagraphs(); + + /** + * Returns the i-th Paragraph of this Section. + */ + public abstract Paragraph getParagraph(int i); + + /** + * Retuns a List of all Paragraphs of this Section. + */ + public abstract List<Paragraph> getParagraphs(); + + /** + * Returns the Number of Tables of this Section. + */ + public abstract int nrOfTables(); + + /** + * Returns the i-th Table of this Section. + */ + public abstract Table getTable(int i); + + /** + * Returns a List of all Tables of this Section. + */ + public abstract List<Table> getTables(); + + /** + * Returns the Number of NestedLists of this Section. + */ + public abstract int nrOfNestedLists(); + + /** + * Returns the i-th NestedList of this Section as NestedListContainer. + */ + public abstract NestedListContainer getNestedList(int i); + + /** + * Returns a List of all NestedLists of this Section. + */ + public abstract List<NestedListContainer> getNestedLists(); + + /** + * Returns the Number of DefinitionLists of this Section. + */ + public abstract int nrOfDefinitionLists(); + + /** + * Returns the i-th Table of this Section. + */ + public abstract DefinitionList getDefinitionList(int i); + + /** + * Returns a List of all DefinitionLists of this Section. + */ + public abstract List<DefinitionList> getDefinitionLists(); + + /** + * Returns a sequence of Chars followed by ZERO. + * For easy handling the result is of the Type String. + */ + public abstract String toString(); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContainer.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContainer.java index ebef1111..0b07b944 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContainer.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContainer.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,167 +26,166 @@ * can be either, a SectionContent or anoter SectionContainer.<br> * For a description of the inherited functions of Section, take a * look at the Documentation of Section. - * */ public class SectionContainer extends Section { - - private final List<Section> sections; - - public SectionContainer(int level){ - super( null, level ); - sections = new ArrayList<>(); - } - - public SectionContainer(ContentElement title, int level){ - super( title, level ); - sections = new ArrayList<>(); - } - - /** - * Returns the Number of SubSection of this Section. - */ - public int nrOfSubSections(){ - return sections.size(); - } - - /** - * Adds a SubSection after the last SubSection. - */ - public void addSection( Section s ){ - sections.add( s ); - ccl.add( s ); - } - - /** - * Removes the specified Section. - */ - public void removeSection( Section s ){ - sections.remove( s ); - ccl.remove( s ); - } - - /** - * Returns the i�th SubSection of this Section. - */ - public Section getSubSection(int i){ - if( sections.size() > i ) return sections.get(i); - else return null; - } - - /** - * Returns a List of all SubSections of the next level. 
- */ - public List<Section> getSubSections(){ - return new ArrayList<>(sections); - } - - /* (non-Javadoc) - * @see org.tud.ukp.wikipedia.api.pageparser.Section#getContentList() - */ - public List<Content> getContentList(){ - return new ArrayList<>(ccl); - } - - public int nrOfParagraphs(){ - int result = 0; - for( Section s: sections ) result+= s.nrOfParagraphs(); - return result; - } - - public Paragraph getParagraph(int i){ - int nr; - int offset = 0; - for( Section s: sections ){ - nr = s.nrOfParagraphs(); - if( nr+offset > i )return s.getParagraph(i-offset); - offset += nr; - } - return null; - } - - public List<Paragraph> getParagraphs(){ - List<Paragraph> result = new ArrayList<>(); - for( Section s: sections ) result.addAll( s.getParagraphs() ); - return result; - } - - public int nrOfTables(){ - int result = 0; - for( Section s: sections ) result+= s.nrOfTables(); - return result; - } - - public Table getTable(int i){ - int nr; - int offset = 0; - for( Section s: sections ){ - nr = s.nrOfTables(); - if( nr+offset > i )return s.getTable(i-offset); - offset += nr; - } - return null; - } - - public List<Table> getTables(){ - List<Table> result = new ArrayList<>(); - for( Section s: sections ) result.addAll( s.getTables() ); - return result; - } - - public int nrOfNestedLists(){ - int result = 0; - for( Section s: sections )result += s.nrOfNestedLists(); - return result; - } - - public NestedListContainer getNestedList(int i){ - int nr; - int offset = 0; - for( Section s: sections ){ - nr = s.nrOfNestedLists(); - if( nr+offset > i )return s.getNestedList(i-offset); - offset += nr; - } - return null; - } - - public List<NestedListContainer> getNestedLists(){ - List<NestedListContainer> result = new ArrayList<>(); - for( Section s: sections ) result.addAll( s.getNestedLists() ); - return result; - } - - public int nrOfDefinitionLists(){ - int result = 0; - for( Section s: sections ) result+= s.nrOfDefinitionLists(); - return result; - } - - public DefinitionList 
getDefinitionList(int i){ - int nr; - int offset = 0; - for( Section s: sections ){ - nr = s.nrOfDefinitionLists(); - if( nr+offset > i )return s.getDefinitionList(i-offset); - offset += nr; - } - return null; - } - - public List<DefinitionList> getDefinitionLists(){ - List<DefinitionList> result = new ArrayList<>(); - for( Section s: sections ) result.addAll( s.getDefinitionLists() ); - return result; - } - - public String toString(){ - StringBuilder result = new StringBuilder(); - result.append( "SS_TITLE:\n"+ this.getTitleElement() ); - result.append( "\nSS_LEVEL: "+this.getLevel()); - result.append( "\nSS_SUBSECTIONS: "+ sections.size() ); - for( Section s: sections ) - result.append("\nSS_SUBSECTION:\n"+s.toString()); - - return result.toString(); - } + + private final List<Section> sections; + + public SectionContainer(int level) { + super(null, level); + sections = new ArrayList<>(); + } + + public SectionContainer(ContentElement title, int level) { + super(title, level); + sections = new ArrayList<>(); + } + + /** + * Returns the Number of SubSection of this Section. + */ + public int nrOfSubSections() { + return sections.size(); + } + + /** + * Adds a SubSection after the last SubSection. + */ + public void addSection(Section s) { + sections.add(s); + ccl.add(s); + } + + /** + * Removes the specified Section. + */ + public void removeSection(Section s) { + sections.remove(s); + ccl.remove(s); + } + + /** + * Returns the i�th SubSection of this Section. + */ + public Section getSubSection(int i) { + if (sections.size() > i) return sections.get(i); + else return null; + } + + /** + * Returns a List of all SubSections of the next level. 
+ */ + public List<Section> getSubSections() { + return new ArrayList<>(sections); + } + + /* (non-Javadoc) + * @see org.tud.ukp.wikipedia.api.pageparser.Section#getContentList() + */ + public List<Content> getContentList() { + return new ArrayList<>(ccl); + } + + public int nrOfParagraphs() { + int result = 0; + for (Section s : sections) result += s.nrOfParagraphs(); + return result; + } + + public Paragraph getParagraph(int i) { + int nr; + int offset = 0; + for (Section s : sections) { + nr = s.nrOfParagraphs(); + if (nr + offset > i) return s.getParagraph(i - offset); + offset += nr; + } + return null; + } + + public List<Paragraph> getParagraphs() { + List<Paragraph> result = new ArrayList<>(); + for (Section s : sections) result.addAll(s.getParagraphs()); + return result; + } + + public int nrOfTables() { + int result = 0; + for (Section s : sections) result += s.nrOfTables(); + return result; + } + + public Table getTable(int i) { + int nr; + int offset = 0; + for (Section s : sections) { + nr = s.nrOfTables(); + if (nr + offset > i) return s.getTable(i - offset); + offset += nr; + } + return null; + } + + public List<Table> getTables() { + List<Table> result = new ArrayList<>(); + for (Section s : sections) result.addAll(s.getTables()); + return result; + } + + public int nrOfNestedLists() { + int result = 0; + for (Section s : sections) result += s.nrOfNestedLists(); + return result; + } + + public NestedListContainer getNestedList(int i) { + int nr; + int offset = 0; + for (Section s : sections) { + nr = s.nrOfNestedLists(); + if (nr + offset > i) return s.getNestedList(i - offset); + offset += nr; + } + return null; + } + + public List<NestedListContainer> getNestedLists() { + List<NestedListContainer> result = new ArrayList<>(); + for (Section s : sections) result.addAll(s.getNestedLists()); + return result; + } + + public int nrOfDefinitionLists() { + int result = 0; + for (Section s : sections) result += s.nrOfDefinitionLists(); + return result; + } 
+ + public DefinitionList getDefinitionList(int i) { + int nr; + int offset = 0; + for (Section s : sections) { + nr = s.nrOfDefinitionLists(); + if (nr + offset > i) return s.getDefinitionList(i - offset); + offset += nr; + } + return null; + } + + public List<DefinitionList> getDefinitionLists() { + List<DefinitionList> result = new ArrayList<>(); + for (Section s : sections) result.addAll(s.getDefinitionLists()); + return result; + } + + public String toString() { + StringBuilder result = new StringBuilder(); + result.append("SS_TITLE:\n" + this.getTitleElement()); + result.append("\nSS_LEVEL: " + this.getLevel()); + result.append("\nSS_SUBSECTIONS: " + sections.size()); + for (Section s : sections) + result.append("\nSS_SUBSECTION:\n" + s.toString()); + + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContent.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContent.java index dfec5676..c622106d 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContent.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContent.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -26,137 +26,141 @@ * For a description of the inherited functions of Section, take a * look at the Documentation of Section. */ -public class SectionContent extends Section{ - - private List<Paragraph> paragraphs; - private List<Table> tables; - private List<NestedListContainer> nestedLists; - private List<DefinitionList> definitionLists; - - public SectionContent(int level){ - super( null, level ); - init(); - } - - public SectionContent(ContentElement title, int level){ - super( title, level ); - init(); - } - - private void init(){ - paragraphs = new ArrayList<>(); - tables = new ArrayList<>(); - nestedLists = new ArrayList<>(); - definitionLists = new ArrayList<>(); - } - - public List<Content> getContentList(){ - return new ArrayList<>(ccl); - } - - public int nrOfParagraphs(){ return paragraphs.size(); } - - public void addParagraph( Paragraph p ){ - paragraphs.add( p ); - ccl.add( p ); - } - - public void removeParagraph( Paragraph p ){ - paragraphs.remove( p ); - ccl.remove( p ); - } - - public Paragraph getParagraph(int i){ - if( paragraphs.size()> i) return paragraphs.get(i); - else return null; - } - - public List<Paragraph> getParagraphs(){ - return new ArrayList<>(paragraphs); - } - - public int nrOfTables(){ - return tables.size(); - } - - public void addTable( Table t ){ - tables.add( t ); - ccl.add( t ); - } - - public void removeTable( Table t ){ - tables.remove( t ); - ccl.remove( t ); - } - - public Table getTable(int i){ - if( tables.size()>i) return tables.get(i); - else return null; - } - - public List<Table> getTables(){ - return new ArrayList<>(tables); - } - - public int nrOfNestedLists(){ - return nestedLists.size(); - } - - public void addNestedList( NestedListContainer nl ){ - nestedLists.add( nl ); - ccl.add( nl ); - } - - public void removeNestedList( NestedListContainer nl ){ - nestedLists.remove( nl ); - ccl.remove( nl ); - } - - public NestedListContainer getNestedList(int i){ - if( nestedLists.size() > i ) return 
nestedLists.get(i); - else return null; - } - - public List<NestedListContainer> getNestedLists(){ - return new ArrayList<>(nestedLists); - } - - public int nrOfDefinitionLists(){ - return definitionLists.size(); - } - - public void addDefinitionList( DefinitionList dl ){ - definitionLists.add( dl ); - ccl.add( dl ); - } - - public void removeDefinitionList( DefinitionList dl ){ - definitionLists.remove( dl ); - ccl.remove( dl ); - } - - public DefinitionList getDefinitionList(int i){ - if( definitionLists.size() > i ) return definitionLists.get(i); - else return null; - } - - public List<DefinitionList> getDefinitionLists(){ return new ArrayList<>(definitionLists); } - - public String toString(){ - StringBuilder result = new StringBuilder(); - - result.append( "SC_TITLE:\n"+this.getTitleElement() ); - result.append( "\nSC_LEVEL: "+this.getLevel()); - - result.append("\nSC_PARAGRAPHS: "+paragraphs.size()); - for( Paragraph p: paragraphs) result.append( "\nSC_PARAGRAPH:\n"+p ); - result.append("\nSC_TABLES: "+tables.size()); - for( Table t: tables) result.append("\nSC_TABLE:\n"+ t); - result.append("\nSC_NESTED_LISTS: "+nestedLists.size()); - for( NestedList nl: nestedLists) result.append("\nSC_NESTED_LIST:\n"+nl); - result.append("\nSC_DEFINITON_LISTS: "+definitionLists.size()); - for( DefinitionList dl: definitionLists)result.append("\nSC_DEFINITION_LIST:\n"+dl); - - return result.toString(); - } +public class SectionContent extends Section { + + private List<Paragraph> paragraphs; + private List<Table> tables; + private List<NestedListContainer> nestedLists; + private List<DefinitionList> definitionLists; + + public SectionContent(int level) { + super(null, level); + init(); + } + + public SectionContent(ContentElement title, int level) { + super(title, level); + init(); + } + + private void init() { + paragraphs = new ArrayList<>(); + tables = new ArrayList<>(); + nestedLists = new ArrayList<>(); + definitionLists = new ArrayList<>(); + } + + public 
List<Content> getContentList() { + return new ArrayList<>(ccl); + } + + public int nrOfParagraphs() { + return paragraphs.size(); + } + + public void addParagraph(Paragraph p) { + paragraphs.add(p); + ccl.add(p); + } + + public void removeParagraph(Paragraph p) { + paragraphs.remove(p); + ccl.remove(p); + } + + public Paragraph getParagraph(int i) { + if (paragraphs.size() > i) return paragraphs.get(i); + else return null; + } + + public List<Paragraph> getParagraphs() { + return new ArrayList<>(paragraphs); + } + + public int nrOfTables() { + return tables.size(); + } + + public void addTable(Table t) { + tables.add(t); + ccl.add(t); + } + + public void removeTable(Table t) { + tables.remove(t); + ccl.remove(t); + } + + public Table getTable(int i) { + if (tables.size() > i) return tables.get(i); + else return null; + } + + public List<Table> getTables() { + return new ArrayList<>(tables); + } + + public int nrOfNestedLists() { + return nestedLists.size(); + } + + public void addNestedList(NestedListContainer nl) { + nestedLists.add(nl); + ccl.add(nl); + } + + public void removeNestedList(NestedListContainer nl) { + nestedLists.remove(nl); + ccl.remove(nl); + } + + public NestedListContainer getNestedList(int i) { + if (nestedLists.size() > i) return nestedLists.get(i); + else return null; + } + + public List<NestedListContainer> getNestedLists() { + return new ArrayList<>(nestedLists); + } + + public int nrOfDefinitionLists() { + return definitionLists.size(); + } + + public void addDefinitionList(DefinitionList dl) { + definitionLists.add(dl); + ccl.add(dl); + } + + public void removeDefinitionList(DefinitionList dl) { + definitionLists.remove(dl); + ccl.remove(dl); + } + + public DefinitionList getDefinitionList(int i) { + if (definitionLists.size() > i) return definitionLists.get(i); + else return null; + } + + public List<DefinitionList> getDefinitionLists() { + return new ArrayList<>(definitionLists); + } + + public String toString() { + StringBuilder result 
= new StringBuilder(); + + result.append("SC_TITLE:\n" + this.getTitleElement()); + result.append("\nSC_LEVEL: " + this.getLevel()); + + result.append("\nSC_PARAGRAPHS: " + paragraphs.size()); + for (Paragraph p : paragraphs) result.append("\nSC_PARAGRAPH:\n" + p); + result.append("\nSC_TABLES: " + tables.size()); + for (Table t : tables) result.append("\nSC_TABLE:\n" + t); + result.append("\nSC_NESTED_LISTS: " + nestedLists.size()); + for (NestedList nl : nestedLists) result.append("\nSC_NESTED_LIST:\n" + nl); + result.append("\nSC_DEFINITON_LISTS: " + definitionLists.size()); + for (DefinitionList dl : definitionLists) result.append("\nSC_DEFINITION_LIST:\n" + dl); + + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Span.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Span.java index a8427ef8..8ccec238 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Span.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Span.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,154 +20,155 @@ /** * Provides a Start and End Position... 
*/ -public class Span extends ParsedPageObject{ - - private int start; - private int end; - - public Span(int start, int end) { - this.start = start; - this.end = end; - } - - public int getStart(){ - return start; - } - - public Span setStart( int start ){ - this.start = start; - return this; - } - - public Span adjustStart( int n ){ - start+=n; - return this; - } - - public int getEnd(){ - return end; - } - - public Span setEnd( int end ){ - this.end = end; - return this; - } - - public Span adjustEnd( int n ){ - end+=n; - return this; - } - - public Span adjust( int n ){ return adjust(0,n); } - - /** - * Adjusts the start and end Position of the Span, if they are - * larger than the offset. - */ - public Span adjust(int offset, int n){ - if( offset < 0 )return this; //null - - if( offset < end ){ - end += n; - if( end<offset )end = offset; - } - else return this; //null - - if( offset < start ){ - start += n; - if( start<offset )start = offset; - } - return this; - } - - public boolean equals(int start, int end){ - return ((this.start == start )&&( this.end == end)); - } - - public boolean equals(Span s){ - return ((this.start == s.getStart() )&&( this.end == s.getEnd() )); - } - - /** - * returns true if this Span is in the range of the Span s. - */ - public boolean hits( Span s ){ - return start < s.getEnd() && s.getStart() < end; - } - - public String toString(){ - return "("+start+", "+end+")"; - } - - /** - * simply src.substring( this.getStart(), this.getEnd ); - */ - public String getText(String src){ - if (end > src.length()) { - end = src.length(); - } - return src.substring(start, end); - } - - /** - * A defined ErrorChar which will be returnd when an error occures.<br> - * An ErrorChar seems to be more easy to handle than e.g. an IndexOutOfBoundsException. 
- */ - public static final char ERRORCHAR = 0; - - public char charAt(int pos, CharSequence cs){ - if( pos + start < end ) return cs.charAt( start + pos ); - else return ERRORCHAR; - } - - public int nonWSCharPos(CharSequence cs){ - int pos=0; - while( charAt(pos, cs)==' ' )pos++; - return pos; - } - - /** - * Returns the Span, with trailing whitespaces omitted. - */ - public Span trimTrail(CharSequence src){ - if( start<end ){ - while(src.charAt( end-1 ) == 32){ - end--; - if( start==end )break; - } - } - return this; - } - - /** - * Returns the Span, with leading and trailing whitespaces omitted. - */ - public Span trim( CharSequence src ){ - if( start<end ) - while(src.charAt( end-1 ) == 32){ - end--; - if( start==end )break; - } - - if( start<end ) - while( src.charAt( start ) == 32){ - start++; - if( start==end)break; - } - - return this; - } - - /** - * returns this.getEnd()-this.getStart() - */ - public int length(){ - return end-start; - } - - public Span clone(){ - Span result = new Span( start, end ); - result.setSrcSpan( this.getSrcSpan() ); - return result; - } +public class Span extends ParsedPageObject { + + private int start; + private int end; + + public Span(int start, int end) { + this.start = start; + this.end = end; + } + + public int getStart() { + return start; + } + + public Span setStart(int start) { + this.start = start; + return this; + } + + public Span adjustStart(int n) { + start += n; + return this; + } + + public int getEnd() { + return end; + } + + public Span setEnd(int end) { + this.end = end; + return this; + } + + public Span adjustEnd(int n) { + end += n; + return this; + } + + public Span adjust(int n) { + return adjust(0, n); + } + + /** + * Adjusts the start and end Position of the Span, if they are + * larger than the offset. 
+ */ + public Span adjust(int offset, int n) { + if (offset < 0) return this; //null + + if (offset < end) { + end += n; + if (end < offset) end = offset; + } else return this; //null + + if (offset < start) { + start += n; + if (start < offset) start = offset; + } + return this; + } + + public boolean equals(int start, int end) { + return ((this.start == start) && (this.end == end)); + } + + public boolean equals(Span s) { + return ((this.start == s.getStart()) && (this.end == s.getEnd())); + } + + /** + * returns true if this Span is in the range of the Span s. + */ + public boolean hits(Span s) { + return start < s.getEnd() && s.getStart() < end; + } + + public String toString() { + return "(" + start + ", " + end + ")"; + } + + /** + * simply src.substring( this.getStart(), this.getEnd ); + */ + public String getText(String src) { + if (end > src.length()) { + end = src.length(); + } + return src.substring(start, end); + } + + /** + * A defined ErrorChar which will be returnd when an error occures.<br> + * An ErrorChar seems to be more easy to handle than e.g. an IndexOutOfBoundsException. + */ + public static final char ERRORCHAR = 0; + + public char charAt(int pos, CharSequence cs) { + if (pos + start < end) return cs.charAt(start + pos); + else return ERRORCHAR; + } + + public int nonWSCharPos(CharSequence cs) { + int pos = 0; + while (charAt(pos, cs) == ' ') pos++; + return pos; + } + + /** + * Returns the Span, with trailing whitespaces omitted. + */ + public Span trimTrail(CharSequence src) { + if (start < end) { + while (src.charAt(end - 1) == 32) { + end--; + if (start == end) break; + } + } + return this; + } + + /** + * Returns the Span, with leading and trailing whitespaces omitted. 
+ */ + public Span trim(CharSequence src) { + if (start < end) + while (src.charAt(end - 1) == 32) { + end--; + if (start == end) break; + } + + if (start < end) + while (src.charAt(start) == 32) { + start++; + if (start == end) break; + } + + return this; + } + + /** + * returns this.getEnd()-this.getStart() + */ + public int length() { + return end - start; + } + + public Span clone() { + Span result = new Span(start, end); + result.setSrcSpan(this.getSrcSpan()); + return result; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SrcSpan.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SrcSpan.java index a2027529..707cab78 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SrcSpan.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SrcSpan.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -21,47 +21,47 @@ * */ public class SrcSpan { - private int start; - private int end; - - /** - * @param start is the startposition of the Object in the original MediaWikiSource - * @param end is the endposition of the Object in the original MediaWikiSource - */ - public SrcSpan(int start, int end) { - this.start = start; - this.end = end; - } + private int start; + private int end; + + /** + * @param start is the startposition of the Object in the original MediaWikiSource + * @param end is the endposition of the Object in the original MediaWikiSource + */ + public SrcSpan(int start, int end) { + this.start = start; + this.end = end; + } + + /** + * Look at Constructor for Details... + */ + public int getEnd() { + return end; + } - /** - * Look at Constructor for Details... - */ - public int getEnd() { - return end; - } + /** + * Look at Constructor for Details... + */ + public void setEnd(int end) { + this.end = end; + } - /** - * Look at Constructor for Details... - */ - public void setEnd(int end) { - this.end = end; - } + /** + * Look at Constructor for Details... + */ + public int getStart() { + return start; + } - /** - * Look at Constructor for Details... - */ - public int getStart() { - return start; - } + /** + * Look at Constructor for Details... + */ + public void setStart(int start) { + this.start = start; + } - /** - * Look at Constructor for Details... 
- */ - public void setStart(int start) { - this.start = start; - } - - public String toString(){ - return "("+start+", "+end+")"; - } + public String toString() { + return "(" + start + ", " + end + ")"; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Table.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Table.java index e2780a80..33d3aad4 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Table.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Table.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,152 +24,150 @@ * A Table has a Title and contains TableElements.<br> * This Class provides all needed functions simmilar to the other classes in * this package. 
- * */ -public class Table extends ContentContainer{ - - private final List<TableElement> tableElements; - private ContentElement title; - - public Table(){ - ccl = new ArrayList<>(); - tableElements = new ArrayList<>(); - } - - public String toString(){ - StringBuilder result = new StringBuilder(); - - result.append( "TB_TableElements: "+tableElements.size() ); - for( TableElement td: tableElements ) result.append( "\n"+td ); - - return result.toString(); - } - - public void addTableElement( TableElement te ){ - tableElements.add( te ); - ccl.add( te ); - } - - public void removeTableElement( TableElement te ){ - tableElements.remove( te ); - ccl.remove( te ); - } - - public TableElement getTableElement( int i ){ - return tableElements.get(i); - } - - public ContentElement getTitleElement( ){ - return this.title; - } - - public void setTitleElement( ContentElement title ){ - if( title != null ){ - if( this.title == null ) ccl.add( 0, title ); - else ccl.set( 0, title ); - } - else if( this.title != null ) ccl.remove( this.title ); - - this.title = title; - } - - public int nrOfTableElements(){ - return tableElements.size(); - } - - public List<Content> getContentList(){ - return new ArrayList<>(ccl); - } - - public int nrOfParagraphs(){ - int result = 0; - for( TableElement td: tableElements ) result+= td.nrOfParagraphs(); - return result; - } - - public Paragraph getParagraph(int i){ - int nr; - int offset = 0; - for( TableElement td: tableElements ){ - nr = td.nrOfParagraphs(); - if( nr+offset > i )return td.getParagraph(i-offset); - offset += nr; - } - return null; - } - - public List<Paragraph> getParagraphs(){ - List<Paragraph> result = new ArrayList<>(); - for( TableElement td: tableElements ) result.addAll( td.getParagraphs() ); - return result; - } - - public int nrOfTables(){ - int result = 0; - for( TableElement td: tableElements ) result+= td.nrOfTables(); - return result; - } - - public Table getTable(int i){ - int nr; - int offset = 0; - for( 
TableElement td: tableElements ){ - nr = td.nrOfTables(); - if( nr+offset > i )return td.getTable(i-offset); - offset += nr; - } - return null; - } - - public List<Table> getTables(){ - List<Table> result = new ArrayList<>(); - for( TableElement td: tableElements ) result.addAll( td.getTables() ); - return result; - } - - public int nrOfNestedLists(){ - int result = 0; - for( TableElement td: tableElements )result += td.nrOfNestedLists(); - return result; - } - - public NestedList getNestedList(int i){ - int nr; - int offset = 0; - for( TableElement td: tableElements ){ - nr = td.nrOfNestedLists(); - if( nr+offset > i )return td.getNestedList(i-offset); - offset += nr; - } - return null; - } - - public List<NestedList> getNestedLists(){ - List<NestedList> result = new ArrayList<>(); - for( TableElement td: tableElements ) result.addAll( td.getNestedLists() ); - return result; - } - - public int nrOfDefinitionLists(){ - int result = 0; - for( TableElement td: tableElements ) result+= td.nrOfDefinitionLists(); - return result; - } - - public DefinitionList getDefinitionList(int i){ - int nr; - int offset = 0; - for( TableElement td: tableElements ){ - nr = td.nrOfDefinitionLists(); - if( nr+offset > i )return td.getDefinitionList(i-offset); - offset += nr; - } - return null; - } - - public List<DefinitionList> getDefinitionLists(){ - List<DefinitionList> result = new ArrayList<>(); - for( TableElement td: tableElements ) result.addAll( td.getDefinitionLists() ); - return result; - } +public class Table extends ContentContainer { + + private final List<TableElement> tableElements; + private ContentElement title; + + public Table() { + ccl = new ArrayList<>(); + tableElements = new ArrayList<>(); + } + + public String toString() { + StringBuilder result = new StringBuilder(); + + result.append("TB_TableElements: " + tableElements.size()); + for (TableElement td : tableElements) result.append("\n" + td); + + return result.toString(); + } + + public void 
addTableElement(TableElement te) { + tableElements.add(te); + ccl.add(te); + } + + public void removeTableElement(TableElement te) { + tableElements.remove(te); + ccl.remove(te); + } + + public TableElement getTableElement(int i) { + return tableElements.get(i); + } + + public ContentElement getTitleElement() { + return this.title; + } + + public void setTitleElement(ContentElement title) { + if (title != null) { + if (this.title == null) ccl.add(0, title); + else ccl.set(0, title); + } else if (this.title != null) ccl.remove(this.title); + + this.title = title; + } + + public int nrOfTableElements() { + return tableElements.size(); + } + + public List<Content> getContentList() { + return new ArrayList<>(ccl); + } + + public int nrOfParagraphs() { + int result = 0; + for (TableElement td : tableElements) result += td.nrOfParagraphs(); + return result; + } + + public Paragraph getParagraph(int i) { + int nr; + int offset = 0; + for (TableElement td : tableElements) { + nr = td.nrOfParagraphs(); + if (nr + offset > i) return td.getParagraph(i - offset); + offset += nr; + } + return null; + } + + public List<Paragraph> getParagraphs() { + List<Paragraph> result = new ArrayList<>(); + for (TableElement td : tableElements) result.addAll(td.getParagraphs()); + return result; + } + + public int nrOfTables() { + int result = 0; + for (TableElement td : tableElements) result += td.nrOfTables(); + return result; + } + + public Table getTable(int i) { + int nr; + int offset = 0; + for (TableElement td : tableElements) { + nr = td.nrOfTables(); + if (nr + offset > i) return td.getTable(i - offset); + offset += nr; + } + return null; + } + + public List<Table> getTables() { + List<Table> result = new ArrayList<>(); + for (TableElement td : tableElements) result.addAll(td.getTables()); + return result; + } + + public int nrOfNestedLists() { + int result = 0; + for (TableElement td : tableElements) result += td.nrOfNestedLists(); + return result; + } + + public NestedList 
getNestedList(int i) { + int nr; + int offset = 0; + for (TableElement td : tableElements) { + nr = td.nrOfNestedLists(); + if (nr + offset > i) return td.getNestedList(i - offset); + offset += nr; + } + return null; + } + + public List<NestedList> getNestedLists() { + List<NestedList> result = new ArrayList<>(); + for (TableElement td : tableElements) result.addAll(td.getNestedLists()); + return result; + } + + public int nrOfDefinitionLists() { + int result = 0; + for (TableElement td : tableElements) result += td.nrOfDefinitionLists(); + return result; + } + + public DefinitionList getDefinitionList(int i) { + int nr; + int offset = 0; + for (TableElement td : tableElements) { + nr = td.nrOfDefinitionLists(); + if (nr + offset > i) return td.getDefinitionList(i - offset); + offset += nr; + } + return null; + } + + public List<DefinitionList> getDefinitionLists() { + List<DefinitionList> result = new ArrayList<>(); + for (TableElement td : tableElements) result.addAll(td.getDefinitionLists()); + return result; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/TableElement.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/TableElement.java index e767fc68..dafbb0ae 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/TableElement.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/TableElement.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,45 +25,100 @@ * This implementation is needed, because a Table in MediaWiki can contain neary * everything. */ -public class TableElement extends ContentContainer{ - - private final int col; - private final int row; - private final SectionContainer s; - - public TableElement( SectionContainer s, int row, int col ){ - this.ccl = s.ccl; - this.s = s; - this.row = row; - this.col = col; - } - - public int getCol(){ return col; } - public int getRow(){ return row; } - - public int nrOfSections(){ return s.nrOfSubSections(); } - public Section getSection(int i){ return s.getSubSection(i); } - public void removeSection( Section s ){ this.s.removeSection( s );} - public List<Section> getSubSections(){ return s.getSubSections(); } - - public List<Content> getContentList(){ return s.getContentList(); } - - public int nrOfParagraphs(){ return s.nrOfParagraphs(); } - public Paragraph getParagraph(int i){ return s.getParagraph(i); } - public List<Paragraph> getParagraphs(){ return s.getParagraphs(); } - public int nrOfTables(){ return s.nrOfTables(); } - public Table getTable(int i){ return s.getTable(i); } - public List<Table> getTables(){ return s.getTables(); } - public int nrOfNestedLists(){ return s.nrOfNestedLists(); } - public NestedList getNestedList(int i){ return s.getNestedList(i); } - public List<NestedListContainer> getNestedLists(){ return s.getNestedLists(); } - public int nrOfDefinitionLists(){ return s.nrOfDefinitionLists(); } - public DefinitionList getDefinitionList(int i){ return s.getDefinitionList(i); } - public List<DefinitionList> getDefinitionLists(){ return s.getDefinitionLists(); } - - public SectionContainer getSectionContainer(){ return s; } - - public String toString(){ - 
return "TABLE_DATA: \n"+ s; - } +public class TableElement extends ContentContainer { + + private final int col; + private final int row; + private final SectionContainer s; + + public TableElement(SectionContainer s, int row, int col) { + this.ccl = s.ccl; + this.s = s; + this.row = row; + this.col = col; + } + + public int getCol() { + return col; + } + + public int getRow() { + return row; + } + + public int nrOfSections() { + return s.nrOfSubSections(); + } + + public Section getSection(int i) { + return s.getSubSection(i); + } + + public void removeSection(Section s) { + this.s.removeSection(s); + } + + public List<Section> getSubSections() { + return s.getSubSections(); + } + + public List<Content> getContentList() { + return s.getContentList(); + } + + public int nrOfParagraphs() { + return s.nrOfParagraphs(); + } + + public Paragraph getParagraph(int i) { + return s.getParagraph(i); + } + + public List<Paragraph> getParagraphs() { + return s.getParagraphs(); + } + + public int nrOfTables() { + return s.nrOfTables(); + } + + public Table getTable(int i) { + return s.getTable(i); + } + + public List<Table> getTables() { + return s.getTables(); + } + + public int nrOfNestedLists() { + return s.nrOfNestedLists(); + } + + public NestedList getNestedList(int i) { + return s.getNestedList(i); + } + + public List<NestedListContainer> getNestedLists() { + return s.getNestedLists(); + } + + public int nrOfDefinitionLists() { + return s.nrOfDefinitionLists(); + } + + public DefinitionList getDefinitionList(int i) { + return s.getDefinitionList(i); + } + + public List<DefinitionList> getDefinitionLists() { + return s.getDefinitionLists(); + } + + public SectionContainer getSectionContainer() { + return s; + } + + public String toString() { + return "TABLE_DATA: \n" + s; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java index f2800127..97642ec4 100755 --- 
a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,57 +19,57 @@ import java.util.List; -public class Template extends ParsedPageObject{ - - private Span pos; - private String name; - private List<String> parameters; - - public Template(Span pos, String name, List<String> parameters) { - this.pos = pos; - this.name = name; - this.parameters = parameters; - } +public class Template extends ParsedPageObject { - public String getName() { - return name; - } + private Span pos; + private String name; + private List<String> parameters; - public void setName(String name) { - this.name = name; - } + public Template(Span pos, String name, List<String> parameters) { + this.pos = pos; + this.name = name; + this.parameters = parameters; + } - public List<String> getParameters() { - return parameters; - } + public String getName() { + return name; + } - public void setParameters(List<String> parameters) { - this.parameters = parameters; - } + public void setName(String name) { + this.name = name; + } - /** - * Returns the Position Span of this Template refering to the ContentElement - * in which the Template occures. 
This is mainly the same like Link.getPos(), - * but a Template does�n know it�s HomeElement. - */ - public Span getPos() { - return pos; - } + public List<String> getParameters() { + return parameters; + } - /** - * Look at getPos for Details... - */ - public void setPos(Span pos) { - this.pos = pos; - } - - public String toString(){ - StringBuilder result = new StringBuilder(); - result.append("TE_NAME: \""+name+"\""); - result.append("\nTE_PARAMETERS: "+parameters.size()); - for( String parameter: parameters) result.append("\nTE_PARAMETER: \""+ parameter +"\""); - result.append("\nTE_POS: "+ pos); - return result.toString(); - } + public void setParameters(List<String> parameters) { + this.parameters = parameters; + } + + /** + * Returns the Position Span of this Template referring to the ContentElement + * in which the Template occurs. This is mainly the same as Link.getPos(), + * but a Template doesn't know its HomeElement. + */ + public Span getPos() { + return pos; + } + + /** + * Look at getPos for Details... + */ + public void setPos(Span pos) { + this.pos = pos; + } + + public String toString() { + StringBuilder result = new StringBuilder(); + result.append("TE_NAME: \"" + name + "\""); + result.append("\nTE_PARAMETERS: " + parameters.size()); + for (String parameter : parameters) result.append("\nTE_PARAMETER: \"" + parameter + "\""); + result.append("\nTE_POS: " + pos); + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java index 79f04ecf..69460d14 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -54,29 +54,28 @@ * <br> * There is a ParsedPage.css for formatting the HTML Tags.<br> * Look at the {@code T7_HtmlFileDemo.java} in the 'tutorial' module for a better introduction. - * */ public class HtmlWriter { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - /** - * Generates HTML Output for a {@link ParsedPage}. - * - * @param pp The page that shall be parsed. - * @return A string containing the HTML rendering of the {@link ParsedPage}. - */ - public static String parsedPageToHtml( ParsedPage pp ){ - StringBuilder result = new StringBuilder(); - result.append(getHtmlHeader()); - - if( pp != null ) { - //Title - result.append( - "<table class=\"ParsedPage\">\n"+ - "<tr><th class=\"ParsedPage\">ParsedPage: \n" + - pp.getName()+ - "</th></tr>\n"); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + /** + * Generates HTML Output for a {@link ParsedPage}. + * + * @param pp The page that shall be parsed. + * @return A string containing the HTML rendering of the {@link ParsedPage}. 
+ */ + public static String parsedPageToHtml(ParsedPage pp) { + StringBuilder result = new StringBuilder(); + result.append(getHtmlHeader()); + + if (pp != null) { + //Title + result.append( + "<table class=\"ParsedPage\">\n" + + "<tr><th class=\"ParsedPage\">ParsedPage: \n" + + pp.getName() + + "</th></tr>\n"); // if( pp.aboutArticle()!=null ){ // result.append("<tr><td class=\"ParsedPage\">\n"); @@ -84,419 +83,412 @@ public static String parsedPageToHtml( ParsedPage pp ){ // result.append("</td></tr>\n"); // } - //Sections - result.append( - "<tr><td class=\"ParsedPage\">\n" ); - for( Section s: pp.getSections() ) { - result.append( sectionToHtml( s )); - } - result.append( - "</td></tr>\n"); - - //Categories - if( pp.getCategoryElement()!= null ){ - result.append("<tr><td class=\"ParsedPage\">\n"); - result.append("Categories:\n" + contentElementToHtml( pp.getCategoryElement() )); - result.append("</td></tr>\n"); - } - - //Languages - if( pp.getLanguagesElement()!= null ){ - result.append("<tr><td class=\"ParsedPage\">\n"); - result.append("Languages:\n" + contentElementToHtml( pp.getLanguagesElement() )); - result.append("</td></tr>\n"); - } - - //Finalize - result.append("</table>\n"); - } + //Sections + result.append( + "<tr><td class=\"ParsedPage\">\n"); + for (Section s : pp.getSections()) { + result.append(sectionToHtml(s)); + } + result.append( + "</td></tr>\n"); + + //Categories + if (pp.getCategoryElement() != null) { + result.append("<tr><td class=\"ParsedPage\">\n"); + result.append("Categories:\n" + contentElementToHtml(pp.getCategoryElement())); + result.append("</td></tr>\n"); + } + + //Languages + if (pp.getLanguagesElement() != null) { + result.append("<tr><td class=\"ParsedPage\">\n"); + result.append("Languages:\n" + contentElementToHtml(pp.getLanguagesElement())); + result.append("</td></tr>\n"); + } + + //Finalize + result.append("</table>\n"); + } - result.append(getHtmlFooter()); - - return result.toString(); - } - - /** - * @return Creates 
and returns the header of the HTML page - */ - private static String getHtmlHeader() { - StringBuilder header = new StringBuilder(); - header.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); - header.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">"); - header.append("<html>"); - header.append("<head>"); - header.append(getCSS()); + result.append(getHtmlFooter()); + + return result.toString(); + } + + /** + * @return Creates and returns the header of the HTML page + */ + private static String getHtmlHeader() { + StringBuilder header = new StringBuilder(); + header.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); + header.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">"); + header.append("<html>"); + header.append("<head>"); + header.append(getCSS()); // header.append(" <link href=\""+cssFileName+"\" type=\"text/css\" rel=\"stylesheet\"/>"); - header.append("</head>"); - header.append("<body>"); - - return header.toString(); - } - - /** - * @return Creates and returns the footer of the HTML page - */ - private static String getHtmlFooter() { - StringBuilder footer = new StringBuilder(); - footer.append("</body>"); - footer.append("</html>"); - - return footer.toString(); - } - - /** - * @return Creates and returns the CSS definitions of the HTML page - */ - private static String getCSS() { - StringBuilder css = new StringBuilder(); - css.append("<style>"); - css.append(ParsedPageCSS.getFileText()); - css.append("</style>"); - - return css.toString(); - } - - /** - * Generates HTML Output for a {@link SectionContainer} or {@link SectionContent}. 
- */ - private static String sectionToHtml( Section s ){ - - return "<table class=\"Section\">\n"+ - "<tr><th class=\"Section\">\n" + - - "<table class=\"SectionTh\"><tr>\n" + - "<th class=\"SectionTh\">\n" + - (s.getClass() == SectionContainer.class?"SectionStructure":"SectionContent")+":<br>\n"+ - "Level: "+s.getLevel()+"\n"+ - "</th><th class=\"SectionTh\">\n" + - (s.getTitleElement()!=null?contentElementToHtml( s.getTitleElement() ):"")+ - "</th>\n" + - "</tr></table>\n"+ - - "</th></tr>\n" + - "<tr><td class=\"Section\">\n"+ - sectionCCLToHtml( s )+ - "</td></tr>\n"+ - "</table>\n"; - } - - private static String sectionCCLToHtml( Section s ){ - StringBuilder result = new StringBuilder(); - - if( s.getClass() == SectionContainer.class ){ - for( Section ss: ((SectionContainer)s).getSubSections() ) { - result.append( sectionToHtml( ss )); - } - } - else{ - List<Content> ccl = s.getContentList(); - for( int i=(s.getTitleElement()!=null?1:0); i<ccl.size(); i++ ){ - Content c = ccl.get(i); - Class<? extends Content> cc = c.getClass(); - if( cc == Paragraph.class ) { - result.append( paragraphToHtml( (Paragraph)c ) ); - } - else if( cc == DefinitionList.class ) { - result.append( definitionListToHtml( (DefinitionList)c ) ); - } - else if( cc == NestedListContainer.class ) { - result.append( nestedListToHtml( (NestedList)c ) ); - } - else if( cc == Table.class ) { - result.append( tableToHtml( (Table)c ) ); - } - else { - result.append("\n<pre>UNKNOWN CLASS: "+cc+"\n"+ convertTags( c.toString() )+"</pre>\n"); - } - } - } - - return result.toString(); - } - - /** - * Generates HTML Output for a {@link Paragraph}. - */ - private static String paragraphToHtml( Paragraph p ){ - return contentElementToHtml( p, "Paragraph", "Paragraph: "+p.getType() ); - } - - /** - * Generates HTML Output for a {@link ContentElement}. 
- */ - private static String contentElementToHtml( ContentElement ce ){ - return contentElementToHtml( ce, "ContentElement", "ContentElement" ); - } - - private static String contentElementToHtml( ContentElement ce, String cssClass, String headline ){ - - StringBuilder result = new StringBuilder(); - - result.append( - "<table class=\""+cssClass+"\">\n" + - "<tr><th class=\""+cssClass+"\">" +headline+"</th></tr>\n"+ - "<tr><td class=\""+cssClass+"\">\n" + - "\"" + convertTags( ce.getText() )+ "\"\n" + - "</td></tr>\n" ); - - String BoldWords = ce.getText( ce.getFormatSpans( FormatType.BOLD )); - if( BoldWords.length() > 0 ) { - result.append("<tr><td class=\""+cssClass+"\">BoldWords: "+convertTags(BoldWords)+"</td></tr>\n"); - } - - String ItalicWords = ce.getText( ce.getFormatSpans( FormatType.ITALIC )); - if( ItalicWords.length() > 0 ) { - result.append("<tr><td class=\""+cssClass+"\">italicWords: "+convertTags(ItalicWords)+"</td></tr>\n"); - } - - if( ce.getFormatSpans( FormatType.MATH ).size() != 0 ){ - result.append("<tr><td class=\""+cssClass+"\">MathTags\n"); - for( Span s: ce.getFormatSpans( FormatType.MATH ) ) { - result.append( s.toString() +"\n"); - } - result.append("</td></tr>\n"); - } - - if( ce.getFormatSpans( FormatType.TAG ).size()!=0 ){ - result.append("<tr><td class=\""+cssClass+"\">Tags:\n"); - for( Span s: ce.getFormatSpans( FormatType.TAG ) ) { - result.append( s.toString() +"\n"); - } - result.append("</td></tr>\n"); - } - - if( ce.getLinks().size()!=0 ){ - result.append("<tr><td class=\""+cssClass+"\">\n"); - for( Link l: ce.getLinks() ) { - result.append( linkToHtml( l ) ); - } - result.append("</td></tr>\n"); - } - - if( ce.getTemplates().size()!=0 ){ - result.append("<tr><td class=\""+cssClass+"\">\n"); - for( Template t: ce.getTemplates() ) { - result.append( templateToHtml( t )); - } - result.append("</td></tr>\n"); - } - - result.append( "</table>\n" ); - - return result.toString(); - } - - /** - * Generates HTML Output for a {@link 
DefinitionList}. - */ - private static String definitionListToHtml( DefinitionList dl){ - if( dl == null ) { - return "null"; - } - - StringBuilder result = new StringBuilder(); - - result.append("<table class=\"DefinitionList\">\n" + - "<tr><th class=\"DefinitionList\">DefinitionList</th></tr>\n"+ - "<tr><td class=\"DefinitionList\">" ); - - if( dl.getDefinedTerm() != null ) { - result.append( contentElementToHtml( dl.getDefinedTerm() )+ "\n"); - } - - result.append("<ul>"); - for( ContentElement ce: dl.getDefinitions() ) { - result.append("<li>"+contentElementToHtml(ce)+"</li>" ); - } - - result.append("</ul>\n"); - result.append("</td></tr>\n" ); - result.append("</table>\n"); - - return result.toString(); - } - - /** - * Generates HTML Output for a {@link NestedList}. - */ - private static String nestedListToHtml( NestedList nl ){ - if( nl == null ) { - return "null"; - } - - StringBuilder result = new StringBuilder(); - - if( nl.getClass()==NestedListElement.class ){ - result.append( "<li>\n"+ contentElementToHtml( (NestedListElement)nl ) +"</li>\n" ); - } - else{ - result.append("<table class=\"NestedList\">\n" + - "<tr><th class=\"NestedList\">NestedList</th></tr>\n"+ - "<tr><td class=\"NestedList\">" ); - - result.append((((NestedListContainer)nl).isNumbered()?"<ol>":"<ul>")+"\n" ); - for( NestedList nl2 : ((NestedListContainer)nl).getNestedLists() ) { - result.append( nestedListToHtml( nl2 ) ); - } - result.append((((NestedListContainer)nl).isNumbered()?"</ol>":"</ul>")+"\n"); - - result.append("</td></tr>\n" ); - result.append("</table>\n"); - } - - return result.toString(); - } - - /** - * Generates HTML Output for a {@link Table}. 
- */ - private static String tableToHtml( Table t ){ - - if( t == null ) { - return "null"; - } - - StringBuilder result = new StringBuilder(); - - int colspan; - try{ - colspan = t.getTableElement( t.nrOfTableElements()-1 ).getCol()+1; - }catch( Exception e){ - colspan = 1; - } - - result.append("<table class=\"Table\">\n<tr><th colspan="+colspan+" class=\"Table\">Table"); - - if( t.getTitleElement()!=null ) { - result.append( contentElementToHtml( t.getTitleElement() ) ); - } - - result.append("</th></tr>\n<tr>\n"); - - int row = 0; - for( int i=0; i<t.nrOfTableElements(); i++ ){ - TableElement td = t.getTableElement(i); - if( td.getRow() > row ){ - result.append( "</tr><tr>\n"); - row = td.getRow(); - } - - result.append( "<td class=\"Table\">\n" + tableElementToHtml( td ) +"</td>\n" ); - } - - result.append("</tr>\n</table>\n"); - return result.toString(); - } - - /** - * Generates HTML Output for a {@link TableElement}. - */ - private static String tableElementToHtml( TableElement td ){ - StringBuilder result = new StringBuilder(); - - result.append("Row: "+td.getRow()+" Col: "+td.getCol()+"\n"); - - if( td.nrOfSections()==1 && td.getSection(0).getTitleElement()==null) { - result.append( sectionCCLToHtml( td.getSection(0) )); - } - else { - for( int i=0; i<td.nrOfSections(); i++) { - result.append( sectionToHtml(td.getSection(i) )); - } - } - - return result.toString(); - } - - /** - * Generates HTML Output for a {@link Link}. 
- */ - private static String linkToHtml( Link l ){ - if( l == null ) { - return "null"; - } - - StringBuilder result = new StringBuilder(); - - result.append("<div class=\"Link\"><b class=\"Link\">Link:</b>" + - l.getType() + ": \"" + - convertTags( l.getText() )+ "\" -> \"" + convertTags( l.getTarget() ) +"\""); - - if( l.getParameters().size() != 0 ){ - for( String parameter: l.getParameters() ) { - result.append("<br>\nPARAMETER: \""+convertTags( parameter )+"\""); - } - } - - result.append("</div>\n"); - - return result.toString(); - } - - /** - * Generates HTML Output for a {@link Template}. - */ - private static String templateToHtml( Template t){ - if( t == null ) { - return "null"; - } - - StringBuilder result = new StringBuilder(); - - result.append( - "<table class=\"Template\">\n" + - "<tr><th class=\"Template\">Template</th></tr>\n"+ - "<tr><td class=\"Template\">" + - "Name: \""+convertTags( t.getName() )+"\"<br>"+ - "</td></tr>\n"); - - if( t.getParameters().size() != 0 ){ - result.append("<tr><td class=\"Template\">"); - for( String parameter: t.getParameters() ) { - result.append("Parameter: \""+convertTags( parameter )+"\"<br>"); - } - result.append("</td></tr>\n"); - } - - result.append("</table>" ); - - return result.toString(); - } - - private static String convertTags( String s ){ - if( s==null ) { - return null; - } - - StringBuilder result = new StringBuilder( s ); - - int temp; - - temp = 0; - while( (temp=result.indexOf("<", temp))!=-1 ) { - result.replace(temp, temp+1, "<"); - } - - temp = 0; - while( (temp=result.indexOf(">", temp))!=-1 ) { - result.replace(temp, temp+1, ">"); - } - - return result.toString(); - } - - public static void writeFile(String filename, String encoding, String text) { - - File outFile = new File(filename); - try (Writer destFile = new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(outFile)), encoding))) { - destFile.write(text); - } catch (UnsupportedEncodingException e1) { - 
logger.error("Unsupported encoding exception while opening file '{}'", outFile.getAbsolutePath(), e1); - } catch (FileNotFoundException e1) { - logger.error("File '{}' not found.", outFile.getAbsolutePath(), e1); - } catch (IOException e) { - logger.error("IO exception while writing file '{}", outFile.getAbsolutePath(), e); + header.append("</head>"); + header.append("<body>"); + + return header.toString(); + } + + /** + * @return Creates and returns the footer of the HTML page + */ + private static String getHtmlFooter() { + StringBuilder footer = new StringBuilder(); + footer.append("</body>"); + footer.append("</html>"); + + return footer.toString(); + } + + /** + * @return Creates and returns the CSS definitions of the HTML page + */ + private static String getCSS() { + StringBuilder css = new StringBuilder(); + css.append("<style>"); + css.append(ParsedPageCSS.getFileText()); + css.append("</style>"); + + return css.toString(); + } + + /** + * Generates HTML Output for a {@link SectionContainer} or {@link SectionContent}. + */ + private static String sectionToHtml(Section s) { + + return "<table class=\"Section\">\n" + + "<tr><th class=\"Section\">\n" + + + "<table class=\"SectionTh\"><tr>\n" + + "<th class=\"SectionTh\">\n" + + (s.getClass() == SectionContainer.class ? "SectionStructure" : "SectionContent") + ":<br>\n" + + "Level: " + s.getLevel() + "\n" + + "</th><th class=\"SectionTh\">\n" + + (s.getTitleElement() != null ? 
contentElementToHtml(s.getTitleElement()) : "") + + "</th>\n" + + "</tr></table>\n" + + + "</th></tr>\n" + + "<tr><td class=\"Section\">\n" + + sectionCCLToHtml(s) + + "</td></tr>\n" + + "</table>\n"; + } + + private static String sectionCCLToHtml(Section s) { + StringBuilder result = new StringBuilder(); + + if (s.getClass() == SectionContainer.class) { + for (Section ss : ((SectionContainer) s).getSubSections()) { + result.append(sectionToHtml(ss)); + } + } else { + List<Content> ccl = s.getContentList(); + for (int i = (s.getTitleElement() != null ? 1 : 0); i < ccl.size(); i++) { + Content c = ccl.get(i); + Class<? extends Content> cc = c.getClass(); + if (cc == Paragraph.class) { + result.append(paragraphToHtml((Paragraph) c)); + } else if (cc == DefinitionList.class) { + result.append(definitionListToHtml((DefinitionList) c)); + } else if (cc == NestedListContainer.class) { + result.append(nestedListToHtml((NestedList) c)); + } else if (cc == Table.class) { + result.append(tableToHtml((Table) c)); + } else { + result.append("\n<pre>UNKNOWN CLASS: " + cc + "\n" + convertTags(c.toString()) + "</pre>\n"); } + } + } + + return result.toString(); + } + + /** + * Generates HTML Output for a {@link Paragraph}. + */ + private static String paragraphToHtml(Paragraph p) { + return contentElementToHtml(p, "Paragraph", "Paragraph: " + p.getType()); + } + + /** + * Generates HTML Output for a {@link ContentElement}. 
+ */ + private static String contentElementToHtml(ContentElement ce) { + return contentElementToHtml(ce, "ContentElement", "ContentElement"); + } + + private static String contentElementToHtml(ContentElement ce, String cssClass, String headline) { + + StringBuilder result = new StringBuilder(); + + result.append( + "<table class=\"" + cssClass + "\">\n" + + "<tr><th class=\"" + cssClass + "\">" + headline + "</th></tr>\n" + + "<tr><td class=\"" + cssClass + "\">\n" + + "\"" + convertTags(ce.getText()) + "\"\n" + + "</td></tr>\n"); + + String BoldWords = ce.getText(ce.getFormatSpans(FormatType.BOLD)); + if (BoldWords.length() > 0) { + result.append("<tr><td class=\"" + cssClass + "\">BoldWords: " + convertTags(BoldWords) + "</td></tr>\n"); + } + + String ItalicWords = ce.getText(ce.getFormatSpans(FormatType.ITALIC)); + if (ItalicWords.length() > 0) { + result.append("<tr><td class=\"" + cssClass + "\">italicWords: " + convertTags(ItalicWords) + "</td></tr>\n"); + } + + if (ce.getFormatSpans(FormatType.MATH).size() != 0) { + result.append("<tr><td class=\"" + cssClass + "\">MathTags\n"); + for (Span s : ce.getFormatSpans(FormatType.MATH)) { + result.append(s.toString() + "\n"); + } + result.append("</td></tr>\n"); + } + + if (ce.getFormatSpans(FormatType.TAG).size() != 0) { + result.append("<tr><td class=\"" + cssClass + "\">Tags:\n"); + for (Span s : ce.getFormatSpans(FormatType.TAG)) { + result.append(s.toString() + "\n"); + } + result.append("</td></tr>\n"); + } + + if (ce.getLinks().size() != 0) { + result.append("<tr><td class=\"" + cssClass + "\">\n"); + for (Link l : ce.getLinks()) { + result.append(linkToHtml(l)); + } + result.append("</td></tr>\n"); + } + + if (ce.getTemplates().size() != 0) { + result.append("<tr><td class=\"" + cssClass + "\">\n"); + for (Template t : ce.getTemplates()) { + result.append(templateToHtml(t)); + } + result.append("</td></tr>\n"); + } + + result.append("</table>\n"); + + return result.toString(); + } + + /** + * Generates HTML 
Output for a {@link DefinitionList}. + */ + private static String definitionListToHtml(DefinitionList dl) { + if (dl == null) { + return "null"; + } + + StringBuilder result = new StringBuilder(); + + result.append("<table class=\"DefinitionList\">\n" + + "<tr><th class=\"DefinitionList\">DefinitionList</th></tr>\n" + + "<tr><td class=\"DefinitionList\">"); + + if (dl.getDefinedTerm() != null) { + result.append(contentElementToHtml(dl.getDefinedTerm()) + "\n"); + } + + result.append("<ul>"); + for (ContentElement ce : dl.getDefinitions()) { + result.append("<li>" + contentElementToHtml(ce) + "</li>"); + } + + result.append("</ul>\n"); + result.append("</td></tr>\n"); + result.append("</table>\n"); + + return result.toString(); + } + + /** + * Generates HTML Output for a {@link NestedList}. + */ + private static String nestedListToHtml(NestedList nl) { + if (nl == null) { + return "null"; + } + + StringBuilder result = new StringBuilder(); + + if (nl.getClass() == NestedListElement.class) { + result.append("<li>\n" + contentElementToHtml((NestedListElement) nl) + "</li>\n"); + } else { + result.append("<table class=\"NestedList\">\n" + + "<tr><th class=\"NestedList\">NestedList</th></tr>\n" + + "<tr><td class=\"NestedList\">"); + + result.append((((NestedListContainer) nl).isNumbered() ? "<ol>" : "<ul>") + "\n"); + for (NestedList nl2 : ((NestedListContainer) nl).getNestedLists()) { + result.append(nestedListToHtml(nl2)); + } + result.append((((NestedListContainer) nl).isNumbered() ? "</ol>" : "</ul>") + "\n"); + + result.append("</td></tr>\n"); + result.append("</table>\n"); + } + + return result.toString(); + } + + /** + * Generates HTML Output for a {@link Table}. 
+ */ + private static String tableToHtml(Table t) { + + if (t == null) { + return "null"; + } + + StringBuilder result = new StringBuilder(); + + int colspan; + try { + colspan = t.getTableElement(t.nrOfTableElements() - 1).getCol() + 1; + } catch (Exception e) { + colspan = 1; + } + + result.append("<table class=\"Table\">\n<tr><th colspan=" + colspan + " class=\"Table\">Table"); + + if (t.getTitleElement() != null) { + result.append(contentElementToHtml(t.getTitleElement())); + } + + result.append("</th></tr>\n<tr>\n"); + + int row = 0; + for (int i = 0; i < t.nrOfTableElements(); i++) { + TableElement td = t.getTableElement(i); + if (td.getRow() > row) { + result.append("</tr><tr>\n"); + row = td.getRow(); + } + + result.append("<td class=\"Table\">\n" + tableElementToHtml(td) + "</td>\n"); + } + + result.append("</tr>\n</table>\n"); + return result.toString(); + } + + /** + * Generates HTML Output for a {@link TableElement}. + */ + private static String tableElementToHtml(TableElement td) { + StringBuilder result = new StringBuilder(); + + result.append("Row: " + td.getRow() + " Col: " + td.getCol() + "\n"); + + if (td.nrOfSections() == 1 && td.getSection(0).getTitleElement() == null) { + result.append(sectionCCLToHtml(td.getSection(0))); + } else { + for (int i = 0; i < td.nrOfSections(); i++) { + result.append(sectionToHtml(td.getSection(i))); + } + } + + return result.toString(); + } + + /** + * Generates HTML Output for a {@link Link}. 
+ */ + private static String linkToHtml(Link l) { + if (l == null) { + return "null"; + } + + StringBuilder result = new StringBuilder(); + + result.append("<div class=\"Link\"><b class=\"Link\">Link:</b>" + + l.getType() + ": \"" + + convertTags(l.getText()) + "\" -> \"" + convertTags(l.getTarget()) + "\""); + + if (l.getParameters().size() != 0) { + for (String parameter : l.getParameters()) { + result.append("<br>\nPARAMETER: \"" + convertTags(parameter) + "\""); + } + } + + result.append("</div>\n"); + + return result.toString(); + } + + /** + * Generates HTML Output for a {@link Template}. + */ + private static String templateToHtml(Template t) { + if (t == null) { + return "null"; + } + + StringBuilder result = new StringBuilder(); + + result.append( + "<table class=\"Template\">\n" + + "<tr><th class=\"Template\">Template</th></tr>\n" + + "<tr><td class=\"Template\">" + + "Name: \"" + convertTags(t.getName()) + "\"<br>" + + "</td></tr>\n"); + + if (t.getParameters().size() != 0) { + result.append("<tr><td class=\"Template\">"); + for (String parameter : t.getParameters()) { + result.append("Parameter: \"" + convertTags(parameter) + "\"<br>"); + } + result.append("</td></tr>\n"); + } + + result.append("</table>"); + + return result.toString(); + } + + private static String convertTags(String s) { + if (s == null) { + return null; + } + + StringBuilder result = new StringBuilder(s); + + int temp; + + temp = 0; + while ((temp = result.indexOf("<", temp)) != -1) { + result.replace(temp, temp + 1, "<"); + } + + temp = 0; + while ((temp = result.indexOf(">", temp)) != -1) { + result.replace(temp, temp + 1, ">"); + } + + return result.toString(); + } + + public static void writeFile(String filename, String encoding, String text) { + + File outFile = new File(filename); + try (Writer destFile = new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(outFile)), encoding))) { + destFile.write(text); + } catch 
(UnsupportedEncodingException e1) { + logger.error("Unsupported encoding exception while opening file '{}'", outFile.getAbsolutePath(), e1); + } catch (FileNotFoundException e1) { + logger.error("File '{}' not found.", outFile.getAbsolutePath(), e1); + } catch (IOException e) { + logger.error("IO exception while writing file '{}", outFile.getAbsolutePath(), e); } + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPage.css b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPage.css index d1b7ae7f..49f9ee9b 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPage.css +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPage.css @@ -16,87 +16,180 @@ * limitations under the License. */ -body -{ - font-size: 10pt; - font-family: Arial; -} - -table -{ - border-collapse: collapse; - border-spacing: 10px; - margin: 10px; - vertical-align: top; -} - -th{ - text-align: left; - border-width: 1px; - border-color: #000000; - border-style: solid; - - font-size: 10pt; - font-family: Arial; - font-weight: normal; - - padding: 10px; -} - -td{ - border-width: 1px; - border-color: #000000; - border-style: solid; - - font-size: 10pt; - font-family: monospace; - vertical-align: top; - - padding: 10px; -} - -table.ParsedPage{} -th.ParsedPage{ background-color: #FF8900; } -td.ParsedPage{ background-color: #FFD29E; } - -table.Section{ width: 100%; } -th.Section{ margin: 0px; padding: 0px; background-color: #FFFF00; } - table.SectionTh{ margin: 0px;} - th.SectionTh{ border-width: 0px; border-style:none; background-color: #FFFF00; vertical-align: middle; } -td.Section{ background-color: #EEEEEE; } - -table.Template{ margin: 2px; } -th.Template{ font-size: 7pt; padding: 1px; background-color: #99CCCC; } -td.Template{ padding: 5px; } - -table.Table{ margin: 2px; background-color: #EEEEEE; } -th.Table{ font-size: 7pt; padding: 1px; background-color: #FF0000; } -td.Table{ padding: 5px; background-color: 
#FFCCCC;} - - -b.Link{ color: #0000FF; } -div.Link{ - padding-left: 5px; - padding-right: 5px; - margin: 1px; - border-width: 1px; - border-color: #999999; - border-style: solid; - background-color: #EEEEEE; -} - -table.ContentElement{ margin: 2px; } -th.ContentElement{ font-size: 7pt; padding: 1px; background-color: #6699CC; } -td.ContentElement{ padding: 5px; background-color: #FFFFFF;} - -table.Paragraph{ margin: 2px; } -th.Paragraph{ font-size: 7pt; padding: 1px; background-color: #66CC00; } -td.Paragraph{ padding: 5px; background-color: #FFFFFF; } - -table.NestedList{ margin: 2px; } -th.NestedList{ font-size: 7pt; padding: 1px; background-color: #66CC00; } -td.NestedList{ padding: 5px; background-color: #CCFFCC; } - -table.DefinitionList{ margin: 2px; } -th.DefinitionList{ font-size: 7pt; padding: 1px; background-color: #66CC00; } -td.DefinitionList{ padding: 5px; background-color: #CCFFCC; } +body { + font-size: 10pt; + font-family: Arial; +} + +table { + border-collapse: collapse; + border-spacing: 10px; + margin: 10px; + vertical-align: top; +} + +th { + text-align: left; + border-width: 1px; + border-color: #000000; + border-style: solid; + + font-size: 10pt; + font-family: Arial; + font-weight: normal; + + padding: 10px; +} + +td { + border-width: 1px; + border-color: #000000; + border-style: solid; + + font-size: 10pt; + font-family: monospace; + vertical-align: top; + + padding: 10px; +} + +table.ParsedPage { +} + +th.ParsedPage { + background-color: #FF8900; +} + +td.ParsedPage { + background-color: #FFD29E; +} + +table.Section { + width: 100%; +} + +th.Section { + margin: 0px; + padding: 0px; + background-color: #FFFF00; +} + +table.SectionTh { + margin: 0px; +} + +th.SectionTh { + border-width: 0px; + border-style: none; + background-color: #FFFF00; + vertical-align: middle; +} + +td.Section { + background-color: #EEEEEE; +} + +table.Template { + margin: 2px; +} + +th.Template { + font-size: 7pt; + padding: 1px; + background-color: #99CCCC; +} + 
+td.Template { + padding: 5px; +} + +table.Table { + margin: 2px; + background-color: #EEEEEE; +} + +th.Table { + font-size: 7pt; + padding: 1px; + background-color: #FF0000; +} + +td.Table { + padding: 5px; + background-color: #FFCCCC; +} + + +b.Link { + color: #0000FF; +} + +div.Link { + padding-left: 5px; + padding-right: 5px; + margin: 1px; + border-width: 1px; + border-color: #999999; + border-style: solid; + background-color: #EEEEEE; +} + +table.ContentElement { + margin: 2px; +} + +th.ContentElement { + font-size: 7pt; + padding: 1px; + background-color: #6699CC; +} + +td.ContentElement { + padding: 5px; + background-color: #FFFFFF; +} + +table.Paragraph { + margin: 2px; +} + +th.Paragraph { + font-size: 7pt; + padding: 1px; + background-color: #66CC00; +} + +td.Paragraph { + padding: 5px; + background-color: #FFFFFF; +} + +table.NestedList { + margin: 2px; +} + +th.NestedList { + font-size: 7pt; + padding: 1px; + background-color: #66CC00; +} + +td.NestedList { + padding: 5px; + background-color: #CCFFCC; +} + +table.DefinitionList { + margin: 2px; +} + +th.DefinitionList { + font-size: 7pt; + padding: 1px; + background-color: #66CC00; +} + +td.DefinitionList { + padding: 5px; + background-color: #CCFFCC; +} diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPageCSS.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPageCSS.java index c18f57f7..55b621eb 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPageCSS.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPageCSS.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,98 +19,98 @@ public class ParsedPageCSS { - private static final String LF = "\n"; + private static final String LF = "\n"; - public static String getFileText() { - StringBuilder sb = new StringBuilder(); + public static String getFileText() { + StringBuilder sb = new StringBuilder(); - sb.append("body"); - sb.append("{"); - sb.append(" font-size: 10pt;"); - sb.append(" font-family: Arial;"); - sb.append("}"); - sb.append(LF); - sb.append("table"); - sb.append("{"); - sb.append(" border-collapse: collapse;"); - sb.append(" border-spacing: 10px;"); - sb.append(" margin: 10px;"); - sb.append(" vertical-align: top;"); - sb.append("}"); - sb.append(LF); - sb.append("th{"); - sb.append(" text-align: left;"); - sb.append(" border-width: 1px;"); - sb.append(" border-color: #000000;"); - sb.append(" border-style: solid;"); - sb.append(LF); - sb.append(" font-size: 10pt;"); - sb.append(" font-family: Arial;"); - sb.append(" font-weight: normal;"); - sb.append(" "); - sb.append(" padding: 10px;"); - sb.append("}"); - sb.append(LF); - sb.append("td{"); - sb.append(" border-width: 1px;"); - sb.append(" border-color: #000000;"); - sb.append(" border-style: solid;"); - sb.append(" "); - sb.append(" font-size: 10pt;"); - sb.append(" font-family: monospace;"); - sb.append(" vertical-align: top;"); - sb.append(" "); - sb.append(" padding: 10px;"); - sb.append("}"); - sb.append(LF); - sb.append("table.ParsedPage{}"); - sb.append("th.ParsedPage{ background-color: #FF8900; }"); - sb.append("td.ParsedPage{ background-color: 
#FFD29E; }"); - sb.append(LF); - sb.append("table.Section{ width: 100%; }"); - sb.append("th.Section{ margin: 0px; padding: 0px; background-color: #FFFF00; }"); - sb.append(" table.SectionTh{ margin: 0px;}"); - sb.append(" th.SectionTh{ border-width: 0px; border-style:none; background-color: #FFFF00; vertical-align: middle; }"); - sb.append("td.Section{ background-color: #EEEEEE; }"); - sb.append(LF); - sb.append("table.Template{ margin: 2px; }"); - sb.append("th.Template{ font-size: 7pt; padding: 1px; background-color: #99CCCC; }"); - sb.append("td.Template{ padding: 5px; }"); - sb.append(""); - sb.append("table.Table{ margin: 2px; background-color: #EEEEEE; }"); - sb.append("th.Table{ font-size: 7pt; padding: 1px; background-color: #FF0000; }"); - sb.append("td.Table{ padding: 5px; background-color: #FFCCCC;}"); - sb.append(LF); - sb.append(LF); - sb.append("b.Link{ color: #0000FF; }"); - sb.append("div.Link{"); - sb.append(" padding-left: 5px;"); - sb.append(" padding-right: 5px;"); - sb.append(" margin: 1px;"); - sb.append(" border-width: 1px;"); - sb.append(" border-color: #999999;"); - sb.append(" border-style: solid; "); - sb.append(" background-color: #EEEEEE;"); - sb.append("}"); - sb.append(LF); - sb.append("table.ContentElement{ margin: 2px; }"); - sb.append("th.ContentElement{ font-size: 7pt; padding: 1px; background-color: #6699CC; }"); - sb.append("td.ContentElement{ padding: 5px; background-color: #FFFFFF;}"); - sb.append(LF); - sb.append("table.Paragraph{ margin: 2px; }"); - sb.append("th.Paragraph{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); - sb.append("td.Paragraph{ padding: 5px; background-color: #FFFFFF; }"); - sb.append(LF); - sb.append("table.NestedList{ margin: 2px; }"); - sb.append("th.NestedList{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); - sb.append("td.NestedList{ padding: 5px; background-color: #CCFFCC; }"); - sb.append(LF); - sb.append("table.DefinitionList{ margin: 2px; }"); - 
sb.append("th.DefinitionList{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); - sb.append("td.DefinitionList{ padding: 5px; background-color: #CCFFCC; }"); - sb.append(LF); + sb.append("body"); + sb.append("{"); + sb.append(" font-size: 10pt;"); + sb.append(" font-family: Arial;"); + sb.append("}"); + sb.append(LF); + sb.append("table"); + sb.append("{"); + sb.append(" border-collapse: collapse;"); + sb.append(" border-spacing: 10px;"); + sb.append(" margin: 10px;"); + sb.append(" vertical-align: top;"); + sb.append("}"); + sb.append(LF); + sb.append("th{"); + sb.append(" text-align: left;"); + sb.append(" border-width: 1px;"); + sb.append(" border-color: #000000;"); + sb.append(" border-style: solid;"); + sb.append(LF); + sb.append(" font-size: 10pt;"); + sb.append(" font-family: Arial;"); + sb.append(" font-weight: normal;"); + sb.append(" "); + sb.append(" padding: 10px;"); + sb.append("}"); + sb.append(LF); + sb.append("td{"); + sb.append(" border-width: 1px;"); + sb.append(" border-color: #000000;"); + sb.append(" border-style: solid;"); + sb.append(" "); + sb.append(" font-size: 10pt;"); + sb.append(" font-family: monospace;"); + sb.append(" vertical-align: top;"); + sb.append(" "); + sb.append(" padding: 10px;"); + sb.append("}"); + sb.append(LF); + sb.append("table.ParsedPage{}"); + sb.append("th.ParsedPage{ background-color: #FF8900; }"); + sb.append("td.ParsedPage{ background-color: #FFD29E; }"); + sb.append(LF); + sb.append("table.Section{ width: 100%; }"); + sb.append("th.Section{ margin: 0px; padding: 0px; background-color: #FFFF00; }"); + sb.append(" table.SectionTh{ margin: 0px;}"); + sb.append(" th.SectionTh{ border-width: 0px; border-style:none; background-color: #FFFF00; vertical-align: middle; }"); + sb.append("td.Section{ background-color: #EEEEEE; }"); + sb.append(LF); + sb.append("table.Template{ margin: 2px; }"); + sb.append("th.Template{ font-size: 7pt; padding: 1px; background-color: #99CCCC; }"); + sb.append("td.Template{ 
padding: 5px; }"); + sb.append(""); + sb.append("table.Table{ margin: 2px; background-color: #EEEEEE; }"); + sb.append("th.Table{ font-size: 7pt; padding: 1px; background-color: #FF0000; }"); + sb.append("td.Table{ padding: 5px; background-color: #FFCCCC;}"); + sb.append(LF); + sb.append(LF); + sb.append("b.Link{ color: #0000FF; }"); + sb.append("div.Link{"); + sb.append(" padding-left: 5px;"); + sb.append(" padding-right: 5px;"); + sb.append(" margin: 1px;"); + sb.append(" border-width: 1px;"); + sb.append(" border-color: #999999;"); + sb.append(" border-style: solid; "); + sb.append(" background-color: #EEEEEE;"); + sb.append("}"); + sb.append(LF); + sb.append("table.ContentElement{ margin: 2px; }"); + sb.append("th.ContentElement{ font-size: 7pt; padding: 1px; background-color: #6699CC; }"); + sb.append("td.ContentElement{ padding: 5px; background-color: #FFFFFF;}"); + sb.append(LF); + sb.append("table.Paragraph{ margin: 2px; }"); + sb.append("th.Paragraph{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); + sb.append("td.Paragraph{ padding: 5px; background-color: #FFFFFF; }"); + sb.append(LF); + sb.append("table.NestedList{ margin: 2px; }"); + sb.append("th.NestedList{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); + sb.append("td.NestedList{ padding: 5px; background-color: #CCFFCC; }"); + sb.append(LF); + sb.append("table.DefinitionList{ margin: 2px; }"); + sb.append("th.DefinitionList{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); + sb.append("td.DefinitionList{ padding: 5px; background-color: #CCFFCC; }"); + sb.append(LF); - return sb.toString(); - } + return sb.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/EmptyStructureRemover.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/EmptyStructureRemover.java index e2464233..4ecb30fa 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/EmptyStructureRemover.java +++ 
b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/EmptyStructureRemover.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -33,127 +33,125 @@ * It is possible that some Elements which has been parsed are empty after * the Parsing process becaus of the options which has been set. This class * can remove these empty elmentens. - * */ class EmptyStructureRemover { - - /** - * Removes all empty Structures from a SectionContainer and all substructures. 
- */ - public static SectionContainer eliminateEmptyStructures( SectionContainer sc ){ - - for( int i=sc.nrOfSubSections()-1; i>=0; i-- ){ - Section ss = sc.getSubSection( i ); - - if( ss.getClass() == SectionContainer.class ){ - SectionContainer sci = (SectionContainer)ss; - eliminateEmptyStructures( sci ); - } - else if( ss.getClass() == SectionContent.class ) - eliminateEmptyStructures( (SectionContent)ss ); - - if( ss.empty() ) sc.removeSection(ss); - } - - //encapsulating Sections - if( sc.nrOfSubSections()==1 && sc.getSubSection(0).getClass()==SectionContainer.class ){ - SectionContainer sc0 = (SectionContainer)sc.getSubSection( 0 ); - if( sc0.getTitleElement()==null ){ - sc.removeSection( sc0 ); - for( int i=0; i<sc0.nrOfSubSections(); i++) - sc.addSection( sc0.getSubSection(i) ); - } - } - - return sc; - } - - /** - * Removes all empty Structures from a SectionContent and all substructures. - */ - public static SectionContent eliminateEmptyStructures( SectionContent sc ){ - - for( int i=sc.nrOfParagraphs()-1; i>=0; i-- ){ - Paragraph p = sc.getParagraph(i); - if( p.empty() ) sc.removeParagraph( p ); - } - - for( int i=sc.nrOfDefinitionLists()-1; i>=0; i--){ - DefinitionList dl = sc.getDefinitionList(i); - eliminateEmptyStructures( dl ); - if( dl.empty() ) sc.removeDefinitionList( dl ); - } - - for( int i=sc.nrOfNestedLists()-1; i>=0; i--){ - NestedListContainer nl = sc.getNestedList(i); - eliminateEmptyStructures( nl ); - if( nl.empty() ) sc.removeNestedList( nl ); - } - - for( int i=sc.nrOfTables()-1; i>=0; i--){ - Table t = sc.getTable(i); - eliminateEmptyStructures( t ); - if( t.empty() ) sc.removeTable( t ); - } - - return sc; - } - - /** - * Removes all empty Structures from a NestedListContainer and all substructures. 
- */ - public static NestedListContainer eliminateEmptyStructures( NestedListContainer nlc ){ - for(int i=nlc.size()-1; i>=0; i--){ - NestedList nl = nlc.getNestedList(i); - if( nl.getClass()==NestedListContainer.class ) - eliminateEmptyStructures( (NestedListContainer)nl ); - - if( nl.empty() )nlc.remove( nl ); - } - return nlc; - } - - /** - * Removes all empty Structures from a Table and all substructures. - */ - public static Table eliminateEmptyStructures( Table t ){ - for( int i=t.nrOfTableElements()-1; i>=0; i-- ){ - TableElement te = t.getTableElement(i); - eliminateEmptyStructures( te ); - if( te.empty() ) t.removeTableElement( te ); - } - return t; - } - - /** - * Removes all empty Structures from a TableElement and all substructures. - */ - public static TableElement eliminateEmptyStructures( TableElement te ){ - for( int i=te.nrOfSections()-1; i>=0; i--){ - Section s = te.getSection(i); - - if( s.getClass() == SectionContainer.class ) - eliminateEmptyStructures( (SectionContainer)s ); - else if( s.getClass() == SectionContent.class ) - eliminateEmptyStructures( (SectionContent)s ); - - if( s.empty() ) te.removeSection( s ); - } - return te; - } - - /** - * Removes all empty Structures from a DefinitionList and all substructures. - */ - public static DefinitionList eliminateEmptyStructures( DefinitionList dl ){ - - ContentElement dt = dl.getDefinedTerm(); - if( dt!=null && dt.empty() ) dl.setDefinedTerm( null ); - - for(int i=dl.nrOfDefinitions()-1; i>=0; i-- ){ - ContentElement ce = dl.getDefinition(i); - if( ce.empty() )dl.removeDefinition( ce ); - } - return dl; - } + + /** + * Removes all empty Structures from a SectionContainer and all substructures. 
+ */ + public static SectionContainer eliminateEmptyStructures(SectionContainer sc) { + + for (int i = sc.nrOfSubSections() - 1; i >= 0; i--) { + Section ss = sc.getSubSection(i); + + if (ss.getClass() == SectionContainer.class) { + SectionContainer sci = (SectionContainer) ss; + eliminateEmptyStructures(sci); + } else if (ss.getClass() == SectionContent.class) + eliminateEmptyStructures((SectionContent) ss); + + if (ss.empty()) sc.removeSection(ss); + } + + //encapsulating Sections + if (sc.nrOfSubSections() == 1 && sc.getSubSection(0).getClass() == SectionContainer.class) { + SectionContainer sc0 = (SectionContainer) sc.getSubSection(0); + if (sc0.getTitleElement() == null) { + sc.removeSection(sc0); + for (int i = 0; i < sc0.nrOfSubSections(); i++) + sc.addSection(sc0.getSubSection(i)); + } + } + + return sc; + } + + /** + * Removes all empty Structures from a SectionContent and all substructures. + */ + public static SectionContent eliminateEmptyStructures(SectionContent sc) { + + for (int i = sc.nrOfParagraphs() - 1; i >= 0; i--) { + Paragraph p = sc.getParagraph(i); + if (p.empty()) sc.removeParagraph(p); + } + + for (int i = sc.nrOfDefinitionLists() - 1; i >= 0; i--) { + DefinitionList dl = sc.getDefinitionList(i); + eliminateEmptyStructures(dl); + if (dl.empty()) sc.removeDefinitionList(dl); + } + + for (int i = sc.nrOfNestedLists() - 1; i >= 0; i--) { + NestedListContainer nl = sc.getNestedList(i); + eliminateEmptyStructures(nl); + if (nl.empty()) sc.removeNestedList(nl); + } + + for (int i = sc.nrOfTables() - 1; i >= 0; i--) { + Table t = sc.getTable(i); + eliminateEmptyStructures(t); + if (t.empty()) sc.removeTable(t); + } + + return sc; + } + + /** + * Removes all empty Structures from a NestedListContainer and all substructures. 
+ */ + public static NestedListContainer eliminateEmptyStructures(NestedListContainer nlc) { + for (int i = nlc.size() - 1; i >= 0; i--) { + NestedList nl = nlc.getNestedList(i); + if (nl.getClass() == NestedListContainer.class) + eliminateEmptyStructures((NestedListContainer) nl); + + if (nl.empty()) nlc.remove(nl); + } + return nlc; + } + + /** + * Removes all empty Structures from a Table and all substructures. + */ + public static Table eliminateEmptyStructures(Table t) { + for (int i = t.nrOfTableElements() - 1; i >= 0; i--) { + TableElement te = t.getTableElement(i); + eliminateEmptyStructures(te); + if (te.empty()) t.removeTableElement(te); + } + return t; + } + + /** + * Removes all empty Structures from a TableElement and all substructures. + */ + public static TableElement eliminateEmptyStructures(TableElement te) { + for (int i = te.nrOfSections() - 1; i >= 0; i--) { + Section s = te.getSection(i); + + if (s.getClass() == SectionContainer.class) + eliminateEmptyStructures((SectionContainer) s); + else if (s.getClass() == SectionContent.class) + eliminateEmptyStructures((SectionContent) s); + + if (s.empty()) te.removeSection(s); + } + return te; + } + + /** + * Removes all empty Structures from a DefinitionList and all substructures. 
+ */ + public static DefinitionList eliminateEmptyStructures(DefinitionList dl) { + + ContentElement dt = dl.getDefinedTerm(); + if (dt != null && dt.empty()) dl.setDefinedTerm(null); + + for (int i = dl.nrOfDefinitions() - 1; i >= 0; i--) { + ContentElement ce = dl.getDefinition(i); + if (ce.empty()) dl.removeDefinition(ce); + } + return dl; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/FlushTemplates.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/FlushTemplates.java index 466715ab..95208060 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/FlushTemplates.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/FlushTemplates.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,19 +22,18 @@ /** * This TemplateParser will delete ALL templates, whitout any exception! 
- * */ public final class FlushTemplates implements MediaWikiTemplateParser { - public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) { - ResolvedTemplate result = new ResolvedTemplate( t ); - result.setPreParseReplacement( ResolvedTemplate.TEMPLATESPACER ); - result.setPostParseReplacement( "" ); - result.setParsedObject( null ); - return result; - } - - public String configurationInfo(){ - return "All Templates will be Deleted"; - } + public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) { + ResolvedTemplate result = new ResolvedTemplate(t); + result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER); + result.setPostParseReplacement(""); + result.setParsedObject(null); + return result; + } + + public String configurationInfo() { + return "All Templates will be Deleted"; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/GermanTemplateParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/GermanTemplateParser.java index ceb30f01..df112b3f 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/GermanTemplateParser.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/GermanTemplateParser.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,116 +29,112 @@ /** * This is the TemplateParser for the german language, with special treatment * for all the german templates, like "Dieser Artikel" or "Deutschlandlastig". - * */ public class GermanTemplateParser implements MediaWikiTemplateParser { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private final String templatePrefix = "TEMPLATE["; - private final String templatePostfix = "]"; - private final String parameterDivisor = ", "; - private final String templateNotImplementedPrefix = "TEMPLATE NOT IMPLEMENTED["; - private final String templateNotImplementedPostfix = "]"; - private final String emptyLinkText = "[ ]"; + private final String templatePrefix = "TEMPLATE["; + private final String templatePostfix = "]"; + private final String parameterDivisor = ", "; + private final String templateNotImplementedPrefix = "TEMPLATE NOT IMPLEMENTED["; + private final String templateNotImplementedPostfix = "]"; + private final String emptyLinkText = "[ ]"; -// private MediaWikiContentElementParser parser; - private final List<String> deleteTemplates; - private final List<String> parseTemplates; + // private MediaWikiContentElementParser parser; + private final List<String> deleteTemplates; + private final List<String> parseTemplates; - public GermanTemplateParser(MediaWikiContentElementParser parser, List<String> deleteTemplates, List<String> parseTemplates){ - this.deleteTemplates = deleteTemplates; - this.parseTemplates = parseTemplates; + public GermanTemplateParser(MediaWikiContentElementParser parser, List<String> deleteTemplates, 
List<String> parseTemplates) { + this.deleteTemplates = deleteTemplates; + this.parseTemplates = parseTemplates; // this.parser = parser; - } - - public String configurationInfo(){ - StringBuilder result = new StringBuilder(); - result.append("Standard Template treatment: ShowNameAndParameters"); - result.append("\nDelete Templates: "); - for( String s: deleteTemplates ) { - result.append( "\""+s+"\" "); - } - result.append("\nParse Templates: "); - for( String s: parseTemplates ) { - result.append( "\""+s+"\" "); - } - return result.toString(); - } - - public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) { - - final String templateName = t.getName(); - - //Show Name and Parameters as Standart treatment. - ResolvedTemplate result = new ResolvedTemplate( t ); - result.setPreParseReplacement( ResolvedTemplate.TEMPLATESPACER ); - StringBuilder sb = new StringBuilder(); - sb.append(templatePrefix); - sb.append( t.getName() + parameterDivisor ); - for( String s: t.getParameters()) { - sb.append( s + parameterDivisor ); - } - sb.delete( sb.length() - parameterDivisor.length(), sb.length() ); - sb.append(templatePostfix); - result.setPostParseReplacement( sb.toString() ); - - result.setParsedObject( t ); - - //Delete Template if it is in the List - for( String s: deleteTemplates ){ - if( s.equals(templateName) ){ - result.setPostParseReplacement( "" ); - result.setParsedObject( null ); - return result; - } - } - - //Parse Template if it is in the List - for( String s: parseTemplates ){ - List<String> templateParameters = t.getParameters(); - - if( s.equals(templateName)){ - logger.info("ParseTemplate: {}", templateName); - if( templateName.equals("Dieser Artikel")){ + } + + public String configurationInfo() { + StringBuilder result = new StringBuilder(); + result.append("Standard Template treatment: ShowNameAndParameters"); + result.append("\nDelete Templates: "); + for (String s : deleteTemplates) { + result.append("\"" + s + "\" "); + } + 
result.append("\nParse Templates: "); + for (String s : parseTemplates) { + result.append("\"" + s + "\" "); + } + return result.toString(); + } + + public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) { + + final String templateName = t.getName(); + + //Show Name and Parameters as Standart treatment. + ResolvedTemplate result = new ResolvedTemplate(t); + result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER); + StringBuilder sb = new StringBuilder(); + sb.append(templatePrefix); + sb.append(t.getName() + parameterDivisor); + for (String s : t.getParameters()) { + sb.append(s + parameterDivisor); + } + sb.delete(sb.length() - parameterDivisor.length(), sb.length()); + sb.append(templatePostfix); + result.setPostParseReplacement(sb.toString()); + + result.setParsedObject(t); + + //Delete Template if it is in the List + for (String s : deleteTemplates) { + if (s.equals(templateName)) { + result.setPostParseReplacement(""); + result.setParsedObject(null); + return result; + } + } + + //Parse Template if it is in the List + for (String s : parseTemplates) { + List<String> templateParameters = t.getParameters(); + + if (s.equals(templateName)) { + logger.info("ParseTemplate: {}", templateName); + if (templateName.equals("Dieser Artikel")) { // I removed that from the core API, as it is not likely to be present in most non-German articles. 
(TZ) // pp.setAboutArticle( parser.parseContentElement( templateParameters.get(0) )); - result.setPostParseReplacement(""); - result.setParsedObject( null ); - return result; - } - else if( templateName.equals("Audio") || templateName.equals("Audio genau")){ - if( templateParameters.size() == 0 ) { - break; - } - if( templateParameters.size() == 1 ) { - templateParameters.add( emptyLinkText ); - } - result.setPostParseReplacement( t.getParameters().get(1) ); - result.setParsedObject( new Link(null, t.getPos() , templateParameters.get(0), Link.type.AUDIO, null ) ); - - return result; - } - else if( templateName.equals("Video")){ - if( templateParameters.size() == 0 ) { - break; - } - if( templateParameters.size() == 1 ) { - templateParameters.add( emptyLinkText ); - } - result.setPostParseReplacement(t.getParameters().get(1)); - result.setParsedObject( new Link(null, t.getPos(), t.getParameters().get(0), Link.type.VIDEO, null ) ); - return result; - } - else{ - result.setPostParseReplacement( templateNotImplementedPrefix+ templateName + templateNotImplementedPostfix ); - return result; - } - } - } - - return result; - } + result.setPostParseReplacement(""); + result.setParsedObject(null); + return result; + } else if (templateName.equals("Audio") || templateName.equals("Audio genau")) { + if (templateParameters.size() == 0) { + break; + } + if (templateParameters.size() == 1) { + templateParameters.add(emptyLinkText); + } + result.setPostParseReplacement(t.getParameters().get(1)); + result.setParsedObject(new Link(null, t.getPos(), templateParameters.get(0), Link.type.AUDIO, null)); + + return result; + } else if (templateName.equals("Video")) { + if (templateParameters.size() == 0) { + break; + } + if (templateParameters.size() == 1) { + templateParameters.add(emptyLinkText); + } + result.setPostParseReplacement(t.getParameters().get(1)); + result.setParsedObject(new Link(null, t.getPos(), t.getParameters().get(0), Link.type.VIDEO, null)); + return result; + } else 
{ + result.setPostParseReplacement(templateNotImplementedPrefix + templateName + templateNotImplementedPostfix); + return result; + } + } + } + + return result; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiContentElementParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiContentElementParser.java index 792127fa..38d97715 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiContentElementParser.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiContentElementParser.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,11 +22,10 @@ /** * This Interface makes it possible to parse a single content element. * Some TemplateParses might uses this Feauture. - * */ interface MediaWikiContentElementParser { - /** - * Parses a ContentElement from a String. - */ - ContentElement parseContentElement( String src ); + /** + * Parses a ContentElement from a String. 
+ */ + ContentElement parseContentElement(String src); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParser.java index 19a5306a..6b1643f8 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParser.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParser.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,22 +22,21 @@ /** * This is an Interface for MediaWiki Parsers. Which simply "converts" * MediaWiki Source, given as a String, to a ParsedPage - * */ public interface MediaWikiParser { - /** - * Parses MediaWiki Source, given as parameter src, and returns a ParsedPage. - */ + /** + * Parses MediaWiki Source, given as parameter src, and returns a ParsedPage. + */ ParsedPage parse(String src); - - /** - * Retruns information abour the configuration of the parser. - */ + + /** + * Retruns information abour the configuration of the parser. 
+ */ String configurationInfo(); - - /** - * Retruns the String which is uses as line separator, usually it - * will be "\n" or "\r\n" - */ + + /** + * Retruns the String which is uses as line separator, usually it + * will be "\n" or "\r\n" + */ String getLineSeparator(); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParserFactory.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParserFactory.java index 54a85b9c..f0663fb3 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParserFactory.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParserFactory.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -30,430 +30,596 @@ */ public class MediaWikiParserFactory { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private Class parserClass; - private Class templateParserClass; - private String lineSeparator; - private List<String> deleteTemplates; - private List<String> parseTemplates; - private List<String> categoryIdentifers; - private List<String> languageIdentifers; - private List<String> imageIdentifers; - private boolean showImageText; - private boolean deleteTags; - private boolean showMathTagContent; - private boolean calculateSrcSpans; - - /** - * Creates a new un-configured {@link MediaWikiParserFactory}. - */ - public MediaWikiParserFactory(){ - initVariables(); - } - - /** - * Creates a fully configured {@link MediaWikiParserFactory} for the specified {@link Language}.<br> - * Next step is {@link MediaWikiParserFactory#createParser()}. - */ - public MediaWikiParserFactory(Language language){ - initVariables(); - if (language.equals(Language.german)) { - initGermanVariables(); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private Class parserClass; + private Class templateParserClass; + private String lineSeparator; + private List<String> deleteTemplates; + private List<String> parseTemplates; + private List<String> categoryIdentifers; + private List<String> languageIdentifers; + private List<String> imageIdentifers; + private boolean showImageText; + private boolean deleteTags; + private boolean showMathTagContent; + private boolean calculateSrcSpans; + + /** + * Creates a new un-configured {@link MediaWikiParserFactory}. + */ + public MediaWikiParserFactory() { + initVariables(); + } + + /** + * Creates a fully configured {@link MediaWikiParserFactory} for the specified {@link Language}.<br> + * Next step is {@link MediaWikiParserFactory#createParser()}. 
+ */ + public MediaWikiParserFactory(Language language) { + initVariables(); + if (language.equals(Language.german)) { + initGermanVariables(); + } else if (language.equals(Language.english)) { + initEnglishVariables(); + } else { + logger.warn("No language specific parser for '{}' available. Using default values.", language); + } + } + + private void initVariables() { + lineSeparator = "LF"; + parserClass = ModularParser.class; + imageIdentifers = new ArrayList<>(); + categoryIdentifers = new ArrayList<>(); + languageIdentifers = new ArrayList<>(); + deleteTemplates = new ArrayList<>(); + parseTemplates = new ArrayList<>(); + showImageText = false; + deleteTags = true; + showMathTagContent = true; + calculateSrcSpans = false; + templateParserClass = ShowTemplateNamesAndParameters.class; + + initLanguages(); + } + + private void initLanguages() { + //Init the Languages... + languageIdentifers.add("aa"); + languageIdentifers.add("ab"); + languageIdentifers.add("af"); + languageIdentifers.add("am"); + languageIdentifers.add("an"); + languageIdentifers.add("ar"); + languageIdentifers.add("as"); + languageIdentifers.add("av"); + languageIdentifers.add("ay"); + languageIdentifers.add("az"); + + languageIdentifers.add("ba"); + languageIdentifers.add("be"); + languageIdentifers.add("bg"); + languageIdentifers.add("bh"); + languageIdentifers.add("bi"); + languageIdentifers.add("bm"); + languageIdentifers.add("bn"); + languageIdentifers.add("bo"); + languageIdentifers.add("br"); + languageIdentifers.add("bs"); + + languageIdentifers.add("ca"); + languageIdentifers.add("ce"); + languageIdentifers.add("ch"); + languageIdentifers.add("co"); + languageIdentifers.add("cr"); + languageIdentifers.add("cs"); + languageIdentifers.add("cv"); + languageIdentifers.add("cy"); + + languageIdentifers.add("da"); + languageIdentifers.add("de"); + languageIdentifers.add("dk"); + languageIdentifers.add("dv"); + languageIdentifers.add("dz"); + + languageIdentifers.add("ee"); + 
languageIdentifers.add("el"); + languageIdentifers.add("en"); + languageIdentifers.add("eo"); + languageIdentifers.add("es"); + languageIdentifers.add("et"); + languageIdentifers.add("eu"); + + languageIdentifers.add("fa"); + languageIdentifers.add("ff"); + languageIdentifers.add("fi"); + languageIdentifers.add("fj"); + languageIdentifers.add("fo"); + languageIdentifers.add("fr"); + languageIdentifers.add("fy"); + + languageIdentifers.add("ga"); + languageIdentifers.add("gd"); + languageIdentifers.add("gl"); + languageIdentifers.add("gn"); + languageIdentifers.add("gu"); + languageIdentifers.add("gv"); + + languageIdentifers.add("ha"); + languageIdentifers.add("he"); + languageIdentifers.add("hi"); + languageIdentifers.add("hr"); + languageIdentifers.add("ht"); + languageIdentifers.add("hu"); + languageIdentifers.add("hy"); + + languageIdentifers.add("ia"); + languageIdentifers.add("id"); + languageIdentifers.add("ie"); + languageIdentifers.add("ig"); + languageIdentifers.add("ii"); + languageIdentifers.add("ik"); + languageIdentifers.add("io"); + languageIdentifers.add("is"); + languageIdentifers.add("it"); + languageIdentifers.add("iu"); + + languageIdentifers.add("ja"); + languageIdentifers.add("jv"); + + languageIdentifers.add("ka"); + languageIdentifers.add("kg"); + languageIdentifers.add("ki"); + languageIdentifers.add("kk"); + languageIdentifers.add("kl"); + languageIdentifers.add("km"); + languageIdentifers.add("kn"); + languageIdentifers.add("ko"); + languageIdentifers.add("ks"); + languageIdentifers.add("ku"); + languageIdentifers.add("kv"); + languageIdentifers.add("kw"); + languageIdentifers.add("ky"); + + languageIdentifers.add("la"); + languageIdentifers.add("lb"); + languageIdentifers.add("li"); + languageIdentifers.add("ln"); + languageIdentifers.add("lo"); + languageIdentifers.add("lt"); + languageIdentifers.add("lv"); + + languageIdentifers.add("mg"); + languageIdentifers.add("mh"); + languageIdentifers.add("mi"); + languageIdentifers.add("mk"); + 
languageIdentifers.add("ml"); + languageIdentifers.add("mn"); + languageIdentifers.add("mo"); + languageIdentifers.add("mr"); + languageIdentifers.add("ms"); + languageIdentifers.add("mt"); + languageIdentifers.add("my"); + + languageIdentifers.add("na"); + languageIdentifers.add("nb"); + languageIdentifers.add("ne"); + languageIdentifers.add("ng"); + languageIdentifers.add("nl"); + languageIdentifers.add("nn"); + languageIdentifers.add("no"); + languageIdentifers.add("nv"); + languageIdentifers.add("ny"); + + languageIdentifers.add("oc"); + languageIdentifers.add("os"); + languageIdentifers.add("pa"); + languageIdentifers.add("pl"); + languageIdentifers.add("ps"); + languageIdentifers.add("pt"); + + languageIdentifers.add("qu"); + + languageIdentifers.add("rm"); + languageIdentifers.add("rn"); + languageIdentifers.add("ro"); + languageIdentifers.add("ru"); + languageIdentifers.add("rw"); + + languageIdentifers.add("sa"); + languageIdentifers.add("sc"); + languageIdentifers.add("sd"); + languageIdentifers.add("se"); + languageIdentifers.add("sg"); + languageIdentifers.add("sh"); + languageIdentifers.add("si"); + languageIdentifers.add("sk"); + languageIdentifers.add("sl"); + languageIdentifers.add("sm"); + languageIdentifers.add("sn"); + languageIdentifers.add("so"); + languageIdentifers.add("sq"); + languageIdentifers.add("sr"); + languageIdentifers.add("ss"); + languageIdentifers.add("st"); + languageIdentifers.add("su"); + languageIdentifers.add("sv"); + languageIdentifers.add("sw"); + + languageIdentifers.add("ta"); + languageIdentifers.add("te"); + languageIdentifers.add("tg"); + languageIdentifers.add("th"); + languageIdentifers.add("ti"); + languageIdentifers.add("tk"); + languageIdentifers.add("tl"); + languageIdentifers.add("tn"); + languageIdentifers.add("to"); + languageIdentifers.add("tr"); + languageIdentifers.add("ts"); + languageIdentifers.add("tt"); + languageIdentifers.add("tw"); + languageIdentifers.add("ty"); + + languageIdentifers.add("ug"); + 
languageIdentifers.add("uk"); + languageIdentifers.add("ur"); + languageIdentifers.add("uz"); + + languageIdentifers.add("ve"); + languageIdentifers.add("vi"); + languageIdentifers.add("vo"); + + languageIdentifers.add("wa"); + languageIdentifers.add("wo"); + + languageIdentifers.add("xh"); + + languageIdentifers.add("yi"); + languageIdentifers.add("yo"); + + languageIdentifers.add("za"); + languageIdentifers.add("zh"); + languageIdentifers.add("zu"); + + languageIdentifers.add("als"); + languageIdentifers.add("ang"); + languageIdentifers.add("arc"); + languageIdentifers.add("ast"); + languageIdentifers.add("bug"); + languageIdentifers.add("ceb"); + languageIdentifers.add("chr"); + languageIdentifers.add("chy"); + languageIdentifers.add("csb"); + languageIdentifers.add("frp"); + languageIdentifers.add("fur"); + languageIdentifers.add("got"); + languageIdentifers.add("haw"); + languageIdentifers.add("ilo"); + languageIdentifers.add("jbo"); + languageIdentifers.add("ksh"); + languageIdentifers.add("lad"); + languageIdentifers.add("lmo"); + languageIdentifers.add("nah"); + languageIdentifers.add("nap"); + languageIdentifers.add("nds"); + languageIdentifers.add("nrm"); + languageIdentifers.add("pam"); + languageIdentifers.add("pap"); + languageIdentifers.add("pdc"); + languageIdentifers.add("pih"); + languageIdentifers.add("pms"); + languageIdentifers.add("rmy"); + languageIdentifers.add("scn"); + languageIdentifers.add("sco"); + languageIdentifers.add("tet"); + languageIdentifers.add("tpi"); + languageIdentifers.add("tum"); + languageIdentifers.add("udm"); + languageIdentifers.add("vec"); + languageIdentifers.add("vls"); + languageIdentifers.add("war"); + languageIdentifers.add("xal"); + + languageIdentifers.add("simple"); + } + + private void initGermanVariables() { + templateParserClass = FlushTemplates.class; + //deleteTemplates.add( "Prettytable" ); + //parseTemplates.add( "Dieser Artikel" ); + //parseTemplates.add( "Audio" ); + //parseTemplates.add( "Video" ); + 
imageIdentifers.add("Bild"); + imageIdentifers.add("Image"); + imageIdentifers.add("Datei"); + categoryIdentifers.add("Kategorie"); + languageIdentifers.remove("de"); + } + + private void initEnglishVariables() { + templateParserClass = FlushTemplates.class; + + imageIdentifers.add("Image"); + imageIdentifers.add("File"); + imageIdentifers.add("media"); + categoryIdentifers.add("Category"); + languageIdentifers.remove("en"); + } + + private String resolveLineSeparator() { + if (lineSeparator.equals("CRLF")) { + return "\r\n"; + } + if (lineSeparator.equals("LF")) { + return "\n"; + } + + logger.error( + "LineSeparator is UNKNOWN: \"" + lineSeparator + "\"\n" + + "Set LineSeparator to \"LF\" or \"CRLF\" for a Error free configuration"); + + return lineSeparator; + } + + /** + * Creates a MediaWikiParser with the configurations which has been set. + */ + public MediaWikiParser createParser() { + logger.debug("Selected Parser: {}", parserClass); + + if (parserClass == ModularParser.class) { + ModularParser mwgp = new ModularParser( +// resolveLineSeparator(), + "\n", + languageIdentifers, + categoryIdentifers, + imageIdentifers, + showImageText, + deleteTags, + showMathTagContent, + calculateSrcSpans, + null); + + StringBuilder sb = new StringBuilder(); + sb.append(lineSeparator + "languageIdentifers: "); + for (String s : languageIdentifers) { + sb.append(s + " "); + } + sb.append(lineSeparator + "categoryIdentifers: "); + for (String s : categoryIdentifers) { + sb.append(s + " "); + } + sb.append(lineSeparator + "imageIdentifers: "); + for (String s : imageIdentifers) { + sb.append(s + " "); + } + logger.debug(sb.toString()); + + MediaWikiTemplateParser mwtp; + + logger.debug("Selected TemplateParser: {}", templateParserClass); + if (templateParserClass == GermanTemplateParser.class) { + for (String s : deleteTemplates) { + logger.debug("DeleteTemplate: '{}'", s); } - else if(language.equals(Language.english)){ - initEnglishVariables(); - }else - { - logger.warn("No 
language specific parser for '{}' available. Using default values.", language); + for (String s : parseTemplates) { + logger.debug("ParseTemplate: '{}'", s); } - } - - private void initVariables(){ - lineSeparator = "LF"; - parserClass = ModularParser.class; - imageIdentifers = new ArrayList<>(); - categoryIdentifers = new ArrayList<>(); - languageIdentifers = new ArrayList<>(); - deleteTemplates = new ArrayList<>(); - parseTemplates = new ArrayList<>(); - showImageText = false; - deleteTags = true; - showMathTagContent = true; - calculateSrcSpans = false; - templateParserClass = ShowTemplateNamesAndParameters.class; - - initLanguages(); - } - - private void initLanguages(){ - //Init the Languages... - languageIdentifers.add("aa");languageIdentifers.add("ab");languageIdentifers.add("af"); - languageIdentifers.add("am");languageIdentifers.add("an");languageIdentifers.add("ar"); - languageIdentifers.add("as");languageIdentifers.add("av");languageIdentifers.add("ay"); - languageIdentifers.add("az"); - - languageIdentifers.add("ba");languageIdentifers.add("be");languageIdentifers.add("bg"); - languageIdentifers.add("bh");languageIdentifers.add("bi");languageIdentifers.add("bm"); - languageIdentifers.add("bn");languageIdentifers.add("bo");languageIdentifers.add("br"); - languageIdentifers.add("bs"); - - languageIdentifers.add("ca");languageIdentifers.add("ce");languageIdentifers.add("ch"); - languageIdentifers.add("co");languageIdentifers.add("cr");languageIdentifers.add("cs"); - languageIdentifers.add("cv");languageIdentifers.add("cy"); - - languageIdentifers.add("da");languageIdentifers.add("de");languageIdentifers.add("dk"); - languageIdentifers.add("dv");languageIdentifers.add("dz"); - - languageIdentifers.add("ee");languageIdentifers.add("el");languageIdentifers.add("en"); - languageIdentifers.add("eo");languageIdentifers.add("es");languageIdentifers.add("et"); - languageIdentifers.add("eu"); - - 
languageIdentifers.add("fa");languageIdentifers.add("ff");languageIdentifers.add("fi"); - languageIdentifers.add("fj");languageIdentifers.add("fo");languageIdentifers.add("fr"); - languageIdentifers.add("fy"); - - languageIdentifers.add("ga");languageIdentifers.add("gd");languageIdentifers.add("gl"); - languageIdentifers.add("gn");languageIdentifers.add("gu");languageIdentifers.add("gv"); - - languageIdentifers.add("ha");languageIdentifers.add("he");languageIdentifers.add("hi"); - languageIdentifers.add("hr");languageIdentifers.add("ht");languageIdentifers.add("hu"); - languageIdentifers.add("hy"); - - languageIdentifers.add("ia");languageIdentifers.add("id");languageIdentifers.add("ie"); - languageIdentifers.add("ig");languageIdentifers.add("ii");languageIdentifers.add("ik"); - languageIdentifers.add("io");languageIdentifers.add("is");languageIdentifers.add("it"); - languageIdentifers.add("iu"); - - languageIdentifers.add("ja");languageIdentifers.add("jv"); - - languageIdentifers.add("ka");languageIdentifers.add("kg");languageIdentifers.add("ki"); - languageIdentifers.add("kk");languageIdentifers.add("kl");languageIdentifers.add("km"); - languageIdentifers.add("kn");languageIdentifers.add("ko");languageIdentifers.add("ks"); - languageIdentifers.add("ku");languageIdentifers.add("kv");languageIdentifers.add("kw"); - languageIdentifers.add("ky"); - - languageIdentifers.add("la");languageIdentifers.add("lb");languageIdentifers.add("li"); - languageIdentifers.add("ln");languageIdentifers.add("lo");languageIdentifers.add("lt"); - languageIdentifers.add("lv"); - - languageIdentifers.add("mg");languageIdentifers.add("mh");languageIdentifers.add("mi"); - languageIdentifers.add("mk");languageIdentifers.add("ml");languageIdentifers.add("mn"); - languageIdentifers.add("mo");languageIdentifers.add("mr");languageIdentifers.add("ms"); - languageIdentifers.add("mt");languageIdentifers.add("my"); - - 
languageIdentifers.add("na");languageIdentifers.add("nb");languageIdentifers.add("ne"); - languageIdentifers.add("ng");languageIdentifers.add("nl");languageIdentifers.add("nn"); - languageIdentifers.add("no");languageIdentifers.add("nv");languageIdentifers.add("ny"); - - languageIdentifers.add("oc");languageIdentifers.add("os");languageIdentifers.add("pa"); - languageIdentifers.add("pl");languageIdentifers.add("ps");languageIdentifers.add("pt"); - - languageIdentifers.add("qu"); - - languageIdentifers.add("rm");languageIdentifers.add("rn");languageIdentifers.add("ro"); - languageIdentifers.add("ru");languageIdentifers.add("rw"); - - languageIdentifers.add("sa");languageIdentifers.add("sc");languageIdentifers.add("sd"); - languageIdentifers.add("se");languageIdentifers.add("sg");languageIdentifers.add("sh"); - languageIdentifers.add("si");languageIdentifers.add("sk");languageIdentifers.add("sl"); - languageIdentifers.add("sm");languageIdentifers.add("sn");languageIdentifers.add("so"); - languageIdentifers.add("sq");languageIdentifers.add("sr");languageIdentifers.add("ss"); - languageIdentifers.add("st");languageIdentifers.add("su");languageIdentifers.add("sv"); - languageIdentifers.add("sw"); - - languageIdentifers.add("ta");languageIdentifers.add("te");languageIdentifers.add("tg"); - languageIdentifers.add("th");languageIdentifers.add("ti");languageIdentifers.add("tk"); - languageIdentifers.add("tl");languageIdentifers.add("tn");languageIdentifers.add("to"); - languageIdentifers.add("tr");languageIdentifers.add("ts");languageIdentifers.add("tt"); - languageIdentifers.add("tw");languageIdentifers.add("ty"); - - languageIdentifers.add("ug");languageIdentifers.add("uk");languageIdentifers.add("ur"); - languageIdentifers.add("uz"); - - languageIdentifers.add("ve");languageIdentifers.add("vi");languageIdentifers.add("vo"); - - languageIdentifers.add("wa");languageIdentifers.add("wo"); - - languageIdentifers.add("xh"); - - 
languageIdentifers.add("yi");languageIdentifers.add("yo"); - - languageIdentifers.add("za");languageIdentifers.add("zh");languageIdentifers.add("zu"); - - languageIdentifers.add("als");languageIdentifers.add("ang");languageIdentifers.add("arc");languageIdentifers.add("ast"); - languageIdentifers.add("bug"); - languageIdentifers.add("ceb");languageIdentifers.add("chr");languageIdentifers.add("chy");languageIdentifers.add("csb"); - languageIdentifers.add("frp"); - languageIdentifers.add("fur"); - languageIdentifers.add("got"); - languageIdentifers.add("haw"); - languageIdentifers.add("ilo"); - languageIdentifers.add("jbo"); - languageIdentifers.add("ksh"); - languageIdentifers.add("lad");languageIdentifers.add("lmo"); - languageIdentifers.add("nah");languageIdentifers.add("nap");languageIdentifers.add("nds");languageIdentifers.add("nrm"); - languageIdentifers.add("pam");languageIdentifers.add("pap");languageIdentifers.add("pdc");languageIdentifers.add("pih");languageIdentifers.add("pms"); - languageIdentifers.add("rmy"); - languageIdentifers.add("scn");languageIdentifers.add("sco"); - languageIdentifers.add("tet");languageIdentifers.add("tpi");languageIdentifers.add("tum"); - languageIdentifers.add("udm"); - languageIdentifers.add("vec");languageIdentifers.add("vls"); - languageIdentifers.add("war"); - languageIdentifers.add("xal"); - - languageIdentifers.add("simple"); - } - - private void initGermanVariables(){ - templateParserClass = FlushTemplates.class; - //deleteTemplates.add( "Prettytable" ); - //parseTemplates.add( "Dieser Artikel" ); - //parseTemplates.add( "Audio" ); - //parseTemplates.add( "Video" ); - imageIdentifers.add("Bild"); - imageIdentifers.add("Image"); - imageIdentifers.add("Datei"); - categoryIdentifers.add( "Kategorie" ); - languageIdentifers.remove("de"); - } - - private void initEnglishVariables(){ - templateParserClass = FlushTemplates.class; - - imageIdentifers.add("Image"); - imageIdentifers.add("File"); - imageIdentifers.add("media"); - 
categoryIdentifers.add( "Category" ); - languageIdentifers.remove("en"); - } - - private String resolveLineSeparator(){ - if( lineSeparator.equals("CRLF")) { - return "\r\n"; - } - if( lineSeparator.equals("LF")) { - return "\n"; - } - - logger.error( - "LineSeparator is UNKNOWN: \""+lineSeparator+"\"\n" + - "Set LineSeparator to \"LF\" or \"CRLF\" for a Error free configuration" ); - - return lineSeparator; - } - - /** - * Creates a MediaWikiParser with the configurations which has been set. - */ - public MediaWikiParser createParser(){ - logger.debug( "Selected Parser: {}", parserClass ); - - if( parserClass == ModularParser.class ){ - ModularParser mwgp = new ModularParser( -// resolveLineSeparator(), - "\n", - languageIdentifers, - categoryIdentifers, - imageIdentifers, - showImageText, - deleteTags, - showMathTagContent, - calculateSrcSpans, - null ); - - StringBuilder sb = new StringBuilder(); - sb.append( lineSeparator + "languageIdentifers: "); - for( String s: languageIdentifers ) { - sb.append( s + " "); - } - sb.append( lineSeparator + "categoryIdentifers: "); - for( String s: categoryIdentifers ) { - sb.append( s + " "); - } - sb.append( lineSeparator + "imageIdentifers: "); - for( String s: imageIdentifers ) { - sb.append( s + " "); - } - logger.debug( sb.toString() ); - - MediaWikiTemplateParser mwtp; - - logger.debug( "Selected TemplateParser: {}", templateParserClass); - if( templateParserClass == GermanTemplateParser.class ){ - for( String s: deleteTemplates) { - logger.debug( "DeleteTemplate: '{}'", s); - } - for( String s: parseTemplates) { - logger.debug( "ParseTemplate: '{}'", s); - } - mwtp = new GermanTemplateParser( mwgp, deleteTemplates, parseTemplates ); - } - else if( templateParserClass == FlushTemplates.class ) { - mwtp = new FlushTemplates(); - } else if( templateParserClass == ShowTemplateNamesAndParameters.class ){ - mwtp = new ShowTemplateNamesAndParameters(); - } - else{ - logger.error("TemplateParser Class Not Found!"); - return 
null; - } - - mwgp.setTemplateParser( mwtp ); - - return mwgp; - } - else{ - logger.error("Parser Class Not Found!"); - return null; - } - } - - /** - * Adds a Template which should be deleted while the parsing process. - */ - public void addDeleteTemplate( String deleteTemplate ){ - deleteTemplates.add( deleteTemplate ); - } - - /** - * Adds a Template which should be "parsed" while the parsing process. - */ - public void addParseTemplate( String parseTemplate ){ - parseTemplates.add( parseTemplate ); - } - - /** - * Retuns the Class of the selected Parser. - */ - public Class getParserClass(){ - return parserClass; - } - - /** - * Set the Parser which should be configurated and returned by createParser(). - */ - public void setParserClass(Class parserClass){ this.parserClass = parserClass; } - - /** - * Returns the Class of the selected TemplateParser. - */ - public Class getTemplateParserClass(){ return templateParserClass; } - - /** - * Set the Parser which should be used for Template parsing. - */ - public void setTemplateParserClass(Class templateParserClass){ this.templateParserClass = templateParserClass; } - - /** - * Retuns the List of templates which should be deleted in the parseing process. - */ - public List<String> getDeleteTemplates(){ return deleteTemplates; } - - /** - * Set the List of templates which should be deleted in the parseing process. - */ - public void setDeleteTemplates(List<String> deleteTemplates){ this.deleteTemplates = deleteTemplates; } - - /** - * Returns the CharSequence/String which should be used as line separator. - */ - public String getLineSeparator(){ return lineSeparator; } - - /** - * Sets the CharSequence/String which should be used as line separator. - */ - public void setLineSeparator(String lineSeparator){ this.lineSeparator = lineSeparator; } - - /** - * Returns the List of templates which should be "parsed" in the parseing process. 
- */ - public List<String> getParseTemplates(){ return parseTemplates; } - - /** - * Sets the List of templates which should be "parsed" in the parseing process. - */ - public void setParseTemplates(List<String> parseTemplates){ this.parseTemplates = parseTemplates; } - - /** - * Returns the List of Strings which are used to specifiy that a link is a link to a - * wikipedia i another language. - */ - public List<String> getLanguageIdentifers(){ return languageIdentifers; } - - /** - * Sets the list of language identifiers. - */ - public void setLanguageIdentifers( List<String> languageIdentifers){ this.languageIdentifers = languageIdentifers; } - - /** - * Returns the List of Strings which are used to specifiy that a link is a link to a - * cathegory. E.g. in german "Kathegorie" is used. But it could be usefull to use more - * than one identifier, mainly the english identifier "cathegory" should be used too. - */ - public List<String> getCategoryIdentifers( ){ return categoryIdentifers; } - - /** - * Set the list of cathegory identifers. - */ - public void setCategoryIdentifers( List<String> categoryIdentifers){ this.categoryIdentifers = categoryIdentifers; } - - /** - * Returns the List of Strings which are used to specifiy that a link is an Image. - */ - public List<String> getImageIdentifers( ){ return imageIdentifers; } - - /** - * Sets the image identifer list. - */ - public void setImageIdentifers( List<String> imageIdentifers){ this.imageIdentifers = imageIdentifers; } - - /** - * Returns if the Parser should show the Text of an Image, or delete it. If the Text is deleted, - * it will be added as a Parameter to the Link. - * @return true, if the Text should be shown. - */ - public boolean getShowImageText(){ return showImageText; } - - /** - * Sets if the Parser should show the Text of an Image, or delete it. 
- */ - public void setShowImageText( boolean showImageText ){ this.showImageText = showImageText; } - - /** - * Returns if < * > tags should be deleted or annotaded. - * @return true if the tags should be deleted. - */ - public boolean getDeleteTags() { return deleteTags; } - - /** - * Sets if < * > tags should be deleted or annotaded. - */ - public void setDeleteTags(boolean deleteTags) { this.deleteTags = deleteTags; } - - /** - * Retruns if the Content of math tags (<math><CONTENT/math>) should be deleted or - * annotated. - * - * @return true, if the tag content should be annotated. - */ - public boolean getShowMathTagContent() { return showMathTagContent; } - - /** - * Set if the Contetn of math tags should be deleted or annotated. - */ - public void setShowMathTagContent(boolean showMathTagContent) { this.showMathTagContent = showMathTagContent; } - - /** - * Returns if the Parser should calculate the positions in the original source of the elements - * which are parsed. - * @return true, if the positions should be calulated. - */ - public boolean getCalculateSrcSpans() { return calculateSrcSpans; } - - /** - * Sets if the Parser should calculate the positions in the original source of the elements - * which are parsed. - */ - public void setCalculateSrcSpans(boolean calculateSrcSpans) { this.calculateSrcSpans = calculateSrcSpans; } + mwtp = new GermanTemplateParser(mwgp, deleteTemplates, parseTemplates); + } else if (templateParserClass == FlushTemplates.class) { + mwtp = new FlushTemplates(); + } else if (templateParserClass == ShowTemplateNamesAndParameters.class) { + mwtp = new ShowTemplateNamesAndParameters(); + } else { + logger.error("TemplateParser Class Not Found!"); + return null; + } + + mwgp.setTemplateParser(mwtp); + + return mwgp; + } else { + logger.error("Parser Class Not Found!"); + return null; + } + } + + /** + * Adds a Template which should be deleted while the parsing process. 
+ */ + public void addDeleteTemplate(String deleteTemplate) { + deleteTemplates.add(deleteTemplate); + } + + /** + * Adds a Template which should be "parsed" while the parsing process. + */ + public void addParseTemplate(String parseTemplate) { + parseTemplates.add(parseTemplate); + } + + /** + * Retuns the Class of the selected Parser. + */ + public Class getParserClass() { + return parserClass; + } + + /** + * Set the Parser which should be configurated and returned by createParser(). + */ + public void setParserClass(Class parserClass) { + this.parserClass = parserClass; + } + + /** + * Returns the Class of the selected TemplateParser. + */ + public Class getTemplateParserClass() { + return templateParserClass; + } + + /** + * Set the Parser which should be used for Template parsing. + */ + public void setTemplateParserClass(Class templateParserClass) { + this.templateParserClass = templateParserClass; + } + + /** + * Retuns the List of templates which should be deleted in the parseing process. + */ + public List<String> getDeleteTemplates() { + return deleteTemplates; + } + + /** + * Set the List of templates which should be deleted in the parseing process. + */ + public void setDeleteTemplates(List<String> deleteTemplates) { + this.deleteTemplates = deleteTemplates; + } + + /** + * Returns the CharSequence/String which should be used as line separator. + */ + public String getLineSeparator() { + return lineSeparator; + } + + /** + * Sets the CharSequence/String which should be used as line separator. + */ + public void setLineSeparator(String lineSeparator) { + this.lineSeparator = lineSeparator; + } + + /** + * Returns the List of templates which should be "parsed" in the parseing process. + */ + public List<String> getParseTemplates() { + return parseTemplates; + } + + /** + * Sets the List of templates which should be "parsed" in the parseing process. 
+ */ + public void setParseTemplates(List<String> parseTemplates) { + this.parseTemplates = parseTemplates; + } + + /** + * Returns the List of Strings which are used to specifiy that a link is a link to a + * wikipedia i another language. + */ + public List<String> getLanguageIdentifers() { + return languageIdentifers; + } + + /** + * Sets the list of language identifiers. + */ + public void setLanguageIdentifers(List<String> languageIdentifers) { + this.languageIdentifers = languageIdentifers; + } + + /** + * Returns the List of Strings which are used to specifiy that a link is a link to a + * cathegory. E.g. in german "Kathegorie" is used. But it could be usefull to use more + * than one identifier, mainly the english identifier "cathegory" should be used too. + */ + public List<String> getCategoryIdentifers() { + return categoryIdentifers; + } + + /** + * Set the list of cathegory identifers. + */ + public void setCategoryIdentifers(List<String> categoryIdentifers) { + this.categoryIdentifers = categoryIdentifers; + } + + /** + * Returns the List of Strings which are used to specifiy that a link is an Image. + */ + public List<String> getImageIdentifers() { + return imageIdentifers; + } + + /** + * Sets the image identifer list. + */ + public void setImageIdentifers(List<String> imageIdentifers) { + this.imageIdentifers = imageIdentifers; + } + + /** + * Returns if the Parser should show the Text of an Image, or delete it. If the Text is deleted, + * it will be added as a Parameter to the Link. + * + * @return true, if the Text should be shown. + */ + public boolean getShowImageText() { + return showImageText; + } + + /** + * Sets if the Parser should show the Text of an Image, or delete it. + */ + public void setShowImageText(boolean showImageText) { + this.showImageText = showImageText; + } + + /** + * Returns if < * > tags should be deleted or annotaded. + * + * @return true if the tags should be deleted. 
+ */ + public boolean getDeleteTags() { + return deleteTags; + } + + /** + * Sets if < * > tags should be deleted or annotaded. + */ + public void setDeleteTags(boolean deleteTags) { + this.deleteTags = deleteTags; + } + + /** + * Retruns if the Content of math tags (<math><CONTENT/math>) should be deleted or + * annotated. + * + * @return true, if the tag content should be annotated. + */ + public boolean getShowMathTagContent() { + return showMathTagContent; + } + + /** + * Set if the Contetn of math tags should be deleted or annotated. + */ + public void setShowMathTagContent(boolean showMathTagContent) { + this.showMathTagContent = showMathTagContent; + } + + /** + * Returns if the Parser should calculate the positions in the original source of the elements + * which are parsed. + * + * @return true, if the positions should be calulated. + */ + public boolean getCalculateSrcSpans() { + return calculateSrcSpans; + } + + /** + * Sets if the Parser should calculate the positions in the original source of the elements + * which are parsed. + */ + public void setCalculateSrcSpans(boolean calculateSrcSpans) { + this.calculateSrcSpans = calculateSrcSpans; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiTemplateParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiTemplateParser.java index 1f128af1..f14cf580 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiTemplateParser.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiTemplateParser.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,22 +23,21 @@ /** * Because template parsing is a special task, it is usesfull to use * a special parser. - * */ public interface MediaWikiTemplateParser { - - /** - * Takes a Template and do whatever is required for handling this Template. - * It is possible to delete this template, to parse it to e.g a Link or - * to return MediaWiki code which can be parsed by a MediaWiki parser.<br> - * If you are interested how this works, you shoud read the documentation - * of ResolvedTemplate. - */ + + /** + * Takes a Template and do whatever is required for handling this Template. + * It is possible to delete this template, to parse it to e.g a Link or + * to return MediaWiki code which can be parsed by a MediaWiki parser.<br> + * If you are interested how this works, you shoud read the documentation + * of ResolvedTemplate. + */ ResolvedTemplate parseTemplate(Template t, ParsedPage pp); - - /** - * Returns some information about what the TemplateParser does am how - * it is configurated. - */ + + /** + * Returns some information about what the TemplateParser does am how + * it is configurated. 
+ */ String configurationInfo(); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ModularParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ModularParser.java index c43ac056..e8bce845 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ModularParser.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ModularParser.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -49,2115 +49,1802 @@ * This is a parser for MediaWiki Source. 
* <p> * It exist a {@link MediaWikiParserFactory}, to get an instance of this Parser.<br> - * */ public class ModularParser implements MediaWikiParser, - MediaWikiContentElementParser -{ - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - // Options, set by the ParserFactory - private String lineSeparator; - private List<String> categoryIdentifers; - private List<String> languageIdentifers; - private List<String> imageIdentifers; - private MediaWikiTemplateParser templateParser; - private boolean showImageText = false; - private boolean deleteTags = true; - private boolean showMathTagContent = true; - private boolean calculateSrcSpans = true; - - /** - * Creates a un-configured {@link ModularParser}... - */ - public ModularParser() - { - } - - /** - * Creates a fully configured {@link ModularParser}... - */ - public ModularParser(String lineSeparator, List<String> languageIdentifers, - List<String> categoryIdentifers, List<String> imageIdentifers, - boolean showImageText, boolean deleteTags, - boolean showMathTagContent, boolean calculateSrcSpans, - MediaWikiTemplateParser templateParser) - { - - setLineSeparator(lineSeparator); - setLanguageIdentifers(languageIdentifers); - setCategoryIdentifers(categoryIdentifers); - setImageIdentifers(imageIdentifers); - setShowImageText(showImageText); - setDeleteTags(deleteTags); - setShowMathTagContent(showMathTagContent); - setCalculateSrcSpans(calculateSrcSpans); - setTemplateParser(templateParser); - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - @Override - public String getLineSeparator() - { - return lineSeparator; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setLineSeparator(String lineSeparator) - { - this.lineSeparator = lineSeparator; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... 
- */ - public List<String> getLanguageIdentifers() - { - return languageIdentifers; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setLanguageIdentifers(List<String> languageIdentifers) - { - this.languageIdentifers = listToLowerCase(languageIdentifers); - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public List<String> getCategoryIdentifers() - { - return categoryIdentifers; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setCategoryIdentifers(List<String> categoryIdentifers) - { - this.categoryIdentifers = listToLowerCase(categoryIdentifers); - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public List<String> getImageIdentifers() - { - return imageIdentifers; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setImageIdentifers(List<String> imageIdentifers) - { - this.imageIdentifers = listToLowerCase(imageIdentifers); - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public MediaWikiTemplateParser getTemplateParser() - { - return templateParser; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setTemplateParser(MediaWikiTemplateParser templateParser) - { - this.templateParser = templateParser; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public boolean showImageText() - { - return showImageText; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setShowImageText(boolean showImageText) - { - this.showImageText = showImageText; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public boolean deleteTags() - { - return deleteTags; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... 
- */ - public void setDeleteTags(boolean deleteTags) - { - this.deleteTags = deleteTags; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public boolean showMathTagContent() - { - return showMathTagContent; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setShowMathTagContent(boolean showMathTagContent) - { - this.showMathTagContent = showMathTagContent; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public boolean calculateSrcSpans() - { - return calculateSrcSpans; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setCalculateSrcSpans(boolean calculateSrcSpans) - { - this.calculateSrcSpans = calculateSrcSpans; - } - - /** - * Converts a List of Strings to lower case Strings. - */ - private List<String> listToLowerCase(List<String> l) - { - List<String> result = new ArrayList<>(); - for (String s : l) - { - result.add(s.toLowerCase()); - } - return result; - } - - /** - * Look at the MediaWikiParser interface for a description... 
- */ - @Override - public String configurationInfo() - { - StringBuilder result = new StringBuilder(); - - result.append("MediaWikiParser configuration:\n"); - result.append("ParserClass: " + this.getClass() + "\n"); - result.append("ShowImageText: " + showImageText + "\n"); - result.append("DeleteTags: " + deleteTags + "\n"); - result.append("ShowMathTagContent: " + showMathTagContent + "\n"); - result.append("CalculateSrcSpans: " + calculateSrcSpans + "\n"); - - result.append("LanguageIdentifers: "); - for (String s : languageIdentifers) - { - result.append(s + " "); - } - result.append("\n"); - - result.append("CategoryIdentifers: "); - for (String s : categoryIdentifers) - { - result.append(s + " "); - } - result.append("\n"); - - result.append("ImageIdentifers: "); - for (String s : imageIdentifers) - { - result.append(s + " "); - } - result.append("\n"); - - result.append("TemplateParser: " + templateParser.getClass() + "\n"); - result.append(templateParser.configurationInfo()); - - return result.toString(); - } - - /** - * Checks if the configuration is runnable. - */ - private boolean runConfig() - { - if (lineSeparator == null) - { - logger.debug("Set lineSeparator"); - return false; - } - if (categoryIdentifers == null) - { - logger.warn("Set categoryIdentifers"); - return false; - } - if (languageIdentifers == null) - { - logger.warn("Set languageIdentifers"); - return false; - } - if (imageIdentifers == null) - { - logger.warn("Set imageIdentifers"); - return false; - } - if (templateParser == null) - { - logger.warn("Set templateParser"); - return false; - } - return true; - } - - /** - * Look at the {@link MediaWikiParser} for a description... - */ - @Override - public ParsedPage parse(String src) - { - // check if the configuration is runnable. - if (!runConfig()) - { - return null; - } - - // check if the is something to parse. sometimes there is an empty string - // due to an error of other classes... 
- if (src == null || src.length() == 0) - { - return null; - } - - // creates a new span manager with the given source, appending a newline - // to avoid errors. - SpanManager sm = new SpanManager(src.replace('\t', ' ') + lineSeparator); - if (calculateSrcSpans) - { - sm.enableSrcPosCalculation(); - } - - // Creating a new ParsePage, which will be filled with information in - // the parseing process. - ParsedPage ppResult = new ParsedPage(); - - // Creating a new Parameter Container - ContentElementParsingParameters cepp = new ContentElementParsingParameters(); - - // Deletes comments out of the Source - deleteComments(sm); - - // Deletes any TOC Tags, these are not usesd in this parser. - deleteTOCTag(sm); - - // Removing the Content which should not parsed but integrated later in - // the resulting text - sm.manageList(cepp.noWikiSpans); - parseSpecifiedTag(sm, cepp.noWikiSpans, cepp.noWikiStrings, "PRE", " "); - parseSpecifiedTag(sm, cepp.noWikiSpans, cepp.noWikiStrings, "NOWIKI"); - if (cepp.noWikiSpans.size() == 0) - { - sm.removeManagedList(cepp.noWikiSpans); - } - - // Parseing the Math Tags... - sm.manageList(cepp.mathSpans); - parseSpecifiedTag(sm, cepp.mathSpans, cepp.mathStrings, "MATH"); - if (cepp.mathSpans.size() == 0) - { - sm.removeManagedList(cepp.mathSpans); - } - - // Parseing the Templates (the Span List will be added to the managed - // lists by the function) - parseTemplates(sm, cepp.templateSpans, cepp.templates, ppResult); - - // Parsing all other Tags - parseTags(sm, cepp.tagSpans); - - // Converting <gallery>s to normal Images, this is not beautiful, but - // a simple solution.. - convertGalleriesToImages(sm, cepp.tagSpans); - - // Parsing Links and Images. 
- parseImagesAndInternalLinks(sm, cepp.linkSpans, cepp.links); - - // Creating a list of Line Spans to work with lines in the following - // functions - LinkedList<Span> lineSpans = new LinkedList<>(); - getLineSpans(sm, lineSpans); - - // Removing the Category Links from the Links list, and crating an - // ContentElement for these links... - ppResult.setCategoryElement(getSpecialLinks(sm, cepp.linkSpans, - cepp.links, " - ", categoryIdentifers)); - - // Removing the Language Links from the Links list, and crating an - // ContentElement for these links... - ppResult.setLanguagesElement(getSpecialLinks(sm, cepp.linkSpans, - cepp.links, " - ", languageIdentifers)); - - // Parsing and Setting the Sections... the main work is done in parse - // sections! - ppResult.setSections(EmptyStructureRemover - .eliminateEmptyStructures(parseSections(sm, cepp, lineSpans))); - - // Finding and Setting the paragraph which is concidered as the "First" - setFirstParagraph(ppResult); - - // check the calculated source positions, and reset them if necessary. - if (calculateSrcSpans) - { - SrcPosRangeChecker.checkRange(ppResult); - } - - // So it is done... - return ppResult; - } - - - - /** - * Deleting all comments out of the SpanManager...<br> - * <!-- COMMENT --> - */ - private void deleteComments(SpanManager sm) - { - int start = 0; - while ((start = sm.indexOf("<!--", start)) != -1) - { - int end = sm.indexOf("-->", start + 4) + 3; - if (end == -1 + 3) - { - end = sm.length(); - } - - // Remove the one lineSeparator too, if the whole line is a comment! 
- try - { - if (lineSeparator.equals(sm.substring(start - - lineSeparator.length(), start)) - && lineSeparator.equals(sm.substring(end, end - + lineSeparator.length()))) - { - end += lineSeparator.length(); - } - } - catch (IndexOutOfBoundsException e) - {} - - sm.delete(start, end); - } - } - - /** - * Deleteing ALL TOC Tags - */ - private void deleteTOCTag(SpanManager sm) - { - // delete all __TOC__ from SRC - int temp = 0; - while ((temp = sm.indexOf("__TOC__", temp)) != -1) - { - sm.delete(temp, temp + 2 + 3 + 2); - } - - // delete all __NOTOC__ from SRC - temp = 0; - while ((temp = sm.indexOf("__NOTOC__", temp)) != -1) - { - sm.delete(temp, temp + 2 + 5 + 2); - } - } - - private ContentElement getSpecialLinks(SpanManager sm, - List<Span> linkSpans, List<Link> links, String linkSpacer, - List<String> identifers) - { - ContentElement result = new ContentElement(); - StringBuilder text = new StringBuilder(); - List<Link> localLinks = new ArrayList<>(); - - for (int i = links.size() - 1; i >= 0; i--) - { - String identifer = getLinkNameSpace(links.get(i).getTarget()); - - if (identifer != null && identifers.indexOf(identifer) != -1) - { - Link l = links.remove(i); - Span s = linkSpans.remove(i); - String linkText = sm.substring(s); - sm.delete(s); - l.setHomeElement(result); - s.adjust(-s.getStart() + text.length()); - text.append(linkText + linkSpacer); - localLinks.add(l); - //TODO add type? 
- } - } - - int len = text.length(); - if (len != 0) - { - text.delete(len - linkSpacer.length(), len); - } - - result.setText(text.toString()); - result.setLinks(localLinks); - - if (result.empty()) - { - return null; - } - else - { - return result; - } - } - - private void getLineSpans(SpanManager sm, LinkedList<Span> lineSpans) - { - sm.manageList(lineSpans); - - int start = 0; - int end; - - while ((end = sm.indexOf(lineSeparator, start)) != -1) - { - lineSpans.add(new Span(start, end).trimTrail(sm)); - start = end + lineSeparator.length(); - } - lineSpans.add(new Span(start, sm.length()).trimTrail(sm)); - - while (!lineSpans.isEmpty() && lineSpans.getFirst().length() == 0) - { - lineSpans.removeFirst(); - } - while (!lineSpans.isEmpty() && lineSpans.getLast().length() == 0) - { - lineSpans.removeLast(); - } - } - - private SectionContainer parseSections(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans) - { - - List<SectionContent> contentSections = new ArrayList<>(); - - SectionContent sc = new SectionContent(1); - - if (calculateSrcSpans) - { - sc.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst() - .getStart()), -1)); - } - - // Identify the Line Type and call the necessary Function for the - // further handling... 
- while (!lineSpans.isEmpty()) - { - - Span s = lineSpans.getFirst(); - - lineType t = getLineType(sm, s); - switch (t) - { - case SECTION: - contentSections.add(sc); - int level = getSectionLevel(sm, s); - sc = new SectionContent(parseContentElement(sm, cepp, new Span( - s.getStart() + level, s.getEnd() - level).trim(sm)), - level); - lineSpans.removeFirst(); - - if (calculateSrcSpans) - { - sc.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), -1)); - } - - break; - - case HR: - // remove the HR (----) and handle the rest as a parapraph line - removeHr(sm, s); - t = lineType.PARAGRAPH; - case PARAGRAPH: - case PARAGRAPH_BOXED: - case PARAGRAPH_INDENTED: - sc.addParagraph(buildParagraph(sm, cepp, lineSpans, t)); - break; - - case NESTEDLIST: - case NESTEDLIST_NR: - sc.addNestedList(buildNestedList(sm, cepp, lineSpans, t)); - break; - - case DEFINITIONLIST: - sc.addDefinitionList(buildDefinitionList(sm, cepp, lineSpans)); - break; - - case TABLE: - sc.addTable(buildTable(sm, cepp, lineSpans)); - break; - - case EMPTYLINE: - lineSpans.removeFirst(); - break; - - default: - logger.error("unknown lineStart!: \"" + sm.substring(s) + "\""); - lineSpans.removeFirst(); - } - } - - // add the remaining Section to the list. - contentSections.add(sc); - - return buildSectionStructure(contentSections); - } - - private Span removeHr(SpanManager sm, Span s) - { - int start = s.getStart(); - final int end = s.getEnd(); - while (sm.charAt(start) == '-' && start < end) - { - start++; - } - return s.setStart(start).trim(sm); - } - - /** - * The Line Types wich are possible... - */ - private enum lineType - { - SECTION, TABLE, NESTEDLIST, NESTEDLIST_NR, DEFINITIONLIST, HR, PARAGRAPH, PARAGRAPH_INDENTED, PARAGRAPH_BOXED, EMPTYLINE - } - - /** - * Retunrns the Type of a line, this is mainly done by the First Char of the - * Line... 
- */ - private lineType getLineType(SpanManager sm, Span lineSpan) - { - - switch (lineSpan.charAt(0, sm)) - { - - case '{': - if (lineSpan.charAt(1, sm) == '|') - { - return lineType.TABLE; - } - else - { - return lineType.PARAGRAPH; - } - - case '=': - if (lineSpan.length() > 2 - && sm.charAt(lineSpan.getEnd() - 1) == '=') - { - return lineType.SECTION; - } - else - { - return lineType.PARAGRAPH; - } - - case '-': - if (lineSpan.charAt(1, sm) == '-' && lineSpan.charAt(2, sm) == '-' - && lineSpan.charAt(3, sm) == '-') - { - return lineType.HR; - } - else - { - return lineType.PARAGRAPH; - } - - case '*': - return lineType.NESTEDLIST; - - case '#': - return lineType.NESTEDLIST_NR; - - case ';': - return lineType.DEFINITIONLIST; - - case ':': - if (lineSpan.length() > 1) - { - if (lineSpan.length() > 2 && lineSpan.charAt(1, sm) == '{' - && lineSpan.charAt(2, sm) == '|') - { - return lineType.TABLE; - } - else - { - return lineType.PARAGRAPH_INDENTED; - } - } - else - { - return lineType.PARAGRAPH; - } - - case ' ': - int nonWSPos = lineSpan.nonWSCharPos(sm); - switch (lineSpan.charAt(nonWSPos, sm)) - { - case Span.ERRORCHAR: - return lineType.EMPTYLINE; - case '{': - if (lineSpan.charAt(nonWSPos + 1, sm) == '|') - { - return lineType.TABLE; - } - default: - return lineType.PARAGRAPH_BOXED; - } - - case Span.ERRORCHAR: - return lineType.EMPTYLINE; - - default: - return lineType.PARAGRAPH; - } - } - - /** - * Returns the number of Equality Chars which are used to specify the level - * of the Section. - */ - private int getSectionLevel(SpanManager sm, Span sectionNameSpan) - { - int begin = sectionNameSpan.getStart(); - int end = sectionNameSpan.getEnd(); - int level = 0; - - try - { - while ((sm.charAt(begin + level) == '=') - && (sm.charAt(end - 1 - level) == '=')) - { - level++; - } - } - catch (StringIndexOutOfBoundsException e) - { - // there is no need to do anything! 
- logger.debug("EXCEPTION IS OK: {}", e.getLocalizedMessage()); - } - - if (begin + level == end) - { - level = (level - 1) / 2; - } - - return level; - } - - /** - * Takes a list of SectionContent and returns a SectionContainer with the - * given SectionContent s in the right structure. - */ - private SectionContainer buildSectionStructure(List<SectionContent> scl) - { - SectionContainer result = new SectionContainer(0); - - for (SectionContent sContent : scl) - { - int contentLevel = sContent.getLevel(); - SectionContainer sContainer = result; - - // get the right SectionContainer or create it - for (int containerLevel = result.getLevel() + 1; containerLevel < contentLevel; containerLevel++) - { - int containerSubSections = sContainer.nrOfSubSections(); - if (containerSubSections != 0) - { - Section temp = sContainer - .getSubSection(containerSubSections - 1); - if (temp.getClass() == SectionContainer.class) - { - sContainer = (SectionContainer) temp; - } - else - { - SectionContainer sct = new SectionContainer(temp - .getTitleElement(), containerLevel); - sct.addSection(temp); - if (calculateSrcSpans) - { - sct.setSrcSpan(temp.getSrcSpan()); - } - temp.setTitleElement(null); - temp.setLevel(containerLevel + 1); - sContainer.removeSection(temp); - sContainer.addSection(sct); - sContainer = sct; - } - } - else - { - sContainer = new SectionContainer(null, containerLevel); - } - } - - sContainer.addSection(sContent); - } - - if (calculateSrcSpans) - { - result.setSrcSpan(new SrcSpan(0, -1)); - } - - return result; - } - - private boolean startsWithIgnoreCase(String s1, String s2) - { - final int s2len = s2.length(); - if (s1.length() < s2len) - { - return false; - } - return s1.substring(0, s2len).equalsIgnoreCase(s2); - } - - private Span getTag(SpanManager sm, int offset) - { - int start = sm.indexOf("<", offset); - if (start == -1) - { - return null; - } - int end = sm.indexOf(">", start); - if (end == -1) - { - return null; - } - - Span s = new Span(start, end 
+ 1); - if (calculateSrcSpans) - { - s - .setSrcSpan(new SrcSpan(sm.getSrcPos(start), sm - .getSrcPos(end) + 1)); - } - return s; - } - - private String getTagText(SpanManager sm, Span tag) - { - return sm.substring(new Span(tag.getStart() + 1, tag.getEnd() - 1) - .trim(sm)); - } - - private void parseSpecifiedTag(SpanManager sm, List<Span> spans, - List<String> strings, String specifier) - { - parseSpecifiedTag(sm, spans, strings, specifier, ""); - } - - private void parseSpecifiedTag(SpanManager sm, List<Span> spans, - List<String> strings, String specifier, String prefix) - { - int offset = 0; - - Span s; - while ((s = getTag(sm, offset)) != null) - { - offset = s.getEnd(); - String tagText = getTagText(sm, s); - if (startsWithIgnoreCase(tagText, specifier)) - { - - Span e; - while ((e = getTag(sm, offset)) != null) - { - offset = e.getEnd(); - tagText = getTagText(sm, e); - if (startsWithIgnoreCase(tagText, "/" + specifier)) - { - break; - } - } - - if (e == null) - { - /* - * OF: Setting e to sm.length()results in ArrayIndexOutOfBoundsExeption if calculateSrcSpans=true - */ - //e = new Span(sm.length(), sm.length()); - e = new Span(Math.max(0,sm.length()-1), Math.max(0,sm.length()-1)); - } - - strings.add(sm.substring(s.getEnd(), e.getStart())); - - Span tSpan = new Span(s.getStart(), e.getEnd()); - if (calculateSrcSpans) - { - tSpan.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), sm - .getSrcPos(e.getEnd()))); - } - - spans.add(tSpan); - sm.replace(tSpan, prefix + "(" + specifier + ")"); - tSpan.adjustStart(prefix.length()); - - offset = tSpan.getEnd(); - } - } - } - - private void parseTags(SpanManager sm, List<Span> spans) - { - sm.manageList(spans); - - Span s = new Span(0, 0); - while ((s = getTag(sm, s.getEnd())) != null) - { - spans.add(s); - } - - if (spans.size() == 0) - { - sm.removeManagedList(spans); - } - } - - private void parseTemplates(SpanManager sm, - List<Span> resolvedTemplateSpans, - List<ResolvedTemplate> resolvedTemplates, ParsedPage 
pp) - { - - sm.manageList(resolvedTemplateSpans); - - int pos = -2; - Stack<Integer> templateOpenTags = new Stack<>(); - while ((pos = sm.indexOf("{{", pos + 2)) != -1) - { - if (sm.length() > pos + 3 && sm.charAt(pos + 2) == '{' - && sm.charAt(pos + 3) != '{') - { - pos++; - } - templateOpenTags.push(pos); - } - - while (!templateOpenTags.empty()) - { - int templateOpenTag = templateOpenTags.pop(); - int templateCloseTag = sm.indexOf("}}", templateOpenTag); - if (templateCloseTag == -1) - { - continue; - } - - int templateOptionTag = sm.indexOf("|", templateOpenTag, - templateCloseTag); - int templateNameEnd; - List<String> templateOptions; - - if (templateOptionTag != -1) - { - templateNameEnd = templateOptionTag; - templateOptions = tokenize(sm, templateOptionTag + 1, - templateCloseTag, "|"); - } - else - { - templateNameEnd = templateCloseTag; - templateOptions = new ArrayList<>(); - } - - Span ts = new Span(templateOpenTag, templateCloseTag + 2); - - Template t = new Template(ts, encodeWikistyle(sm.substring( - templateOpenTag + 2, templateNameEnd).trim()), - templateOptions); - - if (calculateSrcSpans) - { - t.setSrcSpan(new SrcSpan(sm.getSrcPos(templateOpenTag), sm - .getSrcPos(templateCloseTag + 2))); - } - - t.setPos(ts); - - ResolvedTemplate rt = templateParser.parseTemplate(t, pp); - - resolvedTemplateSpans.add(ts); - resolvedTemplates.add(rt); - - sm.replace(ts, rt.getPreParseReplacement()); - } - - if (resolvedTemplateSpans.isEmpty()) - { - sm.removeManagedList(resolvedTemplateSpans); - } - } - - private void convertGalleriesToImages(SpanManager sm, List<Span> tagSpans) - { - // Quick Hack, not very efficent, should be improved, wont work with - // calculateSrcSpans == true ! 
- - for (int i = 0; i < tagSpans.size() - 1; i++) - { - String openText = getTagText(sm, tagSpans.get(i)); - if (startsWithIgnoreCase(openText, "GALLERY")) - { - - if (startsWithIgnoreCase(getTagText(sm, tagSpans.get(i + 1)), - "/GALLERY")) - { - - // gallery range is tag(i).end() .. tag(i+1).start() - Span startSpan = tagSpans.remove(i); - Span endSpan = tagSpans.remove(i); - i--; - - StringBuilder sb = new StringBuilder(); - - // caption (any option will be treated as caption) - int eqPos = openText.indexOf('='); - if (eqPos != -1) - { - int captionStart = eqPos + 1; - int captionEnd = openText.length(); - - if (captionStart < captionEnd - && openText.charAt(captionStart) == '"' - && openText.charAt(captionEnd - 1) == '"') - { - captionStart++; - captionEnd--; - } - - if (captionStart < captionEnd) - { - sb.append(openText.substring(captionStart, - captionEnd) - + lineSeparator); - } - } - - // images - for (String s : tokenize(sm, startSpan.getEnd(), endSpan - .getStart(), lineSeparator)) - { - sb.append("[[" + s + "]]" + lineSeparator); - } - - // replace the source and remove the tags - sm.replace(startSpan.getStart(), endSpan.getEnd(), sb - .toString()); - } - else - { - continue; - } - } - } - } - - private Table buildTable(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans) - { - - Table result = new Table(); - int col = -1; - int row = 0; - int subTables = 0; - LinkedList<Span> tableDataSpans = new LinkedList<>(); - sm.manageList(tableDataSpans); - - if (calculateSrcSpans) - { - result.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst() - .getStart()), -1)); - } - - lineSpans.removeFirst(); - - while (!lineSpans.isEmpty()) - { - Span s = lineSpans.removeFirst(); - - int pos = s.nonWSCharPos(sm); - char c0 = s.charAt(pos, sm); - char c1 = s.charAt(pos + 1, sm); - - if (subTables == 0 && (c0 == '!' 
|| c0 == '|')) - { - if (!tableDataSpans.isEmpty()) - { - lineSpans.addFirst(s); - - SrcSpan ei = null; - if (calculateSrcSpans) - { - ei = new SrcSpan(sm.getSrcPos(tableDataSpans.getFirst() - .getStart() - 1) + 1, -1); - } - - TableElement te = new TableElement(parseSections(sm, cepp, - tableDataSpans), row, col); - te.setSrcSpan(ei); - result.addTableElement(te); - lineSpans.removeFirst(); - } - - col++; - if (c1 == '-') - { - row++; - col = -1; - continue; - } - else if (c0 == '|' && c1 == '}') - { - sm.removeManagedList(tableDataSpans); - - if (calculateSrcSpans) - { - result.getSrcSpan().setEnd(sm.getSrcPos(s.getEnd())); - } - - return result; - } - else if (c0 == '|' && c1 == '+') - { - result.setTitleElement(parseContentElement(sm, cepp, - new Span(s.getStart() + pos + 2, s.getEnd()) - .trim(sm))); - continue; - } - else - { - int multipleCols; - if ((multipleCols = sm.indexOf("||", - s.getStart() + pos + 1, s.getEnd())) != -1) - { - lineSpans.addFirst(new Span(multipleCols + 1, s - .getEnd())); - s.setEnd(multipleCols); - } - - int optionTagPos = sm.indexOf("|", s.getStart() + pos + 1, - s.getEnd()); - - if (optionTagPos != -1) - { - s.setStart(optionTagPos + 1).trim(sm); - } - else - { - s.adjustStart(pos + 1).trim(sm); - } - } - } - else if (c0 == '|' && c1 == '}') - { - subTables--; - } - else if (c0 == '{' && c1 == '|') - { - subTables++; - } - - tableDataSpans.addLast(s); - } - - if (tableDataSpans.size() != 0) - { - - SrcSpan ei = null; - if (calculateSrcSpans) - { - ei = new SrcSpan(sm.getSrcPos(tableDataSpans.getFirst() - .getStart() - 1) + 1, -1); - } - - TableElement te = new TableElement(parseSections(sm, cepp, - tableDataSpans), row, col); - te.setSrcSpan(ei); - - result.addTableElement(te); - } - - sm.removeManagedList(tableDataSpans); - - if (calculateSrcSpans) - { - result.getSrcSpan().setEnd(-1); - } - - return result; - } - - private NestedListContainer buildNestedList(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> 
lineSpans, - lineType listType) - { - - boolean numbered = listType == lineType.NESTEDLIST_NR; - NestedListContainer result = new NestedListContainer(numbered); - - if (calculateSrcSpans) - { - result.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst() - .getStart()), -1)); - } - - LinkedList<Span> nestedListSpans = new LinkedList<>(); - while (!lineSpans.isEmpty()) - { - Span s = lineSpans.getFirst(); - if (listType != getLineType(sm, s)) - { - break; - } - nestedListSpans - .add(new Span(s.getStart() + 1, s.getEnd()).trim(sm)); - lineSpans.removeFirst(); - } - sm.manageList(nestedListSpans); - - if (calculateSrcSpans) - { - result.getSrcSpan().setEnd( - sm.getSrcPos(nestedListSpans.getLast().getEnd())); - } - - while (!nestedListSpans.isEmpty()) - { - Span s = nestedListSpans.getFirst(); - lineType t = getLineType(sm, s); - if (t == lineType.NESTEDLIST || t == lineType.NESTEDLIST_NR) - { - result.add(buildNestedList(sm, cepp, nestedListSpans, t)); - } - else - { - nestedListSpans.removeFirst(); - result.add((NestedListElement) parseContentElement(sm, cepp, s, - new NestedListElement())); - } - } - - sm.removeManagedList(nestedListSpans); - - return result; - } - - private DefinitionList buildDefinitionList(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans) - { - List<ContentElement> content = new ArrayList<>(); - - Span s = lineSpans.removeFirst(); - - int temp = sm.indexOf(":", s); - if (temp == -1) - { - content.add(parseContentElement(sm, cepp, new Span( - s.getStart() + 1, s.getEnd()))); - } - else - { - content.add(parseContentElement(sm, cepp, new Span(temp + 1, s - .getEnd()))); - content.add(0, parseContentElement(sm, cepp, new Span( - s.getStart() + 1, temp))); - } - - while (!lineSpans.isEmpty()) - { - Span ns = lineSpans.getFirst(); - if (sm.charAt(ns.getStart()) != ':') - { - break; - } - lineSpans.removeFirst(); - content.add(parseContentElement(sm, cepp, new Span( - ns.getStart() + 1, ns.getEnd()))); - } - - 
DefinitionList result = new DefinitionList(content); - - if (calculateSrcSpans) - { - result.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), content - .get(content.size() - 1).getSrcSpan().getEnd())); - } - - return result; - } - - private Paragraph buildParagraph(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans, - lineType paragraphType) - { - - LinkedList<Span> paragraphSpans = new LinkedList<>(); - Paragraph result = new Paragraph(); - Span s = lineSpans.removeFirst(); - paragraphSpans.add(s); - - switch (paragraphType) - { - case PARAGRAPH: - result.setType(Paragraph.type.NORMAL); - while (!lineSpans.isEmpty()) - { - if (paragraphType != getLineType(sm, lineSpans.getFirst())) - { - break; - } - paragraphSpans.add(lineSpans.removeFirst()); - } - break; - - case PARAGRAPH_BOXED: - result.setType(Paragraph.type.BOXED); - while (!lineSpans.isEmpty()) - { - lineType lt = getLineType(sm, lineSpans.getFirst()); - if (paragraphType != lt && lineType.EMPTYLINE != lt) - { - break; - } - paragraphSpans.add(lineSpans.removeFirst()); - } - break; - - case PARAGRAPH_INDENTED: - result.setType(Paragraph.type.INDENTED); - s.trim(sm.setCharAt(s.getStart(), ' ')); - break; - - default: - return null; - } - - parseContentElement(sm, cepp, paragraphSpans, result); - - return result; - } - - private List<String> tokenize(SpanManager sm, int start, int end, - String delim) - { - List<String> result = new ArrayList<>(); - - if (start > end) - { - logger.debug("tokenize({},{}) doesn't make sense", start, end); - return result; - } - - int s = start; - int e; - String token; - // Span rs; - while ((e = sm.indexOf(delim, s, end)) != -1) - { - // rs = new Span(s, e).trim( sm ); - // if( rs.length()>0 ) result.add( sm.substring( rs ) ); - token = sm.substring(s, e).trim(); - if (token.length() > 0) - { - result.add(token); - } - s = e + delim.length(); - } - // rs = new Span(s, end).trim( sm ); - // if( rs.length()>0 ) result.add( sm.substring( rs ) ); 
- token = sm.substring(s, end).trim(); - if (token.length() > 0) - { - result.add(token); - } - - return result; - } - - private void parseExternalLinks(SpanManager sm, Span s, String protocol, - List<Span> managedList, List<Link> links, Content home_cc) - { - int extLinkTargetStart; - Span extLinkSpan = new Span(0, s.getStart()); - - while ((extLinkTargetStart = sm.indexOf(protocol, extLinkSpan.getEnd(), - s.getEnd())) != -1) - { - - // Allowed char before the protocol identifer ? - if (extLinkTargetStart > s.getStart() - && (" [").indexOf(sm.charAt(extLinkTargetStart - 1)) == -1) - { - extLinkSpan = new Span(0, extLinkTargetStart + 1); - continue; - } - - // Target - int extLinkTargetEnd = extLinkTargetStart; - while ((lineSeparator + " ]").indexOf(sm.charAt(extLinkTargetEnd)) == -1) - { - extLinkTargetEnd++; - } - - // Open/Close Tags - int extLinkOpenTag = extLinkTargetStart - 1; - int extLinkCloseTag; - int extLinkTextStart = extLinkTargetStart; - int extLinkTextEnd = extLinkTargetEnd; - - while (extLinkOpenTag >= s.getStart() - && sm.charAt(extLinkOpenTag) == ' ') - { - extLinkOpenTag--; - } - - if (extLinkOpenTag >= s.getStart() - && sm.charAt(extLinkOpenTag) == '[') - { - extLinkCloseTag = sm.indexOf("]", extLinkTargetEnd, s.getEnd()); - - if (extLinkCloseTag != -1) - { - extLinkTextStart = extLinkTargetEnd; - // nicht wie bei "normalen" links durhc | getrennt sondenr - // durhc leerzeichen !!! schei�e !!! 
- while (sm.charAt(extLinkTextStart) == ' ') - { - extLinkTextStart++; - } - extLinkTextEnd = extLinkCloseTag; - extLinkCloseTag++; - - if (extLinkTextStart == extLinkTextEnd) - { - sm.insert(extLinkTextStart, "[ ]"); - extLinkTextEnd += 3; - extLinkCloseTag += 3; - } - } - else - { - extLinkOpenTag = extLinkTargetStart; - extLinkCloseTag = extLinkTargetEnd; - } - } - else - { - extLinkOpenTag = extLinkTargetStart; - extLinkCloseTag = extLinkTargetEnd; - } - - extLinkSpan = new Span(extLinkOpenTag, extLinkCloseTag); - managedList.add(extLinkSpan); - - Link l = new Link(home_cc, extLinkSpan, sm.substring( - extLinkTargetStart, extLinkTargetEnd), Link.type.EXTERNAL, - null); - links.add(l); - - if (calculateSrcSpans) - { - l.setSrcSpan(new SrcSpan(sm.getSrcPos(extLinkOpenTag), sm - .getSrcPos(extLinkCloseTag - 1) + 1)); - } - - sm.delete(extLinkTextEnd, extLinkCloseTag); - sm.delete(extLinkOpenTag, extLinkTextStart); - } - } - - /** - * Returns the LOWERCASE NameSpace of the link target - */ - private static String getLinkNameSpace(String target) - { - int pos = target.indexOf(':'); - if (pos == -1) - { - return null; - } - else - { - return target.substring(0, pos).replace('_', ' ').trim() - .toLowerCase(); - } - } - - /** - * There is not much differences between links an images, so they are parsed - * in a single step - */ - private void parseImagesAndInternalLinks(SpanManager sm, - List<Span> linkSpans, List<Link> links) - { - - sm.manageList(linkSpans); - - int pos = -1; - Stack<Integer> linkOpenTags = new Stack<>(); - while ((pos = sm.indexOf("[[", pos + 1)) != -1) - { - linkOpenTags.push(pos); - } - - Span lastLinkSpan = new Span(sm.length() + 1, sm.length() + 1); - Link.type linkType = Link.type.INTERNAL; - - while (!linkOpenTags.empty()) - { - int linkStartTag = linkOpenTags.pop(); - int linkEndTag = sm.indexOf("]]", linkStartTag); - if (linkEndTag == -1) - { - continue; - } - - int linkOptionTag = sm.indexOf("|", linkStartTag, linkEndTag); - - int 
linkTextStart; - String linkTarget; - - if (linkOptionTag != -1) - { - linkTextStart = linkOptionTag + 1; - linkTarget = sm.substring(new Span(linkStartTag + 2, - linkOptionTag).trim(sm)); - } - else - { - linkTextStart = linkStartTag + 2; - linkTarget = sm - .substring(new Span(linkStartTag + 2, linkEndTag) - .trim(sm)); - } - - // is is a regular link ? - if (linkTarget.contains(lineSeparator)) - { - continue; - } - linkTarget = encodeWikistyle(linkTarget); - - // so it is a Link or image!!! - List<String> parameters; - - String namespace = getLinkNameSpace(linkTarget); - if (namespace != null) - { - if (imageIdentifers.indexOf(namespace) != -1) - { - if (linkOptionTag != -1) - { - int temp; - while ((temp = sm.indexOf("|", linkTextStart, - linkEndTag)) != -1) - { - linkTextStart = temp + 1; - } - - parameters = tokenize(sm, linkOptionTag + 1, - linkEndTag, "|"); - - // maybe there is an external link at the end of the - // image description... - if (sm.charAt(linkEndTag + 2) == ']' - && sm.indexOf("[", linkTextStart, linkEndTag) != -1) - { - linkEndTag++; - } - } - else - { - parameters = null; - } - linkType = Link.type.IMAGE; - } - else - { - //Link has namespace but is not image - linkType = Link.type.UNKNOWN; - parameters = null; - } - } - else - { - if (linkType == Link.type.INTERNAL - && lastLinkSpan.hits(new Span(linkStartTag, - linkEndTag + 2))) - { - continue; - } - parameters = null; - linkType = Link.type.INTERNAL; - } - - Span posSpan = new Span(linkTextStart, linkEndTag).trim(sm); - linkSpans.add(posSpan); - - Link l = new Link(null, posSpan, linkTarget, linkType, parameters); - links.add(l); - - if (calculateSrcSpans) - { - l.setSrcSpan(new SrcSpan(sm.getSrcPos(linkStartTag), sm - .getSrcPos(linkEndTag + 2))); - } - - sm.delete(posSpan.getEnd(), linkEndTag + 2); - sm.delete(linkStartTag, posSpan.getStart()); - - // removing line separators in link text - int lsinlink; - while ((lsinlink = sm.indexOf(lineSeparator, posSpan)) != -1) - { - 
sm.replace(lsinlink, lsinlink + lineSeparator.length(), " "); - } - - lastLinkSpan = posSpan; - } - } - - /** - * Searches the Range given by the Span s for the double occurence of - * "quotation" and puts the results in the List quotedSpans. The Quotation - * tags will be deleted. - * - * @param sm - * , the Source in which will be searched - * @param s - * , the range in which will be searched - * @param quotedSpans - * , the List where the Spans will be placed, should be managed - * by the SpanManager sm - * @param quotation - * , the start and end tag as String - */ - private void parseQuotedSpans(SpanManager sm, Span s, - List<Span> quotedSpans, String quotation) - { - - final int qlen = quotation.length(); - - // get the start position - int start = sm.indexOf(quotation, s.getStart(), s.getEnd()); - - while (start != -1) - { - - // get the end position - int end = sm.indexOf(quotation, start + qlen, s.getEnd()); - if (end == -1) - { - break; - } - - // build a new span from start and end position. - Span qs = new Span(start, end); - quotedSpans.add(qs); - - // calculate the original src positions. - if (calculateSrcSpans) - { - qs.setSrcSpan(new SrcSpan(sm.getSrcPos(start), sm.getSrcPos(end - + qlen - 1) + 1)); - } - - // delete the tags. - sm.delete(end, end + qlen); - sm.delete(start, start + qlen); - - // get the next start position - start = sm.indexOf(quotation, qs.getEnd(), s.getEnd()); - } - } - - /** - * Searches a line for Bold and Italic quotations, this has to be done - * linewhise. - */ - private void parseBoldAndItalicSpans(SpanManager sm, Span line, - List<Span> boldSpans, List<Span> italicSpans) - { - // Das suchen nach BOLD und ITALIC muss in den Jeweiligen - // Zeilen geschenhen, da ein LineSeparator immer BOLD und - // Italic Tags schliesst. - - // Bold Spans - parseQuotedSpans(sm, line, boldSpans, "'''"); - - // Italic Spans - parseQuotedSpans(sm, line, italicSpans, "''"); - - // Maybe there is ONE SINGLE OPEN TAG left... handel these... 
- int openTag = sm.indexOf("''", line); - if (openTag != -1) - { - // build a Span from this Tag. - Span qs = new Span(openTag, line.getEnd()); - - // calculate the original src positions. - if (calculateSrcSpans) - { - qs.setSrcSpan(new SrcSpan(sm.getSrcPos(openTag), sm - .getSrcPos(line.getEnd()))); - } - - // is it a Bold or an Italic tag ? - if (sm.indexOf("'''", openTag, openTag + 3) != -1) - { - // --> BOLD - boldSpans.add(qs); - sm.delete(openTag, openTag + 3); - } - else - { - // --> ITALIC - italicSpans.add(qs); - sm.delete(openTag, openTag + 2); - } - } - } - - private static String encodeWikistyle(String str) - { - return str.replace(' ', '_'); - } - - /** - * Building a ContentElement from a String - */ - @Override - public ContentElement parseContentElement(String src) - { - SpanManager sm = new SpanManager(src); - ContentElementParsingParameters cepp = new ContentElementParsingParameters(); - - parseImagesAndInternalLinks(sm, cepp.linkSpans, cepp.links); - - LinkedList<Span> lineSpans = new LinkedList<>(); - getLineSpans(sm, lineSpans); - sm.removeManagedList(lineSpans); - return (parseContentElement(sm, cepp, lineSpans, new ContentElement())); - } - - /** - * Building a ContentElement from a single line. - */ - private ContentElement parseContentElement(SpanManager sm, - ContentElementParsingParameters cepp, Span lineSpan) - { - LinkedList<Span> lineSpans = new LinkedList<>(); - lineSpans.add(lineSpan); - return parseContentElement(sm, cepp, lineSpans, new ContentElement()); - } - - /** - * Building a ContentElement from a single line. But the result is given, so - * e.g. a NestedListElement can be filled with information... 
- */ - private ContentElement parseContentElement(SpanManager sm, - ContentElementParsingParameters cepp, Span lineSpan, - ContentElement result) - { - LinkedList<Span> lineSpans = new LinkedList<>(); - lineSpans.add(lineSpan); - return parseContentElement(sm, cepp, lineSpans, result); - } - - /** - * Building a ContentElement, this funciton is calles by all the other - * parseContentElement(..) functions - */ - private ContentElement parseContentElement(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans, - ContentElement result) - { - - List<Link> localLinks = new ArrayList<>(); - List<Template> localTemplates = new ArrayList<>(); - - List<Span> boldSpans = new ArrayList<>(); - List<Span> italicSpans = new ArrayList<>(); - sm.manageList(boldSpans); - sm.manageList(italicSpans); - - List<Span> managedSpans = new ArrayList<>(); - sm.manageList(managedSpans); - - Span contentElementRange = new Span(lineSpans.getFirst().getStart(), - lineSpans.getLast().getEnd()).trim(sm); - managedSpans.add(contentElementRange); - - // set the SrcSpan - if (calculateSrcSpans) - { - result.setSrcSpan(new SrcSpan(sm.getSrcPos(contentElementRange - .getStart()), sm.getSrcPos(contentElementRange.getEnd()))); - } - - sm.manageList(lineSpans); - while (!lineSpans.isEmpty()) - { - Span line = lineSpans.getFirst(); - - parseBoldAndItalicSpans(sm, line, boldSpans, italicSpans); - - // External links - parseExternalLinks(sm, line, "http://", managedSpans, localLinks, - result); - parseExternalLinks(sm, line, "https://", managedSpans, localLinks, - result); - parseExternalLinks(sm, line, "ftp://", managedSpans, localLinks, - result); - parseExternalLinks(sm, line, "mailto:", managedSpans, localLinks, - result); - - // end of linewhise opperations - lineSpans.removeFirst(); - } - sm.removeManagedList(lineSpans); - - // Links - int i; - i = 0; - while (i < cepp.linkSpans.size()) - { - if (contentElementRange.hits(cepp.linkSpans.get(i))) - { - Span linkSpan = 
cepp.linkSpans.remove(i); - managedSpans.add(linkSpan); - Link l = cepp.links.remove(i).setHomeElement(result); - localLinks.add(l); - if (!showImageText && l.getType() == Link.type.IMAGE) - { - // deletes the Image Text from the ContentElement Text. - sm.delete(linkSpan); - } - } - else - { - i++; - } - } - - // Templates - i = 0; - while (i < cepp.templateSpans.size()) - { - Span ts = cepp.templateSpans.get(i); - if (contentElementRange.hits(ts)) - { - ResolvedTemplate rt = cepp.templates.remove(i); - - if (rt.getPostParseReplacement() != null) - { - sm.replace(ts, rt.getPostParseReplacement()); - } - cepp.templateSpans.remove(i); - - Object parsedObject = rt.getParsedObject(); - if (parsedObject != null) - { - managedSpans.add(ts); - - Class<?> parsedObjectClass = parsedObject.getClass(); - if (parsedObjectClass == Template.class) - { - localTemplates.add((Template) parsedObject); - } - else if (parsedObjectClass == Link.class) - { - localLinks.add(((Link) parsedObject) - .setHomeElement(result)); - } - else - { - localTemplates.add(rt.getTemplate()); - } - } - } - else - { - i++; - } - } - - // HTML/XML Tags - i = 0; - List<Span> tags = new ArrayList<>(); - while (i < cepp.tagSpans.size()) - { - Span s = cepp.tagSpans.get(i); - if (contentElementRange.hits(s)) - { - cepp.tagSpans.remove(i); - if (deleteTags) - { - sm.delete(s); - } - else - { - tags.add(s); - managedSpans.add(s); - } - } - else - { - i++; - } - } - - // noWiki - i = 0; - List<Span> localNoWikiSpans = new ArrayList<>(); - while (i < cepp.noWikiSpans.size()) - { - Span s = cepp.noWikiSpans.get(i); - if (contentElementRange.hits(s)) - { - cepp.noWikiSpans.remove(i); - sm.replace(s, cepp.noWikiStrings.remove(i)); - localNoWikiSpans.add(s); - managedSpans.add(s); - } - else - { - i++; - } - } - - // MATH Tags - i = 0; - List<Span> mathSpans = new ArrayList<>(); - while (i < cepp.mathSpans.size()) - { - Span s = cepp.mathSpans.get(i); - if (contentElementRange.hits(s)) - { - cepp.mathSpans.remove(i); 
- - if (showMathTagContent) - { - mathSpans.add(s); - managedSpans.add(s); - sm.replace(s, cepp.mathStrings.remove(i)); - } - else - { - sm.delete(s); - } - } - else - { - i++; - } - } - - result.setText(sm.substring(contentElementRange)); - - // managed spans must be removed here and not earlier, because every - // change in the SpanManager affects the Spans! - sm.removeManagedList(boldSpans); - sm.removeManagedList(italicSpans); - sm.removeManagedList(managedSpans); - - // contentElementRange ist auch noch in managedSpans !!! deswegen: - final int adjust = -contentElementRange.getStart(); - for (Span s : boldSpans) - { - s.adjust(adjust); - } - for (Span s : italicSpans) - { - s.adjust(adjust); - } - for (Span s : managedSpans) - { - s.adjust(adjust); - } - - result.setFormatSpans(FormatType.BOLD, boldSpans); - result.setFormatSpans(FormatType.ITALIC, italicSpans); - result.setFormatSpans(FormatType.TAG, tags); - result.setFormatSpans(FormatType.MATH, mathSpans); - result.setFormatSpans(FormatType.NOWIKI, localNoWikiSpans); - - result.setLinks(sortLinks(localLinks)); - result.setTemplates(sortTemplates(localTemplates)); - - return result; - } - - /** - * Sorts the Links... - */ - private static List<Link> sortLinks(List<Link> links) - { - List<Link> result = new ArrayList<>(); - for (Link l : links) - { - int pos = 0; - while (pos < result.size() - && l.getPos().getStart() > result.get(pos).getPos() - .getStart()) - { - pos++; - } - result.add(pos, l); - } - return result; - } - - /** - * Sorts the Templates... 
- */ - private static List<Template> sortTemplates(List<Template> templates) - { - List<Template> result = new ArrayList<>(); - for (Template t : templates) - { - int pos = 0; - while (pos < result.size() - && t.getPos().getStart() > result.get(pos).getPos() - .getStart()) - { - pos++; - } - result.add(pos, t); - } - return result; - } - - /** - * Algorithm to identify the first paragraph of a ParsedPage - */ - private void setFirstParagraph(ParsedPage pp) - { - int nr = pp.nrOfParagraphs(); - - // the paragraph with the lowest number, must not be the first, maybe it - // is only an Image... - for (int i = 0; i < nr; i++) - { - Paragraph p = pp.getParagraph(i); - - // get the Text from the paragraph - SpanManager ptext = new SpanManager(p.getText()); - List<Span> delete = new ArrayList<>(); - ptext.manageList(delete); - - // getting the spans to remove from the text, for templates - List<Template> tl = p.getTemplates(); - for (int j = tl.size() - 1; j >= 0; j--) - { - delete.add(tl.get(j).getPos()); - } - - // getting the spans to remove from the text, for Tags - List<Span> sl = p.getFormatSpans(FormatType.TAG); - for (int j = sl.size() - 1; j >= 0; j--) - { - delete.add(sl.get(j)); - } - - // getting the spans to remove from the text, for image text - if (showImageText) - { - List<Link> ll = p.getLinks(Link.type.IMAGE); - for (int j = ll.size() - 1; j >= 0; j--) - { - delete.add(ll.get(j).getPos()); - } - } - - // delete the spans in reverse order, the spans are managed, so - // there is no need to sort them - for (int j = delete.size() - 1; j >= 0; j--) - { - ptext.delete(delete.remove(j)); - } - - // removing line separators if exist, so the result can be trimmed - // in the next step - int pos = ptext.indexOf(lineSeparator); - while (pos != -1) - { - ptext.delete(pos, pos + lineSeparator.length()); - pos = ptext.indexOf(lineSeparator); - } - - // if the result is not an empty string, we got the number of the - // first paragraph - if 
(!ptext.toString().trim().equals("")) - { - pp.setFirstParagraphNr(i); - return; - } - } - } - - /** - * Container for all the Parameters needed in the parseing process - * - * - */ - class ContentElementParsingParameters - { - final List<Span> noWikiSpans; - final List<String> noWikiStrings; - final List<Span> linkSpans; - final List<Link> links; - final List<Span> templateSpans; - final List<ResolvedTemplate> templates; - final List<Span> tagSpans; - final List<Span> mathSpans; - final List<String> mathStrings; - - ContentElementParsingParameters() - { - noWikiSpans = new ArrayList<>(); - noWikiStrings = new ArrayList<>(); - linkSpans = new ArrayList<>(); - links = new ArrayList<>(); - templateSpans = new ArrayList<>(); - templates = new ArrayList<>(); - tagSpans = new ArrayList<>(); - mathSpans = new ArrayList<>(); - mathStrings = new ArrayList<>(); - } - } + MediaWikiContentElementParser { + + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + // Options, set by the ParserFactory + private String lineSeparator; + private List<String> categoryIdentifers; + private List<String> languageIdentifers; + private List<String> imageIdentifers; + private MediaWikiTemplateParser templateParser; + private boolean showImageText = false; + private boolean deleteTags = true; + private boolean showMathTagContent = true; + private boolean calculateSrcSpans = true; + + /** + * Creates a un-configured {@link ModularParser}... + */ + public ModularParser() { + } + + /** + * Creates a fully configured {@link ModularParser}... 
+ */ + public ModularParser(String lineSeparator, List<String> languageIdentifers, + List<String> categoryIdentifers, List<String> imageIdentifers, + boolean showImageText, boolean deleteTags, + boolean showMathTagContent, boolean calculateSrcSpans, + MediaWikiTemplateParser templateParser) { + + setLineSeparator(lineSeparator); + setLanguageIdentifers(languageIdentifers); + setCategoryIdentifers(categoryIdentifers); + setImageIdentifers(imageIdentifers); + setShowImageText(showImageText); + setDeleteTags(deleteTags); + setShowMathTagContent(showMathTagContent); + setCalculateSrcSpans(calculateSrcSpans); + setTemplateParser(templateParser); + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + @Override + public String getLineSeparator() { + return lineSeparator; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setLineSeparator(String lineSeparator) { + this.lineSeparator = lineSeparator; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public List<String> getLanguageIdentifers() { + return languageIdentifers; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setLanguageIdentifers(List<String> languageIdentifers) { + this.languageIdentifers = listToLowerCase(languageIdentifers); + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public List<String> getCategoryIdentifers() { + return categoryIdentifers; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setCategoryIdentifers(List<String> categoryIdentifers) { + this.categoryIdentifers = listToLowerCase(categoryIdentifers); + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public List<String> getImageIdentifers() { + return imageIdentifers; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... 
+ */ + public void setImageIdentifers(List<String> imageIdentifers) { + this.imageIdentifers = listToLowerCase(imageIdentifers); + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public MediaWikiTemplateParser getTemplateParser() { + return templateParser; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setTemplateParser(MediaWikiTemplateParser templateParser) { + this.templateParser = templateParser; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public boolean showImageText() { + return showImageText; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setShowImageText(boolean showImageText) { + this.showImageText = showImageText; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public boolean deleteTags() { + return deleteTags; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setDeleteTags(boolean deleteTags) { + this.deleteTags = deleteTags; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public boolean showMathTagContent() { + return showMathTagContent; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setShowMathTagContent(boolean showMathTagContent) { + this.showMathTagContent = showMathTagContent; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public boolean calculateSrcSpans() { + return calculateSrcSpans; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setCalculateSrcSpans(boolean calculateSrcSpans) { + this.calculateSrcSpans = calculateSrcSpans; + } + + /** + * Converts a List of Strings to lower case Strings. 
+ */ + private List<String> listToLowerCase(List<String> l) { + List<String> result = new ArrayList<>(); + for (String s : l) { + result.add(s.toLowerCase()); + } + return result; + } + + /** + * Look at the MediaWikiParser interface for a description... + */ + @Override + public String configurationInfo() { + StringBuilder result = new StringBuilder(); + + result.append("MediaWikiParser configuration:\n"); + result.append("ParserClass: " + this.getClass() + "\n"); + result.append("ShowImageText: " + showImageText + "\n"); + result.append("DeleteTags: " + deleteTags + "\n"); + result.append("ShowMathTagContent: " + showMathTagContent + "\n"); + result.append("CalculateSrcSpans: " + calculateSrcSpans + "\n"); + + result.append("LanguageIdentifers: "); + for (String s : languageIdentifers) { + result.append(s + " "); + } + result.append("\n"); + + result.append("CategoryIdentifers: "); + for (String s : categoryIdentifers) { + result.append(s + " "); + } + result.append("\n"); + + result.append("ImageIdentifers: "); + for (String s : imageIdentifers) { + result.append(s + " "); + } + result.append("\n"); + + result.append("TemplateParser: " + templateParser.getClass() + "\n"); + result.append(templateParser.configurationInfo()); + + return result.toString(); + } + + /** + * Checks if the configuration is runnable. + */ + private boolean runConfig() { + if (lineSeparator == null) { + logger.debug("Set lineSeparator"); + return false; + } + if (categoryIdentifers == null) { + logger.warn("Set categoryIdentifers"); + return false; + } + if (languageIdentifers == null) { + logger.warn("Set languageIdentifers"); + return false; + } + if (imageIdentifers == null) { + logger.warn("Set imageIdentifers"); + return false; + } + if (templateParser == null) { + logger.warn("Set templateParser"); + return false; + } + return true; + } + + /** + * Look at the {@link MediaWikiParser} for a description... 
+ */ + @Override + public ParsedPage parse(String src) { + // check if the configuration is runnable. + if (!runConfig()) { + return null; + } + + // check if the is something to parse. sometimes there is an empty string + // due to an error of other classes... + if (src == null || src.length() == 0) { + return null; + } + + // creates a new span manager with the given source, appending a newline + // to avoid errors. + SpanManager sm = new SpanManager(src.replace('\t', ' ') + lineSeparator); + if (calculateSrcSpans) { + sm.enableSrcPosCalculation(); + } + + // Creating a new ParsePage, which will be filled with information in + // the parseing process. + ParsedPage ppResult = new ParsedPage(); + + // Creating a new Parameter Container + ContentElementParsingParameters cepp = new ContentElementParsingParameters(); + + // Deletes comments out of the Source + deleteComments(sm); + + // Deletes any TOC Tags, these are not usesd in this parser. + deleteTOCTag(sm); + + // Removing the Content which should not parsed but integrated later in + // the resulting text + sm.manageList(cepp.noWikiSpans); + parseSpecifiedTag(sm, cepp.noWikiSpans, cepp.noWikiStrings, "PRE", " "); + parseSpecifiedTag(sm, cepp.noWikiSpans, cepp.noWikiStrings, "NOWIKI"); + if (cepp.noWikiSpans.size() == 0) { + sm.removeManagedList(cepp.noWikiSpans); + } + + // Parseing the Math Tags... + sm.manageList(cepp.mathSpans); + parseSpecifiedTag(sm, cepp.mathSpans, cepp.mathStrings, "MATH"); + if (cepp.mathSpans.size() == 0) { + sm.removeManagedList(cepp.mathSpans); + } + + // Parseing the Templates (the Span List will be added to the managed + // lists by the function) + parseTemplates(sm, cepp.templateSpans, cepp.templates, ppResult); + + // Parsing all other Tags + parseTags(sm, cepp.tagSpans); + + // Converting <gallery>s to normal Images, this is not beautiful, but + // a simple solution.. + convertGalleriesToImages(sm, cepp.tagSpans); + + // Parsing Links and Images. 
+ parseImagesAndInternalLinks(sm, cepp.linkSpans, cepp.links); + + // Creating a list of Line Spans to work with lines in the following + // functions + LinkedList<Span> lineSpans = new LinkedList<>(); + getLineSpans(sm, lineSpans); + + // Removing the Category Links from the Links list, and crating an + // ContentElement for these links... + ppResult.setCategoryElement(getSpecialLinks(sm, cepp.linkSpans, + cepp.links, " - ", categoryIdentifers)); + + // Removing the Language Links from the Links list, and crating an + // ContentElement for these links... + ppResult.setLanguagesElement(getSpecialLinks(sm, cepp.linkSpans, + cepp.links, " - ", languageIdentifers)); + + // Parsing and Setting the Sections... the main work is done in parse + // sections! + ppResult.setSections(EmptyStructureRemover + .eliminateEmptyStructures(parseSections(sm, cepp, lineSpans))); + + // Finding and Setting the paragraph which is concidered as the "First" + setFirstParagraph(ppResult); + + // check the calculated source positions, and reset them if necessary. + if (calculateSrcSpans) { + SrcPosRangeChecker.checkRange(ppResult); + } + + // So it is done... + return ppResult; + } + + + /** + * Deleting all comments out of the SpanManager...<br> + * <!-- COMMENT --> + */ + private void deleteComments(SpanManager sm) { + int start = 0; + while ((start = sm.indexOf("<!--", start)) != -1) { + int end = sm.indexOf("-->", start + 4) + 3; + if (end == -1 + 3) { + end = sm.length(); + } + + // Remove the one lineSeparator too, if the whole line is a comment! 
+ try { + if (lineSeparator.equals(sm.substring(start + - lineSeparator.length(), start)) + && lineSeparator.equals(sm.substring(end, end + + lineSeparator.length()))) { + end += lineSeparator.length(); + } + } catch (IndexOutOfBoundsException e) { + } + + sm.delete(start, end); + } + } + + /** + * Deleteing ALL TOC Tags + */ + private void deleteTOCTag(SpanManager sm) { + // delete all __TOC__ from SRC + int temp = 0; + while ((temp = sm.indexOf("__TOC__", temp)) != -1) { + sm.delete(temp, temp + 2 + 3 + 2); + } + + // delete all __NOTOC__ from SRC + temp = 0; + while ((temp = sm.indexOf("__NOTOC__", temp)) != -1) { + sm.delete(temp, temp + 2 + 5 + 2); + } + } + + private ContentElement getSpecialLinks(SpanManager sm, + List<Span> linkSpans, List<Link> links, String linkSpacer, + List<String> identifers) { + ContentElement result = new ContentElement(); + StringBuilder text = new StringBuilder(); + List<Link> localLinks = new ArrayList<>(); + + for (int i = links.size() - 1; i >= 0; i--) { + String identifer = getLinkNameSpace(links.get(i).getTarget()); + + if (identifer != null && identifers.indexOf(identifer) != -1) { + Link l = links.remove(i); + Span s = linkSpans.remove(i); + String linkText = sm.substring(s); + sm.delete(s); + l.setHomeElement(result); + s.adjust(-s.getStart() + text.length()); + text.append(linkText + linkSpacer); + localLinks.add(l); + //TODO add type? 
+ } + } + + int len = text.length(); + if (len != 0) { + text.delete(len - linkSpacer.length(), len); + } + + result.setText(text.toString()); + result.setLinks(localLinks); + + if (result.empty()) { + return null; + } else { + return result; + } + } + + private void getLineSpans(SpanManager sm, LinkedList<Span> lineSpans) { + sm.manageList(lineSpans); + + int start = 0; + int end; + + while ((end = sm.indexOf(lineSeparator, start)) != -1) { + lineSpans.add(new Span(start, end).trimTrail(sm)); + start = end + lineSeparator.length(); + } + lineSpans.add(new Span(start, sm.length()).trimTrail(sm)); + + while (!lineSpans.isEmpty() && lineSpans.getFirst().length() == 0) { + lineSpans.removeFirst(); + } + while (!lineSpans.isEmpty() && lineSpans.getLast().length() == 0) { + lineSpans.removeLast(); + } + } + + private SectionContainer parseSections(SpanManager sm, + ContentElementParsingParameters cepp, LinkedList<Span> lineSpans) { + + List<SectionContent> contentSections = new ArrayList<>(); + + SectionContent sc = new SectionContent(1); + + if (calculateSrcSpans) { + sc.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst() + .getStart()), -1)); + } + + // Identify the Line Type and call the necessary Function for the + // further handling... 
+ while (!lineSpans.isEmpty()) { + + Span s = lineSpans.getFirst(); + + lineType t = getLineType(sm, s); + switch (t) { + case SECTION: + contentSections.add(sc); + int level = getSectionLevel(sm, s); + sc = new SectionContent(parseContentElement(sm, cepp, new Span( + s.getStart() + level, s.getEnd() - level).trim(sm)), + level); + lineSpans.removeFirst(); + + if (calculateSrcSpans) { + sc.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), -1)); + } + + break; + + case HR: + // remove the HR (----) and handle the rest as a parapraph line + removeHr(sm, s); + t = lineType.PARAGRAPH; + case PARAGRAPH: + case PARAGRAPH_BOXED: + case PARAGRAPH_INDENTED: + sc.addParagraph(buildParagraph(sm, cepp, lineSpans, t)); + break; + + case NESTEDLIST: + case NESTEDLIST_NR: + sc.addNestedList(buildNestedList(sm, cepp, lineSpans, t)); + break; + + case DEFINITIONLIST: + sc.addDefinitionList(buildDefinitionList(sm, cepp, lineSpans)); + break; + + case TABLE: + sc.addTable(buildTable(sm, cepp, lineSpans)); + break; + + case EMPTYLINE: + lineSpans.removeFirst(); + break; + + default: + logger.error("unknown lineStart!: \"" + sm.substring(s) + "\""); + lineSpans.removeFirst(); + } + } + + // add the remaining Section to the list. + contentSections.add(sc); + + return buildSectionStructure(contentSections); + } + + private Span removeHr(SpanManager sm, Span s) { + int start = s.getStart(); + final int end = s.getEnd(); + while (sm.charAt(start) == '-' && start < end) { + start++; + } + return s.setStart(start).trim(sm); + } + + /** + * The Line Types wich are possible... + */ + private enum lineType { + SECTION, TABLE, NESTEDLIST, NESTEDLIST_NR, DEFINITIONLIST, HR, PARAGRAPH, PARAGRAPH_INDENTED, PARAGRAPH_BOXED, EMPTYLINE + } + + /** + * Retunrns the Type of a line, this is mainly done by the First Char of the + * Line... 
+ */ + private lineType getLineType(SpanManager sm, Span lineSpan) { + + switch (lineSpan.charAt(0, sm)) { + + case '{': + if (lineSpan.charAt(1, sm) == '|') { + return lineType.TABLE; + } else { + return lineType.PARAGRAPH; + } + + case '=': + if (lineSpan.length() > 2 + && sm.charAt(lineSpan.getEnd() - 1) == '=') { + return lineType.SECTION; + } else { + return lineType.PARAGRAPH; + } + + case '-': + if (lineSpan.charAt(1, sm) == '-' && lineSpan.charAt(2, sm) == '-' + && lineSpan.charAt(3, sm) == '-') { + return lineType.HR; + } else { + return lineType.PARAGRAPH; + } + + case '*': + return lineType.NESTEDLIST; + + case '#': + return lineType.NESTEDLIST_NR; + + case ';': + return lineType.DEFINITIONLIST; + + case ':': + if (lineSpan.length() > 1) { + if (lineSpan.length() > 2 && lineSpan.charAt(1, sm) == '{' + && lineSpan.charAt(2, sm) == '|') { + return lineType.TABLE; + } else { + return lineType.PARAGRAPH_INDENTED; + } + } else { + return lineType.PARAGRAPH; + } + + case ' ': + int nonWSPos = lineSpan.nonWSCharPos(sm); + switch (lineSpan.charAt(nonWSPos, sm)) { + case Span.ERRORCHAR: + return lineType.EMPTYLINE; + case '{': + if (lineSpan.charAt(nonWSPos + 1, sm) == '|') { + return lineType.TABLE; + } + default: + return lineType.PARAGRAPH_BOXED; + } + + case Span.ERRORCHAR: + return lineType.EMPTYLINE; + + default: + return lineType.PARAGRAPH; + } + } + + /** + * Returns the number of Equality Chars which are used to specify the level + * of the Section. + */ + private int getSectionLevel(SpanManager sm, Span sectionNameSpan) { + int begin = sectionNameSpan.getStart(); + int end = sectionNameSpan.getEnd(); + int level = 0; + + try { + while ((sm.charAt(begin + level) == '=') + && (sm.charAt(end - 1 - level) == '=')) { + level++; + } + } catch (StringIndexOutOfBoundsException e) { + // there is no need to do anything! 
+ logger.debug("EXCEPTION IS OK: {}", e.getLocalizedMessage()); + } + + if (begin + level == end) { + level = (level - 1) / 2; + } + + return level; + } + + /** + * Takes a list of SectionContent and returns a SectionContainer with the + * given SectionContent s in the right structure. + */ + private SectionContainer buildSectionStructure(List<SectionContent> scl) { + SectionContainer result = new SectionContainer(0); + + for (SectionContent sContent : scl) { + int contentLevel = sContent.getLevel(); + SectionContainer sContainer = result; + + // get the right SectionContainer or create it + for (int containerLevel = result.getLevel() + 1; containerLevel < contentLevel; containerLevel++) { + int containerSubSections = sContainer.nrOfSubSections(); + if (containerSubSections != 0) { + Section temp = sContainer + .getSubSection(containerSubSections - 1); + if (temp.getClass() == SectionContainer.class) { + sContainer = (SectionContainer) temp; + } else { + SectionContainer sct = new SectionContainer(temp + .getTitleElement(), containerLevel); + sct.addSection(temp); + if (calculateSrcSpans) { + sct.setSrcSpan(temp.getSrcSpan()); + } + temp.setTitleElement(null); + temp.setLevel(containerLevel + 1); + sContainer.removeSection(temp); + sContainer.addSection(sct); + sContainer = sct; + } + } else { + sContainer = new SectionContainer(null, containerLevel); + } + } + + sContainer.addSection(sContent); + } + + if (calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(0, -1)); + } + + return result; + } + + private boolean startsWithIgnoreCase(String s1, String s2) { + final int s2len = s2.length(); + if (s1.length() < s2len) { + return false; + } + return s1.substring(0, s2len).equalsIgnoreCase(s2); + } + + private Span getTag(SpanManager sm, int offset) { + int start = sm.indexOf("<", offset); + if (start == -1) { + return null; + } + int end = sm.indexOf(">", start); + if (end == -1) { + return null; + } + + Span s = new Span(start, end + 1); + if (calculateSrcSpans) { + 
s + .setSrcSpan(new SrcSpan(sm.getSrcPos(start), sm + .getSrcPos(end) + 1)); + } + return s; + } + + private String getTagText(SpanManager sm, Span tag) { + return sm.substring(new Span(tag.getStart() + 1, tag.getEnd() - 1) + .trim(sm)); + } + + private void parseSpecifiedTag(SpanManager sm, List<Span> spans, + List<String> strings, String specifier) { + parseSpecifiedTag(sm, spans, strings, specifier, ""); + } + + private void parseSpecifiedTag(SpanManager sm, List<Span> spans, + List<String> strings, String specifier, String prefix) { + int offset = 0; + + Span s; + while ((s = getTag(sm, offset)) != null) { + offset = s.getEnd(); + String tagText = getTagText(sm, s); + if (startsWithIgnoreCase(tagText, specifier)) { + + Span e; + while ((e = getTag(sm, offset)) != null) { + offset = e.getEnd(); + tagText = getTagText(sm, e); + if (startsWithIgnoreCase(tagText, "/" + specifier)) { + break; + } + } + + if (e == null) { + /* + * OF: Setting e to sm.length()results in ArrayIndexOutOfBoundsExeption if calculateSrcSpans=true + */ + //e = new Span(sm.length(), sm.length()); + e = new Span(Math.max(0, sm.length() - 1), Math.max(0, sm.length() - 1)); + } + + strings.add(sm.substring(s.getEnd(), e.getStart())); + + Span tSpan = new Span(s.getStart(), e.getEnd()); + if (calculateSrcSpans) { + tSpan.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), sm + .getSrcPos(e.getEnd()))); + } + + spans.add(tSpan); + sm.replace(tSpan, prefix + "(" + specifier + ")"); + tSpan.adjustStart(prefix.length()); + + offset = tSpan.getEnd(); + } + } + } + + private void parseTags(SpanManager sm, List<Span> spans) { + sm.manageList(spans); + + Span s = new Span(0, 0); + while ((s = getTag(sm, s.getEnd())) != null) { + spans.add(s); + } + + if (spans.size() == 0) { + sm.removeManagedList(spans); + } + } + + private void parseTemplates(SpanManager sm, + List<Span> resolvedTemplateSpans, + List<ResolvedTemplate> resolvedTemplates, ParsedPage pp) { + + sm.manageList(resolvedTemplateSpans); + + int 
pos = -2; + Stack<Integer> templateOpenTags = new Stack<>(); + while ((pos = sm.indexOf("{{", pos + 2)) != -1) { + if (sm.length() > pos + 3 && sm.charAt(pos + 2) == '{' + && sm.charAt(pos + 3) != '{') { + pos++; + } + templateOpenTags.push(pos); + } + + while (!templateOpenTags.empty()) { + int templateOpenTag = templateOpenTags.pop(); + int templateCloseTag = sm.indexOf("}}", templateOpenTag); + if (templateCloseTag == -1) { + continue; + } + + int templateOptionTag = sm.indexOf("|", templateOpenTag, + templateCloseTag); + int templateNameEnd; + List<String> templateOptions; + + if (templateOptionTag != -1) { + templateNameEnd = templateOptionTag; + templateOptions = tokenize(sm, templateOptionTag + 1, + templateCloseTag, "|"); + } else { + templateNameEnd = templateCloseTag; + templateOptions = new ArrayList<>(); + } + + Span ts = new Span(templateOpenTag, templateCloseTag + 2); + + Template t = new Template(ts, encodeWikistyle(sm.substring( + templateOpenTag + 2, templateNameEnd).trim()), + templateOptions); + + if (calculateSrcSpans) { + t.setSrcSpan(new SrcSpan(sm.getSrcPos(templateOpenTag), sm + .getSrcPos(templateCloseTag + 2))); + } + + t.setPos(ts); + + ResolvedTemplate rt = templateParser.parseTemplate(t, pp); + + resolvedTemplateSpans.add(ts); + resolvedTemplates.add(rt); + + sm.replace(ts, rt.getPreParseReplacement()); + } + + if (resolvedTemplateSpans.isEmpty()) { + sm.removeManagedList(resolvedTemplateSpans); + } + } + + private void convertGalleriesToImages(SpanManager sm, List<Span> tagSpans) { + // Quick Hack, not very efficent, should be improved, wont work with + // calculateSrcSpans == true ! + + for (int i = 0; i < tagSpans.size() - 1; i++) { + String openText = getTagText(sm, tagSpans.get(i)); + if (startsWithIgnoreCase(openText, "GALLERY")) { + + if (startsWithIgnoreCase(getTagText(sm, tagSpans.get(i + 1)), + "/GALLERY")) { + + // gallery range is tag(i).end() .. 
tag(i+1).start() + Span startSpan = tagSpans.remove(i); + Span endSpan = tagSpans.remove(i); + i--; + + StringBuilder sb = new StringBuilder(); + + // caption (any option will be treated as caption) + int eqPos = openText.indexOf('='); + if (eqPos != -1) { + int captionStart = eqPos + 1; + int captionEnd = openText.length(); + + if (captionStart < captionEnd + && openText.charAt(captionStart) == '"' + && openText.charAt(captionEnd - 1) == '"') { + captionStart++; + captionEnd--; + } + + if (captionStart < captionEnd) { + sb.append(openText.substring(captionStart, + captionEnd) + + lineSeparator); + } + } + + // images + for (String s : tokenize(sm, startSpan.getEnd(), endSpan + .getStart(), lineSeparator)) { + sb.append("[[" + s + "]]" + lineSeparator); + } + + // replace the source and remove the tags + sm.replace(startSpan.getStart(), endSpan.getEnd(), sb + .toString()); + } else { + continue; + } + } + } + } + + private Table buildTable(SpanManager sm, + ContentElementParsingParameters cepp, LinkedList<Span> lineSpans) { + + Table result = new Table(); + int col = -1; + int row = 0; + int subTables = 0; + LinkedList<Span> tableDataSpans = new LinkedList<>(); + sm.manageList(tableDataSpans); + + if (calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst() + .getStart()), -1)); + } + + lineSpans.removeFirst(); + + while (!lineSpans.isEmpty()) { + Span s = lineSpans.removeFirst(); + + int pos = s.nonWSCharPos(sm); + char c0 = s.charAt(pos, sm); + char c1 = s.charAt(pos + 1, sm); + + if (subTables == 0 && (c0 == '!' 
|| c0 == '|')) { + if (!tableDataSpans.isEmpty()) { + lineSpans.addFirst(s); + + SrcSpan ei = null; + if (calculateSrcSpans) { + ei = new SrcSpan(sm.getSrcPos(tableDataSpans.getFirst() + .getStart() - 1) + 1, -1); + } + + TableElement te = new TableElement(parseSections(sm, cepp, + tableDataSpans), row, col); + te.setSrcSpan(ei); + result.addTableElement(te); + lineSpans.removeFirst(); + } + + col++; + if (c1 == '-') { + row++; + col = -1; + continue; + } else if (c0 == '|' && c1 == '}') { + sm.removeManagedList(tableDataSpans); + + if (calculateSrcSpans) { + result.getSrcSpan().setEnd(sm.getSrcPos(s.getEnd())); + } + + return result; + } else if (c0 == '|' && c1 == '+') { + result.setTitleElement(parseContentElement(sm, cepp, + new Span(s.getStart() + pos + 2, s.getEnd()) + .trim(sm))); + continue; + } else { + int multipleCols; + if ((multipleCols = sm.indexOf("||", + s.getStart() + pos + 1, s.getEnd())) != -1) { + lineSpans.addFirst(new Span(multipleCols + 1, s + .getEnd())); + s.setEnd(multipleCols); + } + + int optionTagPos = sm.indexOf("|", s.getStart() + pos + 1, + s.getEnd()); + + if (optionTagPos != -1) { + s.setStart(optionTagPos + 1).trim(sm); + } else { + s.adjustStart(pos + 1).trim(sm); + } + } + } else if (c0 == '|' && c1 == '}') { + subTables--; + } else if (c0 == '{' && c1 == '|') { + subTables++; + } + + tableDataSpans.addLast(s); + } + + if (tableDataSpans.size() != 0) { + + SrcSpan ei = null; + if (calculateSrcSpans) { + ei = new SrcSpan(sm.getSrcPos(tableDataSpans.getFirst() + .getStart() - 1) + 1, -1); + } + + TableElement te = new TableElement(parseSections(sm, cepp, + tableDataSpans), row, col); + te.setSrcSpan(ei); + + result.addTableElement(te); + } + + sm.removeManagedList(tableDataSpans); + + if (calculateSrcSpans) { + result.getSrcSpan().setEnd(-1); + } + + return result; + } + + private NestedListContainer buildNestedList(SpanManager sm, + ContentElementParsingParameters cepp, LinkedList<Span> lineSpans, + lineType listType) { + + 
boolean numbered = listType == lineType.NESTEDLIST_NR; + NestedListContainer result = new NestedListContainer(numbered); + + if (calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst() + .getStart()), -1)); + } + + LinkedList<Span> nestedListSpans = new LinkedList<>(); + while (!lineSpans.isEmpty()) { + Span s = lineSpans.getFirst(); + if (listType != getLineType(sm, s)) { + break; + } + nestedListSpans + .add(new Span(s.getStart() + 1, s.getEnd()).trim(sm)); + lineSpans.removeFirst(); + } + sm.manageList(nestedListSpans); + + if (calculateSrcSpans) { + result.getSrcSpan().setEnd( + sm.getSrcPos(nestedListSpans.getLast().getEnd())); + } + + while (!nestedListSpans.isEmpty()) { + Span s = nestedListSpans.getFirst(); + lineType t = getLineType(sm, s); + if (t == lineType.NESTEDLIST || t == lineType.NESTEDLIST_NR) { + result.add(buildNestedList(sm, cepp, nestedListSpans, t)); + } else { + nestedListSpans.removeFirst(); + result.add((NestedListElement) parseContentElement(sm, cepp, s, + new NestedListElement())); + } + } + + sm.removeManagedList(nestedListSpans); + + return result; + } + + private DefinitionList buildDefinitionList(SpanManager sm, + ContentElementParsingParameters cepp, LinkedList<Span> lineSpans) { + List<ContentElement> content = new ArrayList<>(); + + Span s = lineSpans.removeFirst(); + + int temp = sm.indexOf(":", s); + if (temp == -1) { + content.add(parseContentElement(sm, cepp, new Span( + s.getStart() + 1, s.getEnd()))); + } else { + content.add(parseContentElement(sm, cepp, new Span(temp + 1, s + .getEnd()))); + content.add(0, parseContentElement(sm, cepp, new Span( + s.getStart() + 1, temp))); + } + + while (!lineSpans.isEmpty()) { + Span ns = lineSpans.getFirst(); + if (sm.charAt(ns.getStart()) != ':') { + break; + } + lineSpans.removeFirst(); + content.add(parseContentElement(sm, cepp, new Span( + ns.getStart() + 1, ns.getEnd()))); + } + + DefinitionList result = new DefinitionList(content); + + if 
(calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), content + .get(content.size() - 1).getSrcSpan().getEnd())); + } + + return result; + } + + private Paragraph buildParagraph(SpanManager sm, + ContentElementParsingParameters cepp, LinkedList<Span> lineSpans, + lineType paragraphType) { + + LinkedList<Span> paragraphSpans = new LinkedList<>(); + Paragraph result = new Paragraph(); + Span s = lineSpans.removeFirst(); + paragraphSpans.add(s); + + switch (paragraphType) { + case PARAGRAPH: + result.setType(Paragraph.type.NORMAL); + while (!lineSpans.isEmpty()) { + if (paragraphType != getLineType(sm, lineSpans.getFirst())) { + break; + } + paragraphSpans.add(lineSpans.removeFirst()); + } + break; + + case PARAGRAPH_BOXED: + result.setType(Paragraph.type.BOXED); + while (!lineSpans.isEmpty()) { + lineType lt = getLineType(sm, lineSpans.getFirst()); + if (paragraphType != lt && lineType.EMPTYLINE != lt) { + break; + } + paragraphSpans.add(lineSpans.removeFirst()); + } + break; + + case PARAGRAPH_INDENTED: + result.setType(Paragraph.type.INDENTED); + s.trim(sm.setCharAt(s.getStart(), ' ')); + break; + + default: + return null; + } + + parseContentElement(sm, cepp, paragraphSpans, result); + + return result; + } + + private List<String> tokenize(SpanManager sm, int start, int end, + String delim) { + List<String> result = new ArrayList<>(); + + if (start > end) { + logger.debug("tokenize({},{}) doesn't make sense", start, end); + return result; + } + + int s = start; + int e; + String token; + // Span rs; + while ((e = sm.indexOf(delim, s, end)) != -1) { + // rs = new Span(s, e).trim( sm ); + // if( rs.length()>0 ) result.add( sm.substring( rs ) ); + token = sm.substring(s, e).trim(); + if (token.length() > 0) { + result.add(token); + } + s = e + delim.length(); + } + // rs = new Span(s, end).trim( sm ); + // if( rs.length()>0 ) result.add( sm.substring( rs ) ); + token = sm.substring(s, end).trim(); + if (token.length() > 0) { + 
result.add(token); + } + + return result; + } + + private void parseExternalLinks(SpanManager sm, Span s, String protocol, + List<Span> managedList, List<Link> links, Content home_cc) { + int extLinkTargetStart; + Span extLinkSpan = new Span(0, s.getStart()); + + while ((extLinkTargetStart = sm.indexOf(protocol, extLinkSpan.getEnd(), + s.getEnd())) != -1) { + + // Allowed char before the protocol identifer ? + if (extLinkTargetStart > s.getStart() + && (" [").indexOf(sm.charAt(extLinkTargetStart - 1)) == -1) { + extLinkSpan = new Span(0, extLinkTargetStart + 1); + continue; + } + + // Target + int extLinkTargetEnd = extLinkTargetStart; + while ((lineSeparator + " ]").indexOf(sm.charAt(extLinkTargetEnd)) == -1) { + extLinkTargetEnd++; + } + + // Open/Close Tags + int extLinkOpenTag = extLinkTargetStart - 1; + int extLinkCloseTag; + int extLinkTextStart = extLinkTargetStart; + int extLinkTextEnd = extLinkTargetEnd; + + while (extLinkOpenTag >= s.getStart() + && sm.charAt(extLinkOpenTag) == ' ') { + extLinkOpenTag--; + } + + if (extLinkOpenTag >= s.getStart() + && sm.charAt(extLinkOpenTag) == '[') { + extLinkCloseTag = sm.indexOf("]", extLinkTargetEnd, s.getEnd()); + + if (extLinkCloseTag != -1) { + extLinkTextStart = extLinkTargetEnd; + // nicht wie bei "normalen" links durhc | getrennt sondenr + // durhc leerzeichen !!! schei�e !!! 
+ while (sm.charAt(extLinkTextStart) == ' ') { + extLinkTextStart++; + } + extLinkTextEnd = extLinkCloseTag; + extLinkCloseTag++; + + if (extLinkTextStart == extLinkTextEnd) { + sm.insert(extLinkTextStart, "[ ]"); + extLinkTextEnd += 3; + extLinkCloseTag += 3; + } + } else { + extLinkOpenTag = extLinkTargetStart; + extLinkCloseTag = extLinkTargetEnd; + } + } else { + extLinkOpenTag = extLinkTargetStart; + extLinkCloseTag = extLinkTargetEnd; + } + + extLinkSpan = new Span(extLinkOpenTag, extLinkCloseTag); + managedList.add(extLinkSpan); + + Link l = new Link(home_cc, extLinkSpan, sm.substring( + extLinkTargetStart, extLinkTargetEnd), Link.type.EXTERNAL, + null); + links.add(l); + + if (calculateSrcSpans) { + l.setSrcSpan(new SrcSpan(sm.getSrcPos(extLinkOpenTag), sm + .getSrcPos(extLinkCloseTag - 1) + 1)); + } + + sm.delete(extLinkTextEnd, extLinkCloseTag); + sm.delete(extLinkOpenTag, extLinkTextStart); + } + } + + /** + * Returns the LOWERCASE NameSpace of the link target + */ + private static String getLinkNameSpace(String target) { + int pos = target.indexOf(':'); + if (pos == -1) { + return null; + } else { + return target.substring(0, pos).replace('_', ' ').trim() + .toLowerCase(); + } + } + + /** + * There is not much differences between links an images, so they are parsed + * in a single step + */ + private void parseImagesAndInternalLinks(SpanManager sm, + List<Span> linkSpans, List<Link> links) { + + sm.manageList(linkSpans); + + int pos = -1; + Stack<Integer> linkOpenTags = new Stack<>(); + while ((pos = sm.indexOf("[[", pos + 1)) != -1) { + linkOpenTags.push(pos); + } + + Span lastLinkSpan = new Span(sm.length() + 1, sm.length() + 1); + Link.type linkType = Link.type.INTERNAL; + + while (!linkOpenTags.empty()) { + int linkStartTag = linkOpenTags.pop(); + int linkEndTag = sm.indexOf("]]", linkStartTag); + if (linkEndTag == -1) { + continue; + } + + int linkOptionTag = sm.indexOf("|", linkStartTag, linkEndTag); + + int linkTextStart; + String linkTarget; + + 
if (linkOptionTag != -1) { + linkTextStart = linkOptionTag + 1; + linkTarget = sm.substring(new Span(linkStartTag + 2, + linkOptionTag).trim(sm)); + } else { + linkTextStart = linkStartTag + 2; + linkTarget = sm + .substring(new Span(linkStartTag + 2, linkEndTag) + .trim(sm)); + } + + // is is a regular link ? + if (linkTarget.contains(lineSeparator)) { + continue; + } + linkTarget = encodeWikistyle(linkTarget); + + // so it is a Link or image!!! + List<String> parameters; + + String namespace = getLinkNameSpace(linkTarget); + if (namespace != null) { + if (imageIdentifers.indexOf(namespace) != -1) { + if (linkOptionTag != -1) { + int temp; + while ((temp = sm.indexOf("|", linkTextStart, + linkEndTag)) != -1) { + linkTextStart = temp + 1; + } + + parameters = tokenize(sm, linkOptionTag + 1, + linkEndTag, "|"); + + // maybe there is an external link at the end of the + // image description... + if (sm.charAt(linkEndTag + 2) == ']' + && sm.indexOf("[", linkTextStart, linkEndTag) != -1) { + linkEndTag++; + } + } else { + parameters = null; + } + linkType = Link.type.IMAGE; + } else { + //Link has namespace but is not image + linkType = Link.type.UNKNOWN; + parameters = null; + } + } else { + if (linkType == Link.type.INTERNAL + && lastLinkSpan.hits(new Span(linkStartTag, + linkEndTag + 2))) { + continue; + } + parameters = null; + linkType = Link.type.INTERNAL; + } + + Span posSpan = new Span(linkTextStart, linkEndTag).trim(sm); + linkSpans.add(posSpan); + + Link l = new Link(null, posSpan, linkTarget, linkType, parameters); + links.add(l); + + if (calculateSrcSpans) { + l.setSrcSpan(new SrcSpan(sm.getSrcPos(linkStartTag), sm + .getSrcPos(linkEndTag + 2))); + } + + sm.delete(posSpan.getEnd(), linkEndTag + 2); + sm.delete(linkStartTag, posSpan.getStart()); + + // removing line separators in link text + int lsinlink; + while ((lsinlink = sm.indexOf(lineSeparator, posSpan)) != -1) { + sm.replace(lsinlink, lsinlink + lineSeparator.length(), " "); + } + + lastLinkSpan = 
posSpan; + } + } + + /** + * Searches the Range given by the Span s for the double occurence of + * "quotation" and puts the results in the List quotedSpans. The Quotation + * tags will be deleted. + * + * @param sm , the Source in which will be searched + * @param s , the range in which will be searched + * @param quotedSpans , the List where the Spans will be placed, should be managed + * by the SpanManager sm + * @param quotation , the start and end tag as String + */ + private void parseQuotedSpans(SpanManager sm, Span s, + List<Span> quotedSpans, String quotation) { + + final int qlen = quotation.length(); + + // get the start position + int start = sm.indexOf(quotation, s.getStart(), s.getEnd()); + + while (start != -1) { + + // get the end position + int end = sm.indexOf(quotation, start + qlen, s.getEnd()); + if (end == -1) { + break; + } + + // build a new span from start and end position. + Span qs = new Span(start, end); + quotedSpans.add(qs); + + // calculate the original src positions. + if (calculateSrcSpans) { + qs.setSrcSpan(new SrcSpan(sm.getSrcPos(start), sm.getSrcPos(end + + qlen - 1) + 1)); + } + + // delete the tags. + sm.delete(end, end + qlen); + sm.delete(start, start + qlen); + + // get the next start position + start = sm.indexOf(quotation, qs.getEnd(), s.getEnd()); + } + } + + /** + * Searches a line for Bold and Italic quotations, this has to be done + * linewhise. + */ + private void parseBoldAndItalicSpans(SpanManager sm, Span line, + List<Span> boldSpans, List<Span> italicSpans) { + // Das suchen nach BOLD und ITALIC muss in den Jeweiligen + // Zeilen geschenhen, da ein LineSeparator immer BOLD und + // Italic Tags schliesst. + + // Bold Spans + parseQuotedSpans(sm, line, boldSpans, "'''"); + + // Italic Spans + parseQuotedSpans(sm, line, italicSpans, "''"); + + // Maybe there is ONE SINGLE OPEN TAG left... handel these... + int openTag = sm.indexOf("''", line); + if (openTag != -1) { + // build a Span from this Tag. 
+ Span qs = new Span(openTag, line.getEnd()); + + // calculate the original src positions. + if (calculateSrcSpans) { + qs.setSrcSpan(new SrcSpan(sm.getSrcPos(openTag), sm + .getSrcPos(line.getEnd()))); + } + + // is it a Bold or an Italic tag ? + if (sm.indexOf("'''", openTag, openTag + 3) != -1) { + // --> BOLD + boldSpans.add(qs); + sm.delete(openTag, openTag + 3); + } else { + // --> ITALIC + italicSpans.add(qs); + sm.delete(openTag, openTag + 2); + } + } + } + + private static String encodeWikistyle(String str) { + return str.replace(' ', '_'); + } + + /** + * Building a ContentElement from a String + */ + @Override + public ContentElement parseContentElement(String src) { + SpanManager sm = new SpanManager(src); + ContentElementParsingParameters cepp = new ContentElementParsingParameters(); + + parseImagesAndInternalLinks(sm, cepp.linkSpans, cepp.links); + + LinkedList<Span> lineSpans = new LinkedList<>(); + getLineSpans(sm, lineSpans); + sm.removeManagedList(lineSpans); + return (parseContentElement(sm, cepp, lineSpans, new ContentElement())); + } + + /** + * Building a ContentElement from a single line. + */ + private ContentElement parseContentElement(SpanManager sm, + ContentElementParsingParameters cepp, Span lineSpan) { + LinkedList<Span> lineSpans = new LinkedList<>(); + lineSpans.add(lineSpan); + return parseContentElement(sm, cepp, lineSpans, new ContentElement()); + } + + /** + * Building a ContentElement from a single line. But the result is given, so + * e.g. a NestedListElement can be filled with information... + */ + private ContentElement parseContentElement(SpanManager sm, + ContentElementParsingParameters cepp, Span lineSpan, + ContentElement result) { + LinkedList<Span> lineSpans = new LinkedList<>(); + lineSpans.add(lineSpan); + return parseContentElement(sm, cepp, lineSpans, result); + } + + /** + * Building a ContentElement, this funciton is calles by all the other + * parseContentElement(..) 
functions + */ + private ContentElement parseContentElement(SpanManager sm, + ContentElementParsingParameters cepp, LinkedList<Span> lineSpans, + ContentElement result) { + + List<Link> localLinks = new ArrayList<>(); + List<Template> localTemplates = new ArrayList<>(); + + List<Span> boldSpans = new ArrayList<>(); + List<Span> italicSpans = new ArrayList<>(); + sm.manageList(boldSpans); + sm.manageList(italicSpans); + + List<Span> managedSpans = new ArrayList<>(); + sm.manageList(managedSpans); + + Span contentElementRange = new Span(lineSpans.getFirst().getStart(), + lineSpans.getLast().getEnd()).trim(sm); + managedSpans.add(contentElementRange); + + // set the SrcSpan + if (calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(sm.getSrcPos(contentElementRange + .getStart()), sm.getSrcPos(contentElementRange.getEnd()))); + } + + sm.manageList(lineSpans); + while (!lineSpans.isEmpty()) { + Span line = lineSpans.getFirst(); + + parseBoldAndItalicSpans(sm, line, boldSpans, italicSpans); + + // External links + parseExternalLinks(sm, line, "http://", managedSpans, localLinks, + result); + parseExternalLinks(sm, line, "https://", managedSpans, localLinks, + result); + parseExternalLinks(sm, line, "ftp://", managedSpans, localLinks, + result); + parseExternalLinks(sm, line, "mailto:", managedSpans, localLinks, + result); + + // end of linewhise opperations + lineSpans.removeFirst(); + } + sm.removeManagedList(lineSpans); + + // Links + int i; + i = 0; + while (i < cepp.linkSpans.size()) { + if (contentElementRange.hits(cepp.linkSpans.get(i))) { + Span linkSpan = cepp.linkSpans.remove(i); + managedSpans.add(linkSpan); + Link l = cepp.links.remove(i).setHomeElement(result); + localLinks.add(l); + if (!showImageText && l.getType() == Link.type.IMAGE) { + // deletes the Image Text from the ContentElement Text. 
+ sm.delete(linkSpan); + } + } else { + i++; + } + } + + // Templates + i = 0; + while (i < cepp.templateSpans.size()) { + Span ts = cepp.templateSpans.get(i); + if (contentElementRange.hits(ts)) { + ResolvedTemplate rt = cepp.templates.remove(i); + + if (rt.getPostParseReplacement() != null) { + sm.replace(ts, rt.getPostParseReplacement()); + } + cepp.templateSpans.remove(i); + + Object parsedObject = rt.getParsedObject(); + if (parsedObject != null) { + managedSpans.add(ts); + + Class<?> parsedObjectClass = parsedObject.getClass(); + if (parsedObjectClass == Template.class) { + localTemplates.add((Template) parsedObject); + } else if (parsedObjectClass == Link.class) { + localLinks.add(((Link) parsedObject) + .setHomeElement(result)); + } else { + localTemplates.add(rt.getTemplate()); + } + } + } else { + i++; + } + } + + // HTML/XML Tags + i = 0; + List<Span> tags = new ArrayList<>(); + while (i < cepp.tagSpans.size()) { + Span s = cepp.tagSpans.get(i); + if (contentElementRange.hits(s)) { + cepp.tagSpans.remove(i); + if (deleteTags) { + sm.delete(s); + } else { + tags.add(s); + managedSpans.add(s); + } + } else { + i++; + } + } + + // noWiki + i = 0; + List<Span> localNoWikiSpans = new ArrayList<>(); + while (i < cepp.noWikiSpans.size()) { + Span s = cepp.noWikiSpans.get(i); + if (contentElementRange.hits(s)) { + cepp.noWikiSpans.remove(i); + sm.replace(s, cepp.noWikiStrings.remove(i)); + localNoWikiSpans.add(s); + managedSpans.add(s); + } else { + i++; + } + } + + // MATH Tags + i = 0; + List<Span> mathSpans = new ArrayList<>(); + while (i < cepp.mathSpans.size()) { + Span s = cepp.mathSpans.get(i); + if (contentElementRange.hits(s)) { + cepp.mathSpans.remove(i); + + if (showMathTagContent) { + mathSpans.add(s); + managedSpans.add(s); + sm.replace(s, cepp.mathStrings.remove(i)); + } else { + sm.delete(s); + } + } else { + i++; + } + } + + result.setText(sm.substring(contentElementRange)); + + // managed spans must be removed here and not earlier, because every 
+ // change in the SpanManager affects the Spans! + sm.removeManagedList(boldSpans); + sm.removeManagedList(italicSpans); + sm.removeManagedList(managedSpans); + + // contentElementRange ist auch noch in managedSpans !!! deswegen: + final int adjust = -contentElementRange.getStart(); + for (Span s : boldSpans) { + s.adjust(adjust); + } + for (Span s : italicSpans) { + s.adjust(adjust); + } + for (Span s : managedSpans) { + s.adjust(adjust); + } + + result.setFormatSpans(FormatType.BOLD, boldSpans); + result.setFormatSpans(FormatType.ITALIC, italicSpans); + result.setFormatSpans(FormatType.TAG, tags); + result.setFormatSpans(FormatType.MATH, mathSpans); + result.setFormatSpans(FormatType.NOWIKI, localNoWikiSpans); + + result.setLinks(sortLinks(localLinks)); + result.setTemplates(sortTemplates(localTemplates)); + + return result; + } + + /** + * Sorts the Links... + */ + private static List<Link> sortLinks(List<Link> links) { + List<Link> result = new ArrayList<>(); + for (Link l : links) { + int pos = 0; + while (pos < result.size() + && l.getPos().getStart() > result.get(pos).getPos() + .getStart()) { + pos++; + } + result.add(pos, l); + } + return result; + } + + /** + * Sorts the Templates... + */ + private static List<Template> sortTemplates(List<Template> templates) { + List<Template> result = new ArrayList<>(); + for (Template t : templates) { + int pos = 0; + while (pos < result.size() + && t.getPos().getStart() > result.get(pos).getPos() + .getStart()) { + pos++; + } + result.add(pos, t); + } + return result; + } + + /** + * Algorithm to identify the first paragraph of a ParsedPage + */ + private void setFirstParagraph(ParsedPage pp) { + int nr = pp.nrOfParagraphs(); + + // the paragraph with the lowest number, must not be the first, maybe it + // is only an Image... 
+ for (int i = 0; i < nr; i++) { + Paragraph p = pp.getParagraph(i); + + // get the Text from the paragraph + SpanManager ptext = new SpanManager(p.getText()); + List<Span> delete = new ArrayList<>(); + ptext.manageList(delete); + + // getting the spans to remove from the text, for templates + List<Template> tl = p.getTemplates(); + for (int j = tl.size() - 1; j >= 0; j--) { + delete.add(tl.get(j).getPos()); + } + + // getting the spans to remove from the text, for Tags + List<Span> sl = p.getFormatSpans(FormatType.TAG); + for (int j = sl.size() - 1; j >= 0; j--) { + delete.add(sl.get(j)); + } + + // getting the spans to remove from the text, for image text + if (showImageText) { + List<Link> ll = p.getLinks(Link.type.IMAGE); + for (int j = ll.size() - 1; j >= 0; j--) { + delete.add(ll.get(j).getPos()); + } + } + + // delete the spans in reverse order, the spans are managed, so + // there is no need to sort them + for (int j = delete.size() - 1; j >= 0; j--) { + ptext.delete(delete.remove(j)); + } + + // removing line separators if exist, so the result can be trimmed + // in the next step + int pos = ptext.indexOf(lineSeparator); + while (pos != -1) { + ptext.delete(pos, pos + lineSeparator.length()); + pos = ptext.indexOf(lineSeparator); + } + + // if the result is not an empty string, we got the number of the + // first paragraph + if (!ptext.toString().trim().equals("")) { + pp.setFirstParagraphNr(i); + return; + } + } + } + + /** + * Container for all the Parameters needed in the parseing process + */ + class ContentElementParsingParameters { + final List<Span> noWikiSpans; + final List<String> noWikiStrings; + final List<Span> linkSpans; + final List<Link> links; + final List<Span> templateSpans; + final List<ResolvedTemplate> templates; + final List<Span> tagSpans; + final List<Span> mathSpans; + final List<String> mathStrings; + + ContentElementParsingParameters() { + noWikiSpans = new ArrayList<>(); + noWikiStrings = new ArrayList<>(); + linkSpans = new 
ArrayList<>(); + links = new ArrayList<>(); + templateSpans = new ArrayList<>(); + templates = new ArrayList<>(); + tagSpans = new ArrayList<>(); + mathSpans = new ArrayList<>(); + mathStrings = new ArrayList<>(); + } + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ParserConstants.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ParserConstants.java index 7cf0f6aa..9c7f0f57 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ParserConstants.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ParserConstants.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,9 +19,9 @@ public interface ParserConstants { - /** - * Shortcut for System.getProperty("line.separator"). - */ - String LF = System.getProperty("line.separator"); + /** + * Shortcut for System.getProperty("line.separator"). 
+ */ + String LF = System.getProperty("line.separator"); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ResolvedTemplate.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ResolvedTemplate.java index aa1b1e67..9ebabd6b 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ResolvedTemplate.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ResolvedTemplate.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,93 +19,95 @@ import org.dkpro.jwpl.parser.Template; -public class ResolvedTemplate{ +public class ResolvedTemplate { - public final static String TEMPLATESPACER = "(TEMPLATE)"; + public final static String TEMPLATESPACER = "(TEMPLATE)"; - private final Template template; - private String preParseReplacement; - private String postParseReplacement; + private final Template template; + private String preParseReplacement; + private String postParseReplacement; - /** - * is the Object which the Template Parser has been parsed, and will be - * integrated by the ContentElementParseing process. <br> - * If parsedObject == null, the template will be discarded... 
- */ - private Object parsedObject; + /** + * is the Object which the Template Parser has been parsed, and will be + * integrated by the ContentElementParseing process. <br> + * If parsedObject == null, the template will be discarded... + */ + private Object parsedObject; - /** - * Creates a new ResolvedTemplate linked to the original template. - * @param template the original template - */ - public ResolvedTemplate(Template template){ - this.template = template; - this.postParseReplacement = ""; - checkPreParseReplacement(); - } + /** + * Creates a new ResolvedTemplate linked to the original template. + * + * @param template the original template + */ + public ResolvedTemplate(Template template) { + this.template = template; + this.postParseReplacement = ""; + checkPreParseReplacement(); + } - private void checkPreParseReplacement(){ - if( preParseReplacement==null || preParseReplacement.length()==0 ) { - preParseReplacement = TEMPLATESPACER; - } - } + private void checkPreParseReplacement() { + if (preParseReplacement == null || preParseReplacement.length() == 0) { + preParseReplacement = TEMPLATESPACER; + } + } - /** - * Will be called by the parser after the parsing process and will replace - * the TEXT which is within the bounds of the original template src. <br> - * If NULL is returned, the parser won't do anything. - */ - public String getPostParseReplacement() { - return postParseReplacement; - } + /** + * Will be called by the parser after the parsing process and will replace + * the TEXT which is within the bounds of the original template src. <br> + * If NULL is returned, the parser won't do anything. + */ + public String getPostParseReplacement() { + return postParseReplacement; + } - /** - * Look at getPostParseReplacement... - */ - public void setPostParseReplacement(String postParseReplacement) { - this.postParseReplacement = postParseReplacement; - } + /** + * Look at getPostParseReplacement... 
+ */ + public void setPostParseReplacement(String postParseReplacement) { + this.postParseReplacement = postParseReplacement; + } - /** - * will be called by the parser before the Parsing process and replaces the original - * template code. MediaWiki code which is returned here, will be parsed.<br> - * length() > 0 ! empty stings would not be accepted. - */ - public String getPreParseReplacement() { - return preParseReplacement; - } + /** + * will be called by the parser before the Parsing process and replaces the original + * template code. MediaWiki code which is returned here, will be parsed.<br> + * length() > 0 ! empty stings would not be accepted. + */ + public String getPreParseReplacement() { + return preParseReplacement; + } - /** - * Look at getPreParseReplacement... - */ - public void setPreParseReplacement(String preParseReplacement) { - this.preParseReplacement = preParseReplacement; - checkPreParseReplacement(); - } + /** + * Look at getPreParseReplacement... + */ + public void setPreParseReplacement(String preParseReplacement) { + this.preParseReplacement = preParseReplacement; + checkPreParseReplacement(); + } - /** - * In case of an Error the Parser will use the Original Template - * as parsed object. - */ - public Template getTemplate() { - return template; - } + /** + * In case of an Error the Parser will use the Original Template + * as parsed object. + */ + public Template getTemplate() { + return template; + } - /** - * Returns the Object which is representative for the Template Code. - * It can be a Template or any object the parser knows.<br> - * If the Template is e.g. a Link the Link will be returned here. - */ - public Object getParsedObject() { - return parsedObject; - } + /** + * Returns the Object which is representative for the Template Code. + * It can be a Template or any object the parser knows.<br> + * If the Template is e.g. a Link the Link will be returned here. 
+ */ + public Object getParsedObject() { + return parsedObject; + } - /** - * Look at getParsedObject for Details. - * @param parsedObject - */ - public void setParsedObject(Object parsedObject) { - this.parsedObject = parsedObject; - } + /** + * Look at getParsedObject for Details. + * + * @param parsedObject + */ + public void setParsedObject(Object parsedObject) { + this.parsedObject = parsedObject; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ShowTemplateNamesAndParameters.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ShowTemplateNamesAndParameters.java index 36655a66..2be25ce0 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ShowTemplateNamesAndParameters.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ShowTemplateNamesAndParameters.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,33 +23,32 @@ /** * This TemplateParser simply shows the name of the Template with all * parameters, without any exception. 
- * */ public class ShowTemplateNamesAndParameters implements MediaWikiTemplateParser { - private final String templatePrefix = "TEMPLATE["; - private final String templatePostfix = "]"; - private final String parameterDivisor = ", "; - - public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) { - ResolvedTemplate result = new ResolvedTemplate( t ); - result.setPreParseReplacement( ResolvedTemplate.TEMPLATESPACER ); - - StringBuilder sb = new StringBuilder(); - sb.append(templatePrefix); - sb.append( t.getName()+parameterDivisor ); - for( String s: t.getParameters()){ - sb.append( s +parameterDivisor ); - } - sb.delete( sb.length()-parameterDivisor.length(), sb.length() ); - sb.append(templatePostfix); - result.setPostParseReplacement( sb.toString() ); - - result.setParsedObject( t ); - return result; - } - - public String configurationInfo(){ - return "shows the Template names and all parameters"; - } + private final String templatePrefix = "TEMPLATE["; + private final String templatePostfix = "]"; + private final String parameterDivisor = ", "; + + public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) { + ResolvedTemplate result = new ResolvedTemplate(t); + result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER); + + StringBuilder sb = new StringBuilder(); + sb.append(templatePrefix); + sb.append(t.getName() + parameterDivisor); + for (String s : t.getParameters()) { + sb.append(s + parameterDivisor); + } + sb.delete(sb.length() - parameterDivisor.length(), sb.length()); + sb.append(templatePostfix); + result.setPostParseReplacement(sb.toString()); + + result.setParsedObject(t); + return result; + } + + public String configurationInfo() { + return "shows the Template names and all parameters"; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SpanManager.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SpanManager.java index 499de4cc..49b11c95 100644 --- 
a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SpanManager.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SpanManager.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,223 +27,235 @@ * A Class which manages Spans which are related to a StringBuilder. * With the SpanManager it is possible to work on a String (delete, insert, replace) * with no need to adjust the Spans related to the StringBuilder manually. - * */ public class SpanManager implements CharSequence { - private final StringBuilder sb; - private final List< List<Span> > managedLists; - - private List<Integer> ib; - private boolean calculateSrcPositions; - - /** - * Creates a new SpanManager with src as base. - * @param src - */ - public SpanManager(String src){ - sb = new StringBuilder(src); - managedLists = new ArrayList<>(); - calculateSrcPositions = false; - } - - /** - * Enables the Calculation of Src Position. The base for these position - * will be the aktual, not the initial, String wich is uses as Base for - * the SpanManager. 
- */ - public void enableSrcPosCalculation(){ - calculateSrcPositions = true; - final int len = sb.length(); - ib = new ArrayList<>(len); - for( int i=0; i<len; i++) ib.add( i ); - } - - /** - * Retruns a SrcPos for the index of the aktual SpanManager base. - * @return the Position the index has, when enableSrcPosCaulation() has been called, - * or -1 if it is not possible. - */ - public int getSrcPos( int index ){ - if( calculateSrcPositions ){ - return ib.get( index ); - } - else{ - System.err.println("SrcSpanCalculation not enabled!"); - return -1; - } - } - - /** - * Adds a List of Spans, which should be managed. - */ - public void manageList( List<Span> spans ){ - managedLists.add( spans ); - } - - /** - * Removes a List of Spans (not the Spans in the List), which shouldn�t be managed anymore. - * @param spans - */ - public void removeManagedList( List<Span> spans ){ - final Span listIdentifer = new Span(Integer.MAX_VALUE, Integer.MIN_VALUE); - spans.add( listIdentifer ); - managedLists.remove( spans ); - spans.remove( listIdentifer ); - } - - private void adjustLists(int offset, int n){ - for( List<Span> list: managedLists ) - for( Span s: list )s.adjust(offset, n); - } - - /** - * Deletes the content between s.getStart() (included) and s.getEnd() (excluded). - */ - public SpanManager delete(Span s){ return delete(s.getStart(), s.getEnd() ); } - - /** - * Deletes the content between start (included) and end (excluded). - */ - public SpanManager delete(int start, int end){ - sb.delete(start, end); - adjustLists( start, start-end ); - - if(calculateSrcPositions) for( int i = 0; i<end-start; i++) ib.remove( start ); - - return this; - } - - /** - * Insterts a String at the position offset. 
- */ - public SpanManager insert(int offset, String str){ - sb.insert(offset, str); - adjustLists( offset, str.length() ); - - if( calculateSrcPositions ) for( int i=0; i<str.length(); i++ ) ib.add( offset, -1 ); - - return this; - } - - /** - * Replaces the content between s.getStart() (included) and s.getEnd() (excluded) with - * a String - */ - public SpanManager replace(Span s, String str){ return replace( s.getStart(), s.getEnd(), str); } - - /** - * Replaces the content between start (included) and end (excluded) with a String - */ - public SpanManager replace(int start, int end, String str){ - sb.replace(start, end, str); - - if( calculateSrcPositions ){ - for( int i=0; i<end-start; i++) ib.remove( start ); - for( int i=0; i<str.length(); i++) ib.add( start, -1 ); - } - - adjustLists(start, str.length()-(end-start) ); - return this; - } - - public int indexOf(String str){ return this.indexOf(str, 0); } - public int indexOf(String str, int fromIndex){ return sb.indexOf(str, fromIndex); } - public int indexOf(String str, Span s){ return indexOf(str, s.getStart(), s.getEnd() ); } - - public int indexOf(String str, int fromIndex, int toIndex){ - int result = sb.indexOf(str, fromIndex); - if( result >= toIndex ) return -1 ; - return result; - } - - public String substring(int start){ - if (start < 0) { - start = 0; - } - return this.sb.substring(start); - } - - public String substring(int start, int end){ - if (start < 0) { - start = 0; - } - if (start > end) { - return ""; - } - - return sb.substring(start, end); - } - - public String substring( Span s ) { - if (s.getStart() < s.getEnd()) { - return sb.substring( s.getStart(), s.getEnd() ); - } - else { - return ""; - } - } - - /** - * <font color="#ff0000">This function is not implemented !!!</font> - */ - public CharSequence subSequence(int start, int end){ - //TODO Implementieren - System.err.println("CharSequence subSequence(int start, int end)\nSorry, not Implemented"); - sb.charAt(-1); //causes an error - 
return null; - } - - public int length(){ - return sb.length(); - } - - public SpanManager setCharAt(int index, char c){ - sb.setCharAt( index, c ); - if( calculateSrcPositions ) ib.set( index, -1 ); - return this; - } - - public char charAt(int index){ - return sb.charAt(index); - } - - @Override - public String toString(){ - return sb.toString(); - } - - /** - * Returnes some information about the content of the SpanManager an it�s manages - * Spans - */ - public String info(){ - StringBuilder result = new StringBuilder(); - - result.append("\n-=SPANMANAGER=----------------------------------------------------------------\n"); - - result.append("TEXT:"); - result.append( "\""+ sb + "\""); - result.append("\n"); - - result.append("\nMANAGED SPAN LISTS:"); - if( managedLists.isEmpty() ) - result.append(" NONE\n"); - else{ - result.append("\n"); - for( int k=0; k<managedLists.size(); k++ ){ - List<Span> sl = managedLists.get(k); - result.append("{"); - if( sl.size() != 0 ){ - for( int i=1; i<sl.size()-1; i++ ) result.append(sl.get(i)+", "); - result.append(sl.get( sl.size()-1)); - } - result.append("}\n"); - } - } - - result.append("------------------------------------------------------------------------------"); - - return result.toString(); - } + private final StringBuilder sb; + private final List<List<Span>> managedLists; + + private List<Integer> ib; + private boolean calculateSrcPositions; + + /** + * Creates a new SpanManager with src as base. + * + * @param src + */ + public SpanManager(String src) { + sb = new StringBuilder(src); + managedLists = new ArrayList<>(); + calculateSrcPositions = false; + } + + /** + * Enables the Calculation of Src Position. The base for these position + * will be the aktual, not the initial, String wich is uses as Base for + * the SpanManager. 
+ */ + public void enableSrcPosCalculation() { + calculateSrcPositions = true; + final int len = sb.length(); + ib = new ArrayList<>(len); + for (int i = 0; i < len; i++) ib.add(i); + } + + /** + * Retruns a SrcPos for the index of the aktual SpanManager base. + * + * @return the Position the index has, when enableSrcPosCaulation() has been called, + * or -1 if it is not possible. + */ + public int getSrcPos(int index) { + if (calculateSrcPositions) { + return ib.get(index); + } else { + System.err.println("SrcSpanCalculation not enabled!"); + return -1; + } + } + + /** + * Adds a List of Spans, which should be managed. + */ + public void manageList(List<Span> spans) { + managedLists.add(spans); + } + + /** + * Removes a List of Spans (not the Spans in the List), which shouldn�t be managed anymore. + * + * @param spans + */ + public void removeManagedList(List<Span> spans) { + final Span listIdentifer = new Span(Integer.MAX_VALUE, Integer.MIN_VALUE); + spans.add(listIdentifer); + managedLists.remove(spans); + spans.remove(listIdentifer); + } + + private void adjustLists(int offset, int n) { + for (List<Span> list : managedLists) + for (Span s : list) s.adjust(offset, n); + } + + /** + * Deletes the content between s.getStart() (included) and s.getEnd() (excluded). + */ + public SpanManager delete(Span s) { + return delete(s.getStart(), s.getEnd()); + } + + /** + * Deletes the content between start (included) and end (excluded). + */ + public SpanManager delete(int start, int end) { + sb.delete(start, end); + adjustLists(start, start - end); + + if (calculateSrcPositions) for (int i = 0; i < end - start; i++) ib.remove(start); + + return this; + } + + /** + * Insterts a String at the position offset. 
+ */ + public SpanManager insert(int offset, String str) { + sb.insert(offset, str); + adjustLists(offset, str.length()); + + if (calculateSrcPositions) for (int i = 0; i < str.length(); i++) ib.add(offset, -1); + + return this; + } + + /** + * Replaces the content between s.getStart() (included) and s.getEnd() (excluded) with + * a String + */ + public SpanManager replace(Span s, String str) { + return replace(s.getStart(), s.getEnd(), str); + } + + /** + * Replaces the content between start (included) and end (excluded) with a String + */ + public SpanManager replace(int start, int end, String str) { + sb.replace(start, end, str); + + if (calculateSrcPositions) { + for (int i = 0; i < end - start; i++) ib.remove(start); + for (int i = 0; i < str.length(); i++) ib.add(start, -1); + } + + adjustLists(start, str.length() - (end - start)); + return this; + } + + public int indexOf(String str) { + return this.indexOf(str, 0); + } + + public int indexOf(String str, int fromIndex) { + return sb.indexOf(str, fromIndex); + } + + public int indexOf(String str, Span s) { + return indexOf(str, s.getStart(), s.getEnd()); + } + + public int indexOf(String str, int fromIndex, int toIndex) { + int result = sb.indexOf(str, fromIndex); + if (result >= toIndex) return -1; + return result; + } + + public String substring(int start) { + if (start < 0) { + start = 0; + } + return this.sb.substring(start); + } + + public String substring(int start, int end) { + if (start < 0) { + start = 0; + } + if (start > end) { + return ""; + } + + return sb.substring(start, end); + } + + public String substring(Span s) { + if (s.getStart() < s.getEnd()) { + return sb.substring(s.getStart(), s.getEnd()); + } else { + return ""; + } + } + + /** + * <font color="#ff0000">This function is not implemented !!!</font> + */ + public CharSequence subSequence(int start, int end) { + //TODO Implementieren + System.err.println("CharSequence subSequence(int start, int end)\nSorry, not Implemented"); + 
sb.charAt(-1); //causes an error + return null; + } + + public int length() { + return sb.length(); + } + + public SpanManager setCharAt(int index, char c) { + sb.setCharAt(index, c); + if (calculateSrcPositions) ib.set(index, -1); + return this; + } + + public char charAt(int index) { + return sb.charAt(index); + } + + @Override + public String toString() { + return sb.toString(); + } + + /** + * Returnes some information about the content of the SpanManager an it�s manages + * Spans + */ + public String info() { + StringBuilder result = new StringBuilder(); + + result.append("\n-=SPANMANAGER=----------------------------------------------------------------\n"); + + result.append("TEXT:"); + result.append("\"" + sb + "\""); + result.append("\n"); + + result.append("\nMANAGED SPAN LISTS:"); + if (managedLists.isEmpty()) + result.append(" NONE\n"); + else { + result.append("\n"); + for (int k = 0; k < managedLists.size(); k++) { + List<Span> sl = managedLists.get(k); + result.append("{"); + if (sl.size() != 0) { + for (int i = 1; i < sl.size() - 1; i++) result.append(sl.get(i) + ", "); + result.append(sl.get(sl.size() - 1)); + } + result.append("}\n"); + } + } + + result.append("------------------------------------------------------------------------------"); + + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SrcPosRangeChecker.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SrcPosRangeChecker.java index d9b35f36..327f91fc 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SrcPosRangeChecker.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SrcPosRangeChecker.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. 
The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -42,121 +42,120 @@ * that e.g. a ContentElement conatins a Link which isn't in the Range of * this ContentElement. This must be done because some positons will be * jammed by the parsing process, e.g. if a Link is the start of a Paragrah. - * */ public class SrcPosRangeChecker { - public static void checkRange( ParsedPage pp ){ - for( Section s: pp.getSections() ){ - if( s.getClass() == SectionContent.class ) - checkRange( (SectionContent)s ); - else - checkRange( (SectionContainer)s ); - } - } - - private static void checkRange( SectionContainer sc ){ - if( sc.getTitleElement()!= null ) - checkRange( sc.getTitleElement() ); - - for( Section s: sc.getSubSections() ){ - if( s.getClass() == SectionContent.class ) - checkRange( (SectionContent)s ); - else - checkRange( (SectionContainer)s ); - } - } - - private static void checkRange( SectionContent s ){ - List<SrcSpan> eil = new ArrayList<>(); - - if( s.getTitleElement()!= null ){ - checkRange( s.getTitleElement() ); - eil.add( s.getTitleElement().getSrcSpan() ); - } - - for( Paragraph p: s.getParagraphs() ){ - checkRange( p ); - eil.add( p.getSrcSpan() ); - } - - for( DefinitionList dl: s.getDefinitionLists() ){ - checkRange( dl ); - eil.add( dl.getSrcSpan() ); - } - - for( NestedListContainer nl: s.getNestedLists() ){ - checkRange( nl ); - eil.add( nl.getSrcSpan() ); - } - - for( Table t: s.getTables() ){ - checkRange( t ); - eil.add( t.getSrcSpan() ); - } - - s.setSrcSpan( getEvalInfo( 
s.getSrcSpan(), eil)); - } - - private static void checkRange( DefinitionList dl ){ - - } - - private static void checkRange( NestedListContainer nlc ){ - for( NestedList nl: nlc.getNestedLists() ){ - if( nl.getClass() == NestedListContainer.class ) - checkRange( (NestedListContainer)nl ); - else - checkRange( (ContentElement)nl ); - } - } - - private static void checkRange( Table t ){ - List<SrcSpan> eil = new ArrayList<>(); - - for( int i=0; i<t.nrOfTableElements(); i++){ - TableElement te = t.getTableElement(i); - checkRange( te ); - eil.add( te.getSrcSpan() ); - } - - t.setSrcSpan( getEvalInfo( t.getSrcSpan(), eil)); - } - - private static void checkRange( TableElement te ){ - List<SrcSpan> eil = new ArrayList<>(); - - for( Section s: te.getSubSections() ){ - if( s.getClass() == SectionContent.class ) - checkRange( (SectionContent)s ); - else - checkRange( (SectionContainer)s ); - } - - te.setSrcSpan( getEvalInfo( te.getSrcSpan(), eil ) ); - } - - private static void checkRange( ContentElement ce ){ - List<SrcSpan> eil = new ArrayList<>(); - for( Span s: ce.getFormatSpans( FormatType.BOLD ) ) eil.add( s.getSrcSpan() ); - for( Span s: ce.getFormatSpans( FormatType.ITALIC ) ) eil.add( s.getSrcSpan() ); - for( Span s: ce.getFormatSpans( FormatType.MATH ) ) eil.add( s.getSrcSpan() ); - for( Span s: ce.getFormatSpans( FormatType.TAG ) ) eil.add( s.getSrcSpan() ); - for( Span s: ce.getFormatSpans( FormatType.NOWIKI ) ) eil.add( s.getSrcSpan() ); - for( Link l: ce.getLinks()) eil.add( l.getSrcSpan() ); - for( Template t: ce.getTemplates() ) eil.add( t.getSrcSpan() ); - - ce.setSrcSpan( getEvalInfo( ce.getSrcSpan(), eil) ); - } - - private static SrcSpan getEvalInfo( SrcSpan e, List<SrcSpan> eil ){ - int start = e.getStart(); - int end = e.getEnd(); - - for( SrcSpan ei: eil ){ - if( start==-1 ||( start > ei.getStart() && ei.getStart() != -1 ) ) start = ei.getStart(); - if( end < ei.getEnd()) end = ei.getEnd(); - } - return new SrcSpan( start, end ); - } + public static 
void checkRange(ParsedPage pp) { + for (Section s : pp.getSections()) { + if (s.getClass() == SectionContent.class) + checkRange((SectionContent) s); + else + checkRange((SectionContainer) s); + } + } + + private static void checkRange(SectionContainer sc) { + if (sc.getTitleElement() != null) + checkRange(sc.getTitleElement()); + + for (Section s : sc.getSubSections()) { + if (s.getClass() == SectionContent.class) + checkRange((SectionContent) s); + else + checkRange((SectionContainer) s); + } + } + + private static void checkRange(SectionContent s) { + List<SrcSpan> eil = new ArrayList<>(); + + if (s.getTitleElement() != null) { + checkRange(s.getTitleElement()); + eil.add(s.getTitleElement().getSrcSpan()); + } + + for (Paragraph p : s.getParagraphs()) { + checkRange(p); + eil.add(p.getSrcSpan()); + } + + for (DefinitionList dl : s.getDefinitionLists()) { + checkRange(dl); + eil.add(dl.getSrcSpan()); + } + + for (NestedListContainer nl : s.getNestedLists()) { + checkRange(nl); + eil.add(nl.getSrcSpan()); + } + + for (Table t : s.getTables()) { + checkRange(t); + eil.add(t.getSrcSpan()); + } + + s.setSrcSpan(getEvalInfo(s.getSrcSpan(), eil)); + } + + private static void checkRange(DefinitionList dl) { + + } + + private static void checkRange(NestedListContainer nlc) { + for (NestedList nl : nlc.getNestedLists()) { + if (nl.getClass() == NestedListContainer.class) + checkRange((NestedListContainer) nl); + else + checkRange((ContentElement) nl); + } + } + + private static void checkRange(Table t) { + List<SrcSpan> eil = new ArrayList<>(); + + for (int i = 0; i < t.nrOfTableElements(); i++) { + TableElement te = t.getTableElement(i); + checkRange(te); + eil.add(te.getSrcSpan()); + } + + t.setSrcSpan(getEvalInfo(t.getSrcSpan(), eil)); + } + + private static void checkRange(TableElement te) { + List<SrcSpan> eil = new ArrayList<>(); + + for (Section s : te.getSubSections()) { + if (s.getClass() == SectionContent.class) + checkRange((SectionContent) s); + else + 
checkRange((SectionContainer) s); + } + + te.setSrcSpan(getEvalInfo(te.getSrcSpan(), eil)); + } + + private static void checkRange(ContentElement ce) { + List<SrcSpan> eil = new ArrayList<>(); + for (Span s : ce.getFormatSpans(FormatType.BOLD)) eil.add(s.getSrcSpan()); + for (Span s : ce.getFormatSpans(FormatType.ITALIC)) eil.add(s.getSrcSpan()); + for (Span s : ce.getFormatSpans(FormatType.MATH)) eil.add(s.getSrcSpan()); + for (Span s : ce.getFormatSpans(FormatType.TAG)) eil.add(s.getSrcSpan()); + for (Span s : ce.getFormatSpans(FormatType.NOWIKI)) eil.add(s.getSrcSpan()); + for (Link l : ce.getLinks()) eil.add(l.getSrcSpan()); + for (Template t : ce.getTemplates()) eil.add(t.getSrcSpan()); + + ce.setSrcSpan(getEvalInfo(ce.getSrcSpan(), eil)); + } + + private static SrcSpan getEvalInfo(SrcSpan e, List<SrcSpan> eil) { + int start = e.getStart(); + int end = e.getEnd(); + + for (SrcSpan ei : eil) { + if (start == -1 || (start > ei.getStart() && ei.getStart() != -1)) start = ei.getStart(); + if (end < ei.getEnd()) end = ei.getEnd(); + } + return new SrcSpan(start, end); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java index 5f3b4426..426cf2c3 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,114 +26,92 @@ import org.dkpro.jwpl.parser.selectiveaccess.SelectiveAccessHandler.CIT; import org.dkpro.jwpl.parser.selectiveaccess.SelectiveAccessHandler.SIT; -class ConfigLoader extends DefaultHandler{ - final SelectiveAccessHandler sah; - - private EnumMap<CIT, Boolean> citm; - private EnumMap<SIT, EnumMap<CIT, Boolean>> sitm; - private Attributes secatt; - - private Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>> > sectionHandling; - - public ConfigLoader( SelectiveAccessHandler sah ){ - this.sah = sah; - } +class ConfigLoader extends DefaultHandler { + final SelectiveAccessHandler sah; - public void startElement(String uri, String localName, String qName, Attributes att){ - if( localName.equalsIgnoreCase( "cit" )){ - citm = SelectiveAccessHandler.buildCITMap( - "true".equalsIgnoreCase( att.getValue( "text" ) ), - "true".equalsIgnoreCase( att.getValue( "bold" ) ), - "true".equalsIgnoreCase( att.getValue( "italic" ) ), - "true".equalsIgnoreCase( att.getValue( "link" ) ) - ); - } - else if( localName.equalsIgnoreCase("section") ){ - sitm = new EnumMap<>(SIT.class); - secatt = att; - } - else if( localName.equalsIgnoreCase( SIT.SUBS.toString() ) ){ - citm = null; - } - else if( localName.equalsIgnoreCase( SIT.TITLE.toString() ) ){ - citm = null; - } - else if( localName.equalsIgnoreCase( SIT.DEFLIST.toString() ) ){ - citm = null; - } - else if( localName.equalsIgnoreCase( SIT.TABLE.toString() ) ){ - citm = null; - } - else if( localName.equalsIgnoreCase( SIT.NESTLIST.toString() ) ){ - citm = null; - } - else if( localName.equalsIgnoreCase( SIT.PARA.toString() ) ){ - citm = null; - } - else if( localName.equalsIgnoreCase("page") ){ - citm = null; - } - else if( 
localName.equalsIgnoreCase("firstParagraph")){ - citm = null; - } - else if( localName.equalsIgnoreCase("SelectiveAccessHandlerConfig")){ - sah.setPageHandling( null ); - sah.setFirstParagraphHandling( null ); - sectionHandling = sah.getSectionHandling(); - sectionHandling.clear(); - } - else{ - System.err.println("UnhandledElement: "+localName); - } - } - - public void endElement(String uri, String localName, String qName){ - if( localName.equalsIgnoreCase( "cit" )){ - // do nothing... - } - else if( localName.equalsIgnoreCase("section") ){ - String name = secatt.getValue("name"); - - if( name != null ) - if( name.startsWith( SelectiveAccessHandler.SectionType.DEFAULT_SECTION.toString()) || - name.startsWith( SelectiveAccessHandler.SectionType.SECTION_LEVEL.toString()) || - name.startsWith( SelectiveAccessHandler.SectionType.USER_SECTION.toString()) ) - sectionHandling.put( name, sitm ); - else - sectionHandling.put( SelectiveAccessHandler.SectionType.USER_SECTION +name, sitm ); - else - sah.setDefaultSectionHandling( sitm ); - - } - else if( localName.equalsIgnoreCase( SIT.SUBS.toString() ) ){ - sitm.put( SIT.SUBS, citm ); - } - else if( localName.equalsIgnoreCase( SIT.TITLE.toString() ) ){ - sitm.put( SIT.TITLE, citm ); - } - else if( localName.equalsIgnoreCase( SIT.TABLE.toString() ) ){ - sitm.put( SIT.TABLE, citm ); - } - else if( localName.equalsIgnoreCase( SIT.DEFLIST.toString() ) ){ - sitm.put( SIT.DEFLIST, citm ); - } - else if( localName.equalsIgnoreCase( SIT.NESTLIST.toString() ) ){ - sitm.put( SIT.NESTLIST, citm ); - } - else if( localName.equalsIgnoreCase( SIT.PARA.toString() ) ){ - sitm.put( SIT.PARA, citm ); - } - else if( localName.equalsIgnoreCase("page") ){ - sah.setPageHandling( citm ); - } - else if( localName.equalsIgnoreCase("firstParagraph")){ - sah.setFirstParagraphHandling( citm ); - } - else if( localName.equalsIgnoreCase("SelectiveAccessHandlerConfig")){ - - } - else{ - System.err.println("UnhandledElement: "+localName); - } - } + private 
EnumMap<CIT, Boolean> citm; + private EnumMap<SIT, EnumMap<CIT, Boolean>> sitm; + private Attributes secatt; + + private Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling; + + public ConfigLoader(SelectiveAccessHandler sah) { + this.sah = sah; + } + + public void startElement(String uri, String localName, String qName, Attributes att) { + if (localName.equalsIgnoreCase("cit")) { + citm = SelectiveAccessHandler.buildCITMap( + "true".equalsIgnoreCase(att.getValue("text")), + "true".equalsIgnoreCase(att.getValue("bold")), + "true".equalsIgnoreCase(att.getValue("italic")), + "true".equalsIgnoreCase(att.getValue("link")) + ); + } else if (localName.equalsIgnoreCase("section")) { + sitm = new EnumMap<>(SIT.class); + secatt = att; + } else if (localName.equalsIgnoreCase(SIT.SUBS.toString())) { + citm = null; + } else if (localName.equalsIgnoreCase(SIT.TITLE.toString())) { + citm = null; + } else if (localName.equalsIgnoreCase(SIT.DEFLIST.toString())) { + citm = null; + } else if (localName.equalsIgnoreCase(SIT.TABLE.toString())) { + citm = null; + } else if (localName.equalsIgnoreCase(SIT.NESTLIST.toString())) { + citm = null; + } else if (localName.equalsIgnoreCase(SIT.PARA.toString())) { + citm = null; + } else if (localName.equalsIgnoreCase("page")) { + citm = null; + } else if (localName.equalsIgnoreCase("firstParagraph")) { + citm = null; + } else if (localName.equalsIgnoreCase("SelectiveAccessHandlerConfig")) { + sah.setPageHandling(null); + sah.setFirstParagraphHandling(null); + sectionHandling = sah.getSectionHandling(); + sectionHandling.clear(); + } else { + System.err.println("UnhandledElement: " + localName); + } + } + + public void endElement(String uri, String localName, String qName) { + if (localName.equalsIgnoreCase("cit")) { + // do nothing... 
+ } else if (localName.equalsIgnoreCase("section")) { + String name = secatt.getValue("name"); + + if (name != null) + if (name.startsWith(SelectiveAccessHandler.SectionType.DEFAULT_SECTION.toString()) || + name.startsWith(SelectiveAccessHandler.SectionType.SECTION_LEVEL.toString()) || + name.startsWith(SelectiveAccessHandler.SectionType.USER_SECTION.toString())) + sectionHandling.put(name, sitm); + else + sectionHandling.put(SelectiveAccessHandler.SectionType.USER_SECTION + name, sitm); + else + sah.setDefaultSectionHandling(sitm); + + } else if (localName.equalsIgnoreCase(SIT.SUBS.toString())) { + sitm.put(SIT.SUBS, citm); + } else if (localName.equalsIgnoreCase(SIT.TITLE.toString())) { + sitm.put(SIT.TITLE, citm); + } else if (localName.equalsIgnoreCase(SIT.TABLE.toString())) { + sitm.put(SIT.TABLE, citm); + } else if (localName.equalsIgnoreCase(SIT.DEFLIST.toString())) { + sitm.put(SIT.DEFLIST, citm); + } else if (localName.equalsIgnoreCase(SIT.NESTLIST.toString())) { + sitm.put(SIT.NESTLIST, citm); + } else if (localName.equalsIgnoreCase(SIT.PARA.toString())) { + sitm.put(SIT.PARA, citm); + } else if (localName.equalsIgnoreCase("page")) { + sah.setPageHandling(citm); + } else if (localName.equalsIgnoreCase("firstParagraph")) { + sah.setFirstParagraphHandling(citm); + } else if (localName.equalsIgnoreCase("SelectiveAccessHandlerConfig")) { + + } else { + System.err.println("UnhandledElement: " + localName); + } + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java index 68aa12dd..e8d3d21c 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor 
license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -48,361 +48,356 @@ */ public class SelectiveAccessHandler { - enum CIT{ TEXT, BOLD, ITALIC, LINK } + enum CIT {TEXT, BOLD, ITALIC, LINK} - enum SIT{ SUBS, TITLE, TABLE, DEFLIST, NESTLIST, PARA } + enum SIT {SUBS, TITLE, TABLE, DEFLIST, NESTLIST, PARA} - protected enum SectionType{ DEFAULT_SECTION, SECTION_LEVEL, USER_SECTION } + protected enum SectionType {DEFAULT_SECTION, SECTION_LEVEL, USER_SECTION} private EnumMap<CIT, Boolean> firstParagraphHandling; - private EnumMap<CIT, Boolean> pageHandling; - private Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>> > sectionHandling; - private int levelModifier = 0; - - /** - * Creates an SelectiveAccessHandler... ready to config... 
- */ - public SelectiveAccessHandler() { - loadConfig(); - } - - /** - * Creates an SelectiveAccessHandler and loads the config from an XMLFile - */ - public SelectiveAccessHandler(String XMLFile ) { - loadConfig( XMLFile ); - } - - public static EnumMap<CIT, Boolean> buildCITMap( boolean text, boolean bold, boolean italic, boolean link ){ - EnumMap<CIT, Boolean> result = new EnumMap<>(CIT.class); - result.put( CIT.TEXT, text ); - result.put( CIT.BOLD, bold ); - result.put( CIT.ITALIC, italic ); - result.put( CIT.LINK, link ); - return result; - } - - public static EnumMap<SIT, EnumMap<CIT, Boolean>> buildSITMap( EnumMap<CIT, Boolean> subs, EnumMap<CIT, Boolean> title, EnumMap<CIT, Boolean> table, EnumMap<CIT, Boolean> deflist, EnumMap<CIT, Boolean> nestedlist, EnumMap<CIT, Boolean> paragraph ){ - EnumMap<SIT, EnumMap<CIT, Boolean>> result = new EnumMap<>(SIT.class); - result.put( SIT.SUBS, subs ); - result.put( SIT.TITLE, title ); - result.put( SIT.TABLE, table ); - result.put( SIT.DEFLIST, deflist ); - result.put( SIT.NESTLIST, nestedlist ); - result.put( SIT.PARA, paragraph ); - return result; - } - - /** - * if firstParagraphHandling is null, there will be no special handling for the FirstParagraph... - */ - public void setFirstParagraphHandling( EnumMap<CIT, Boolean> firstParagraphHandling ) { - this.firstParagraphHandling = firstParagraphHandling; - } - - /** - * if pageHandling is null, there will be no special handling for the WHOLE PAGE, this means, the handling will be sectionwhise... - */ - public void setPageHandling( EnumMap<CIT, Boolean> pageHandling ) { - this.pageHandling = pageHandling; - } - - /** - * @return the sectionHandling - */ - public Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> getSectionHandling() { - return sectionHandling; - } - - /** - * Be sure to set the Default Section Handling to avoid errors... 
- */ - public void setSectionHandling( Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling ) { - this.sectionHandling = sectionHandling; - } - - /** - * adds section handling for a specified relative level... - */ - public void addSectionHandling( int level, EnumMap<SIT, EnumMap<CIT, Boolean>> sh ){ - sectionHandling.put( SectionType.SECTION_LEVEL.toString()+level, sh ); - } - - /** - * adds section handling for a specila section name... - */ - public void addSectionHandling( String name, EnumMap<SIT, EnumMap<CIT, Boolean>> sh ){ - sectionHandling.put( SectionType.USER_SECTION + name.toUpperCase(), sh); - } - - /** - * sets the section handling for all sections which are not set by level or name... - */ - public void setDefaultSectionHandling( EnumMap<SIT, EnumMap<CIT, Boolean>> sh ){ - sectionHandling.put( SectionType.DEFAULT_SECTION.toString(), sh ); - } - - /** - * Returns information which infomations are selected by the actual configuration - */ - public String getSelectionInfo(){ - StringBuilder result = new StringBuilder(); - - result.append( "SelectionInfo: "+ this.getClass() +"\n" ); - result.append( "Page:"+ CITInfo( pageHandling )+"\n" ); - result.append( "FirstParagraph:" +CITInfo( firstParagraphHandling )+"\n"); - for( String key: sectionHandling.keySet() ){ - final String uss = SectionType.USER_SECTION.toString(); - if( key.startsWith( uss ) ) - result.append(uss+"["+key.substring( uss.length() )+"]:\n"); - else - result.append(key+":\n"); - - result.append( SITInfo( sectionHandling.get(key))+"\n" ); - } - - return result.toString(); - } - - /** - * Converts a CITMap into a human readable String - */ - public static String CITInfo( EnumMap<CIT, Boolean> hp ){ - StringBuilder result = new StringBuilder(); - result.append( "["); - if( hp!= null ){ - for( CIT key: hp.keySet()) - result.append( key.toString()+":"+hp.get(key)+", "); - result.delete( result.length()-2, result.length() ); - } - result.append( "]" ); - return result.toString(); - 
} - - /** - * Converts a SITMap into a human readable String - */ - public static String SITInfo( EnumMap<SIT, EnumMap<CIT, Boolean>> shp ){ - StringBuilder result = new StringBuilder(); - for( SIT key: shp.keySet() ){ - result.append("\t"+key.toString()+":"+CITInfo( shp.get(key))+"\n"); - } - return result.toString(); - } - - private void deleteParagraph( int nr, List<Section> sections ){ - int temp = nr; - - for( Section s: sections ){ - nr = temp; - temp -= s.nrOfParagraphs(); - - if( temp >= 0 ) continue; - - if( s.getClass() == SectionContainer.class ) - deleteParagraph( nr ,((SectionContainer)s).getSubSections() ); - else{ - SectionContent sc = (SectionContent)s; - sc.removeParagraph( sc.getParagraph( nr ) ); - } - - break; - } - } - - /** - * Returns the Information of a ParsedPage which are selected by the actual configuration - */ - public String getSelectedText( ParsedPage pp ){ - if( pp == null ) return null; - - StringBuilder sb = new StringBuilder(); - - levelModifier = pp.getSection(0).getLevel()-1; - - if( pageHandling == null ){ - if( firstParagraphHandling != null ){ - handleContent( pp.getFirstParagraph(), firstParagraphHandling, sb ); - deleteParagraph( pp.getFirstParagraphNr(), pp.getSections() ); - } - for( Section s: pp.getSections() ) - handleSection( s, sb ); - } - else{ - if( pageHandling.get( CIT.TEXT ) ){ - sb.append( pp.getText() ); - } - else{ - if( pageHandling.get( CIT.BOLD )){ - handleSpans( pp.getFormatSpans( FormatType.BOLD ), pp.getText(), sb ); - } - if( pageHandling.get( CIT.ITALIC )){ - handleSpans( pp.getFormatSpans( FormatType.ITALIC ), pp.getText(), sb ); - } - } - - if( pageHandling.get( CIT.LINK )) - handleLinks( pp.getLinks(), !pageHandling.get( CIT.TEXT ), sb ); - } - - return sb.toString().trim(); - } - - private static void handleContent( Content c, EnumMap<CIT, Boolean> hp, StringBuilder sb ){ - if( hp != null ){ - if( hp.get( CIT.TEXT )) - sb.append( c.getText()+" " ); - else{ - if( hp.get( CIT.BOLD ) ) - 
handleSpans( c.getFormatSpans( FormatType.BOLD), c.getText(), sb ); - if( hp.get( CIT.ITALIC )) - handleSpans( c.getFormatSpans( FormatType.ITALIC), c.getText(), sb ); - } - if( hp.get( CIT.LINK )) - handleLinks( c.getLinks(), !hp.get( CIT.TEXT ), sb ); - } - } - - private void handleSection( Section s, StringBuilder sb ){ - EnumMap<SIT, EnumMap<CIT, Boolean>> hp = null; - - if( s.getTitle()!= null ) hp = sectionHandling.get( SectionType.USER_SECTION +s.getTitle().toUpperCase() ); - if( hp == null ) hp = sectionHandling.get(SectionType.SECTION_LEVEL.toString()+(s.getLevel()-levelModifier)); - if( hp == null ) hp = sectionHandling.get(SectionType.DEFAULT_SECTION.toString()); - if( hp == null ){ - System.err.println( "Cannot get Handling Parameters for Section:\""+ s.getTitle()+"\" Level:"+s.getLevel() ); - return; - } - - handleContent( s.getTitleElement(), hp.get( SIT.TITLE ), sb ); - - if( s.getClass() == SectionContainer.class ){ - if( hp.get( SIT.SUBS )!= null ) - handleContent( s, hp.get( SIT.SUBS ), sb ); - else - for( Section ss: ((SectionContainer)s).getSubSections() ) - handleSection( ss, sb ); - } - else{ - EnumMap<CIT, Boolean> hpx; - - hpx = hp.get( SIT.TABLE ); - if( hpx != null ) - for( Table t: s.getTables() ) - handleContent( t, hpx, sb ); - - hpx = hp.get( SIT.NESTLIST ); - if( hpx != null ) - for( NestedList nl: s.getNestedLists() ) - handleContent( nl, hpx, sb ); - - hpx = hp.get( SIT.PARA ); - if( hpx != null ) - for( Paragraph p: s.getParagraphs() ) - handleContent( p, hpx, sb ); - - hpx = hp.get( SIT.DEFLIST ); - if( hpx != null ) - for( DefinitionList dl: s.getDefinitionLists() ) - handleContent( dl, hpx, sb ); - } - } - - private static void handleSpans( List<Span> spans, String text, StringBuilder sb ){ - for( Span s: spans ) - sb.append( text.substring( s.getStart(), s.getEnd() )+" "); - } - - private static void handleLinks( List<Link> links, boolean linktext, StringBuilder sb ){ - for( Link l: links ){ - switch( l.getType() ){ - case 
INTERNAL: - String lText = l.getText(); - String lTarget = l.getTarget(); - if( linktext ) sb.append( lText+" " ); - if( !lText.equals( lTarget )) sb.append( lTarget+" " ); - break; - case EXTERNAL: - sb.append( l.getText()+" " ); - break; - case IMAGE: - case AUDIO: - case VIDEO: - // do nothing ! - break; - } - } - } - - /** - * Loads the Default Config... (shows nothing at all, but ready to config...) - */ - private void loadConfig(){ - firstParagraphHandling = null; - pageHandling = null; - sectionHandling = new HashMap<>(); - setDefaultSectionHandling( buildSITMap( buildCITMap( false, false, false, false ), null, null, null, null, null ) ); - } - - /** - * Loads a Configuration from an XMLFile... - */ - public void loadConfig( String XMLFile ){ - try{ - sectionHandling = new HashMap<>(); - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setNamespaceAware(true); - SAXParser sp = factory.newSAXParser(); - DefaultHandler handler = new ConfigLoader( this ); - sp.parse( XMLFile, handler ); - } - catch( Exception e ){ - System.err.println( e ); - loadConfig(); - } - } - - private static String XMLCIT( EnumMap<CIT, Boolean> em ){ - StringBuilder result = new StringBuilder(); - result.append( "<cit" ); - if( em != null ) - for( CIT key: em.keySet() ) - result.append( " "+ key.toString()+"=\""+em.get(key)+"\"" ); - result.append( "/>" ); - return result.toString(); - } - - private static String XMLSIT( EnumMap<SIT, EnumMap<CIT, Boolean>> sem ){ - StringBuilder result = new StringBuilder(); - for( SIT key: sem.keySet() ){ - result.append( "<"+key.toString()+">"); - result.append( XMLCIT( sem.get( key ) ) ); - result.append( "</"+ key +">\n"); - } - return result.toString(); - } - - /** - * writes an XML configuration file... 
- */ - public void writeConfig( String XMLFile ){ - try{ - BufferedWriter bw = new BufferedWriter( new FileWriter( XMLFile ) ); - - bw.write( "<SelectiveAccessHandlerConfig>\n" ); - bw.write( "<page>"+XMLCIT( pageHandling )+"</page>\n" ); - bw.write( "<firstparagraph>"+XMLCIT( pageHandling )+"</firstparagraph>\n" ); - for( String key: sectionHandling.keySet() ){ - bw.write( "<section name=\""+key+"\">\n" ); - bw.write( XMLSIT( sectionHandling.get(key) )); - bw.write( "</section>\n" ); - } - bw.write( "<SelectiveAccessHandlerConfig>\n" ); - - bw.close(); - } - catch( IOException e ){ - System.err.println( e ); - } - } + private EnumMap<CIT, Boolean> pageHandling; + private Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling; + private int levelModifier = 0; + + /** + * Creates an SelectiveAccessHandler... ready to config... + */ + public SelectiveAccessHandler() { + loadConfig(); + } + + /** + * Creates an SelectiveAccessHandler and loads the config from an XMLFile + */ + public SelectiveAccessHandler(String XMLFile) { + loadConfig(XMLFile); + } + + public static EnumMap<CIT, Boolean> buildCITMap(boolean text, boolean bold, boolean italic, boolean link) { + EnumMap<CIT, Boolean> result = new EnumMap<>(CIT.class); + result.put(CIT.TEXT, text); + result.put(CIT.BOLD, bold); + result.put(CIT.ITALIC, italic); + result.put(CIT.LINK, link); + return result; + } + + public static EnumMap<SIT, EnumMap<CIT, Boolean>> buildSITMap(EnumMap<CIT, Boolean> subs, EnumMap<CIT, Boolean> title, EnumMap<CIT, Boolean> table, EnumMap<CIT, Boolean> deflist, EnumMap<CIT, Boolean> nestedlist, EnumMap<CIT, Boolean> paragraph) { + EnumMap<SIT, EnumMap<CIT, Boolean>> result = new EnumMap<>(SIT.class); + result.put(SIT.SUBS, subs); + result.put(SIT.TITLE, title); + result.put(SIT.TABLE, table); + result.put(SIT.DEFLIST, deflist); + result.put(SIT.NESTLIST, nestedlist); + result.put(SIT.PARA, paragraph); + return result; + } + + /** + * if firstParagraphHandling is null, there will 
be no special handling for the FirstParagraph... + */ + public void setFirstParagraphHandling(EnumMap<CIT, Boolean> firstParagraphHandling) { + this.firstParagraphHandling = firstParagraphHandling; + } + + /** + * if pageHandling is null, there will be no special handling for the WHOLE PAGE, this means, the handling will be sectionwhise... + */ + public void setPageHandling(EnumMap<CIT, Boolean> pageHandling) { + this.pageHandling = pageHandling; + } + + /** + * @return the sectionHandling + */ + public Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> getSectionHandling() { + return sectionHandling; + } + + /** + * Be sure to set the Default Section Handling to avoid errors... + */ + public void setSectionHandling(Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling) { + this.sectionHandling = sectionHandling; + } + + /** + * adds section handling for a specified relative level... + */ + public void addSectionHandling(int level, EnumMap<SIT, EnumMap<CIT, Boolean>> sh) { + sectionHandling.put(SectionType.SECTION_LEVEL.toString() + level, sh); + } + + /** + * adds section handling for a specila section name... + */ + public void addSectionHandling(String name, EnumMap<SIT, EnumMap<CIT, Boolean>> sh) { + sectionHandling.put(SectionType.USER_SECTION + name.toUpperCase(), sh); + } + + /** + * sets the section handling for all sections which are not set by level or name... 
+ */ + public void setDefaultSectionHandling(EnumMap<SIT, EnumMap<CIT, Boolean>> sh) { + sectionHandling.put(SectionType.DEFAULT_SECTION.toString(), sh); + } + + /** + * Returns information which infomations are selected by the actual configuration + */ + public String getSelectionInfo() { + StringBuilder result = new StringBuilder(); + + result.append("SelectionInfo: " + this.getClass() + "\n"); + result.append("Page:" + CITInfo(pageHandling) + "\n"); + result.append("FirstParagraph:" + CITInfo(firstParagraphHandling) + "\n"); + for (String key : sectionHandling.keySet()) { + final String uss = SectionType.USER_SECTION.toString(); + if (key.startsWith(uss)) + result.append(uss + "[" + key.substring(uss.length()) + "]:\n"); + else + result.append(key + ":\n"); + + result.append(SITInfo(sectionHandling.get(key)) + "\n"); + } + + return result.toString(); + } + + /** + * Converts a CITMap into a human readable String + */ + public static String CITInfo(EnumMap<CIT, Boolean> hp) { + StringBuilder result = new StringBuilder(); + result.append("["); + if (hp != null) { + for (CIT key : hp.keySet()) + result.append(key.toString() + ":" + hp.get(key) + ", "); + result.delete(result.length() - 2, result.length()); + } + result.append("]"); + return result.toString(); + } + + /** + * Converts a SITMap into a human readable String + */ + public static String SITInfo(EnumMap<SIT, EnumMap<CIT, Boolean>> shp) { + StringBuilder result = new StringBuilder(); + for (SIT key : shp.keySet()) { + result.append("\t" + key.toString() + ":" + CITInfo(shp.get(key)) + "\n"); + } + return result.toString(); + } + + private void deleteParagraph(int nr, List<Section> sections) { + int temp = nr; + + for (Section s : sections) { + nr = temp; + temp -= s.nrOfParagraphs(); + + if (temp >= 0) continue; + + if (s.getClass() == SectionContainer.class) + deleteParagraph(nr, ((SectionContainer) s).getSubSections()); + else { + SectionContent sc = (SectionContent) s; + 
sc.removeParagraph(sc.getParagraph(nr)); + } + + break; + } + } + + /** + * Returns the Information of a ParsedPage which are selected by the actual configuration + */ + public String getSelectedText(ParsedPage pp) { + if (pp == null) return null; + + StringBuilder sb = new StringBuilder(); + + levelModifier = pp.getSection(0).getLevel() - 1; + + if (pageHandling == null) { + if (firstParagraphHandling != null) { + handleContent(pp.getFirstParagraph(), firstParagraphHandling, sb); + deleteParagraph(pp.getFirstParagraphNr(), pp.getSections()); + } + for (Section s : pp.getSections()) + handleSection(s, sb); + } else { + if (pageHandling.get(CIT.TEXT)) { + sb.append(pp.getText()); + } else { + if (pageHandling.get(CIT.BOLD)) { + handleSpans(pp.getFormatSpans(FormatType.BOLD), pp.getText(), sb); + } + if (pageHandling.get(CIT.ITALIC)) { + handleSpans(pp.getFormatSpans(FormatType.ITALIC), pp.getText(), sb); + } + } + + if (pageHandling.get(CIT.LINK)) + handleLinks(pp.getLinks(), !pageHandling.get(CIT.TEXT), sb); + } + + return sb.toString().trim(); + } + + private static void handleContent(Content c, EnumMap<CIT, Boolean> hp, StringBuilder sb) { + if (hp != null) { + if (hp.get(CIT.TEXT)) + sb.append(c.getText() + " "); + else { + if (hp.get(CIT.BOLD)) + handleSpans(c.getFormatSpans(FormatType.BOLD), c.getText(), sb); + if (hp.get(CIT.ITALIC)) + handleSpans(c.getFormatSpans(FormatType.ITALIC), c.getText(), sb); + } + if (hp.get(CIT.LINK)) + handleLinks(c.getLinks(), !hp.get(CIT.TEXT), sb); + } + } + + private void handleSection(Section s, StringBuilder sb) { + EnumMap<SIT, EnumMap<CIT, Boolean>> hp = null; + + if (s.getTitle() != null) hp = sectionHandling.get(SectionType.USER_SECTION + s.getTitle().toUpperCase()); + if (hp == null) hp = sectionHandling.get(SectionType.SECTION_LEVEL.toString() + (s.getLevel() - levelModifier)); + if (hp == null) hp = sectionHandling.get(SectionType.DEFAULT_SECTION.toString()); + if (hp == null) { + System.err.println("Cannot get 
Handling Parameters for Section:\"" + s.getTitle() + "\" Level:" + s.getLevel()); + return; + } + + handleContent(s.getTitleElement(), hp.get(SIT.TITLE), sb); + + if (s.getClass() == SectionContainer.class) { + if (hp.get(SIT.SUBS) != null) + handleContent(s, hp.get(SIT.SUBS), sb); + else + for (Section ss : ((SectionContainer) s).getSubSections()) + handleSection(ss, sb); + } else { + EnumMap<CIT, Boolean> hpx; + + hpx = hp.get(SIT.TABLE); + if (hpx != null) + for (Table t : s.getTables()) + handleContent(t, hpx, sb); + + hpx = hp.get(SIT.NESTLIST); + if (hpx != null) + for (NestedList nl : s.getNestedLists()) + handleContent(nl, hpx, sb); + + hpx = hp.get(SIT.PARA); + if (hpx != null) + for (Paragraph p : s.getParagraphs()) + handleContent(p, hpx, sb); + + hpx = hp.get(SIT.DEFLIST); + if (hpx != null) + for (DefinitionList dl : s.getDefinitionLists()) + handleContent(dl, hpx, sb); + } + } + + private static void handleSpans(List<Span> spans, String text, StringBuilder sb) { + for (Span s : spans) + sb.append(text.substring(s.getStart(), s.getEnd()) + " "); + } + + private static void handleLinks(List<Link> links, boolean linktext, StringBuilder sb) { + for (Link l : links) { + switch (l.getType()) { + case INTERNAL: + String lText = l.getText(); + String lTarget = l.getTarget(); + if (linktext) sb.append(lText + " "); + if (!lText.equals(lTarget)) sb.append(lTarget + " "); + break; + case EXTERNAL: + sb.append(l.getText() + " "); + break; + case IMAGE: + case AUDIO: + case VIDEO: + // do nothing ! + break; + } + } + } + + /** + * Loads the Default Config... (shows nothing at all, but ready to config...) + */ + private void loadConfig() { + firstParagraphHandling = null; + pageHandling = null; + sectionHandling = new HashMap<>(); + setDefaultSectionHandling(buildSITMap(buildCITMap(false, false, false, false), null, null, null, null, null)); + } + + /** + * Loads a Configuration from an XMLFile... 
+ */ + public void loadConfig(String XMLFile) { + try { + sectionHandling = new HashMap<>(); + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + SAXParser sp = factory.newSAXParser(); + DefaultHandler handler = new ConfigLoader(this); + sp.parse(XMLFile, handler); + } catch (Exception e) { + System.err.println(e); + loadConfig(); + } + } + + private static String XMLCIT(EnumMap<CIT, Boolean> em) { + StringBuilder result = new StringBuilder(); + result.append("<cit"); + if (em != null) + for (CIT key : em.keySet()) + result.append(" " + key.toString() + "=\"" + em.get(key) + "\""); + result.append("/>"); + return result.toString(); + } + + private static String XMLSIT(EnumMap<SIT, EnumMap<CIT, Boolean>> sem) { + StringBuilder result = new StringBuilder(); + for (SIT key : sem.keySet()) { + result.append("<" + key.toString() + ">"); + result.append(XMLCIT(sem.get(key))); + result.append("</" + key + ">\n"); + } + return result.toString(); + } + + /** + * writes an XML configuration file... 
+ */ + public void writeConfig(String XMLFile) { + try { + BufferedWriter bw = new BufferedWriter(new FileWriter(XMLFile)); + + bw.write("<SelectiveAccessHandlerConfig>\n"); + bw.write("<page>" + XMLCIT(pageHandling) + "</page>\n"); + bw.write("<firstparagraph>" + XMLCIT(pageHandling) + "</firstparagraph>\n"); + for (String key : sectionHandling.keySet()) { + bw.write("<section name=\"" + key + "\">\n"); + bw.write(XMLSIT(sectionHandling.get(key))); + bw.write("</section>\n"); + } + bw.write("<SelectiveAccessHandlerConfig>\n"); + + bw.close(); + } catch (IOException e) { + System.err.println(e); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java index eac3d0f3..4e0932b3 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -30,62 +30,61 @@ */ public abstract class AbstractRevisionService { - private static final Logger logger = LoggerFactory.getLogger(AbstractRevisionService.class); + private static final Logger logger = LoggerFactory.getLogger(AbstractRevisionService.class); - /** Reference to database connection */ - protected Connection connection; + /** + * Reference to database connection + */ + protected Connection connection; - /** Reference to the configuration parameters */ - protected RevisionAPIConfiguration config; + /** + * Reference to the configuration parameters + */ + protected RevisionAPIConfiguration config; - /** - * Helper method to obtain a connection via the given {@link RevisionAPIConfiguration} parameter. - * @param config Must not be {@code null}. - * @return A valid {@link Connection} to the database endpoint. - * @throws WikiApiException Thrown if errors occurred while opening a connection. - */ - protected Connection getConnection(RevisionAPIConfiguration config) throws WikiApiException - { - Connection c; - try { + /** + * Helper method to obtain a connection via the given {@link RevisionAPIConfiguration} parameter. + * + * @param config Must not be {@code null}. + * @return A valid {@link Connection} to the database endpoint. + * @throws WikiApiException Thrown if errors occurred while opening a connection. 
+ */ + protected Connection getConnection(RevisionAPIConfiguration config) throws WikiApiException { + Connection c; + try { - String driverDB = config.getDatabaseDriver(); - Class.forName(driverDB); + String driverDB = config.getDatabaseDriver(); + Class.forName(driverDB); - c = DriverManager.getConnection(config.getJdbcURL(), config.getUser(), config.getPassword()); - if (!c.isValid(5)) { - throw new WikiApiException("Connection could not be established."); - } - } - catch (SQLException | ClassNotFoundException e) { - throw new WikiApiException(e); - } - - return c; + c = DriverManager.getConnection(config.getJdbcURL(), config.getUser(), config.getPassword()); + if (!c.isValid(5)) { + throw new WikiApiException("Connection could not be established."); + } + } catch (SQLException | ClassNotFoundException e) { + throw new WikiApiException(e); } - /** - * This method closes any open {@link Connection connections} to the database. - * - * @throws SQLException - * if an error occurs while closing the connection - */ - public final void close() throws SQLException - { - if (this.connection != null) { - this.connection.close(); - } + return c; + } + + /** + * This method closes any open {@link Connection connections} to the database. + * + * @throws SQLException if an error occurs while closing the connection + */ + public final void close() throws SQLException { + if (this.connection != null) { + this.connection.close(); } + } - protected void reconnect() throws SQLException - { - close(); - try { - this.connection = getConnection(config); - } - catch (WikiApiException e) { - close(); - logger.error("Could not reconnect. Closing connection...", e); - } + protected void reconnect() throws SQLException { + close(); + try { + this.connection = getConnection(config); + } catch (WikiApiException e) { + close(); + logger.error("Could not reconnect. 
Closing connection...", e); } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/ChronoRevisionIterator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/ChronoRevisionIterator.java index 79ff1ff0..ed27d367 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/ChronoRevisionIterator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/ChronoRevisionIterator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,165 +31,176 @@ * This class represents the iteration in chronological order. 
*/ public class ChronoRevisionIterator - implements RevisionIteratorInterface -{ - - /** Reference to the configuration parameters */ - private final RevisionAPIConfiguration config; - - /** Reference to the database connection */ - private final Connection connection; - - /** Reference to the currently used result set */ - private ResultSet resultArticles; - - /** Number of revisions of the current read article */ - private int maxRevision; - - /** Reference to the Revision Iterator */ - private RevisionIterator revisionIterator; - - /** Reference to the ChronoIterator */ - private ChronoIterator chronoIterator; - - /** Retrieval mode */ - private int modus; - - /** Retrieval mode id - undefined */ - private final static int INIT = 0; - - /** Retrieval mode id - article is in chronological order */ - private final static int ITERATE_WITHOUT_MAPPING = 2; + implements RevisionIteratorInterface { + + /** + * Reference to the configuration parameters + */ + private final RevisionAPIConfiguration config; + + /** + * Reference to the database connection + */ + private final Connection connection; + + /** + * Reference to the currently used result set + */ + private ResultSet resultArticles; + + /** + * Number of revisions of the current read article + */ + private int maxRevision; + + /** + * Reference to the Revision Iterator + */ + private RevisionIterator revisionIterator; + + /** + * Reference to the ChronoIterator + */ + private ChronoIterator chronoIterator; + + /** + * Retrieval mode + */ + private int modus; + + /** + * Retrieval mode id - undefined + */ + private final static int INIT = 0; + + /** + * Retrieval mode id - article is in chronological order + */ + private final static int ITERATE_WITHOUT_MAPPING = 2; + + /** + * Retrieval mode id - article is not in chronological order + */ + private final static int ITERATE_WITH_MAPPING = 1; + + /** + * ID of the current article (Should be 0 to enable an iteration over all + * article) + */ + private int 
currentArticleID; + + /** + * ID of the last article to retrieve + */ + private int lastArticleID; + + /** + * Parameter - buffer size + */ + private final int MAX_NUMBER_RESULTS; + + /** + * (Constructor) Creates a new ChronoRevisionIterator + * + * @param config Reference to the configuration parameters + * @throws WikiApiException if an error occurs + */ + public ChronoRevisionIterator(final RevisionAPIConfiguration config) + throws WikiApiException { + + this.config = config; + try { + this.MAX_NUMBER_RESULTS = config.getBufferSize(); + + this.resultArticles = null; + this.currentArticleID = 0; + this.lastArticleID = -1; + + reset(); + + String driverDB = "com.mysql.jdbc.Driver"; + Class.forName(driverDB); + + this.connection = DriverManager.getConnection("jdbc:mysql://" + + config.getHost() + "/" + config.getDatabase(), + config.getUser(), config.getPassword()); + + } catch (SQLException | ClassNotFoundException e) { + throw new WikiApiException(e); + } + } - /** Retrieval mode id - article is not in chronological order */ - private final static int ITERATE_WITH_MAPPING = 1; + /** + * (Constructor) Creates a new ChronoRevisionIterator + * + * @param config Reference to the configuration parameters + * @throws WikiApiException if an error occurs + */ + public ChronoRevisionIterator(final RevisionAPIConfiguration config, + final int firstArticleID, final int lastArticleID) + throws WikiApiException { - /** - * ID of the current article (Should be 0 to enable an iteration over all - * article) - */ - private int currentArticleID; + this(config); - /** ID of the last article to retrieve */ - private int lastArticleID; + this.currentArticleID = firstArticleID - 1; + this.lastArticleID = lastArticleID; + } - /** Parameter - buffer size */ - private final int MAX_NUMBER_RESULTS; + /** + * Retrieves the next articles from the article index. 
+ * + * @return whether the query contains results or not + * @throws SQLException if an error occurs while executing the query + */ + private boolean queryArticle() + throws SQLException { - /** - * (Constructor) Creates a new ChronoRevisionIterator - * - * @param config - * Reference to the configuration parameters - * @throws WikiApiException - * if an error occurs - */ - public ChronoRevisionIterator(final RevisionAPIConfiguration config) - throws WikiApiException - { + Statement statement = this.connection.createStatement(); - this.config = config; - try { - this.MAX_NUMBER_RESULTS = config.getBufferSize(); + String query = "SELECT ArticleID, FullRevisionPKs, RevisionCounter " + + "FROM index_articleID_rc_ts " + "WHERE articleID > " + + this.currentArticleID + " LIMIT " + MAX_NUMBER_RESULTS; - this.resultArticles = null; - this.currentArticleID = 0; - this.lastArticleID = -1; + resultArticles = statement.executeQuery(query); - reset(); + if (resultArticles.next()) { - String driverDB = "com.mysql.jdbc.Driver"; - Class.forName(driverDB); + this.currentArticleID = resultArticles.getInt(1); + return (this.lastArticleID == -1) + || (this.currentArticleID <= this.lastArticleID); + } - this.connection = DriverManager.getConnection("jdbc:mysql://" - + config.getHost() + "/" + config.getDatabase(), - config.getUser(), config.getPassword()); + return false; + } - } - catch (SQLException | ClassNotFoundException e) { - throw new WikiApiException(e); - } + /** + * Resets the modus to INIT. 
+ */ + private void reset() { + this.modus = INIT; } - /** - * (Constructor) Creates a new ChronoRevisionIterator - * - * @param config - * Reference to the configuration parameters - * @throws WikiApiException - * if an error occurs - */ - public ChronoRevisionIterator(final RevisionAPIConfiguration config, - final int firstArticleID, final int lastArticleID) - throws WikiApiException - { - - this(config); - - this.currentArticleID = firstArticleID - 1; - this.lastArticleID = lastArticleID; - } - - /** - * Retrieves the next articles from the article index. - * - * @return whether the query contains results or not - * @throws SQLException - * if an error occurs while executing the query - */ - private boolean queryArticle() - throws SQLException - { - - Statement statement = this.connection.createStatement(); - - String query = "SELECT ArticleID, FullRevisionPKs, RevisionCounter " - + "FROM index_articleID_rc_ts " + "WHERE articleID > " - + this.currentArticleID + " LIMIT " + MAX_NUMBER_RESULTS; - - resultArticles = statement.executeQuery(query); - - if (resultArticles.next()) { - - this.currentArticleID = resultArticles.getInt(1); - return (this.lastArticleID == -1) - || (this.currentArticleID <= this.lastArticleID); - } - - return false; - } - - /** - * Resets the modus to INIT. - */ - private void reset() - { - this.modus = INIT; - } - - /** - * Initiates the iteration over of a new article. 
- * - * @return First Revision - * @throws WikiApiException - * if an error occurs - */ - private Revision init() - throws WikiApiException - { - - try { - currentArticleID = resultArticles.getInt(1); - String fullRevisionPKs = resultArticles.getString(2); - String revisionCounters = resultArticles.getString(3); - - int index = revisionCounters.lastIndexOf(' '); - if (index == -1) { - throw new RuntimeException("Invalid revisioncounter content"); - } - - this.maxRevision = Integer.parseInt(revisionCounters.substring( - index + 1, revisionCounters.length())); + /** + * Initiates the iteration over of a new article. + * + * @return First Revision + * @throws WikiApiException if an error occurs + */ + private Revision init() + throws WikiApiException { + + try { + currentArticleID = resultArticles.getInt(1); + String fullRevisionPKs = resultArticles.getString(2); + String revisionCounters = resultArticles.getString(3); + + int index = revisionCounters.lastIndexOf(' '); + if (index == -1) { + throw new RuntimeException("Invalid revisioncounter content"); + } + + this.maxRevision = Integer.parseInt(revisionCounters.substring( + index + 1, revisionCounters.length())); try (Statement statement = this.connection.createStatement(); ResultSet result = statement.executeQuery("SELECT Mapping " + "FROM index_chronological " + "WHERE ArticleID=" @@ -243,171 +254,161 @@ private Revision init() } } - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns the next revision. 
- * - * @return Revision - */ - public Revision next() - { - try { - switch (modus) { - case INIT: - return init(); - - case ITERATE_WITH_MAPPING: - return chronoIterator.next(); - - // revisionEncoder.getRevision(currentArticleID, revisionIndex); - - case ITERATE_WITHOUT_MAPPING: - return revisionIterator.next(); - - default: - throw new RuntimeException("Illegal mode"); - } - } - catch (Exception e) { - throw new RuntimeException(e); - } - } - - /** - * Returns whether another revision is available or not. - * - * @return TRUE or FALSE - */ - public boolean hasNext() - { - - try { - switch (modus) { - case INIT: - return queryArticle(); - - case ITERATE_WITH_MAPPING: - if (chronoIterator.hasNext()) { - return true; - } - - reset(); - - if (resultArticles.next()) { - - this.currentArticleID = resultArticles.getInt(1); - return (this.lastArticleID == -1) - || (this.currentArticleID <= this.lastArticleID); - } - - resultArticles.close(); - return queryArticle(); - - case ITERATE_WITHOUT_MAPPING: - - if (revisionIterator.hasNext()) { - return true; - } - - reset(); - - if (resultArticles.next()) { - - this.currentArticleID = resultArticles.getInt(1); - return (this.lastArticleID == -1) - || (this.currentArticleID <= this.lastArticleID); - } - - resultArticles.close(); - return queryArticle(); - - default: - throw new RuntimeException("Illegal mode"); - } - - } - catch (SQLException e) { - throw new RuntimeException(e); - } - } - - /** - * This method is unsupported. - * - * @deprecated - * @throws UnsupportedOperationException - */ - @Deprecated - public void remove() - { - throw new UnsupportedOperationException(); - } - - /** - * This method closes the connection to the input component. - * - * @throws SQLException - * if an error occurs while closing the connection to the - * database. 
- */ - public void close() - throws SQLException - { - if (this.connection != null) { - this.connection.close(); - } - } - - public static void main(final String[] args) - throws Exception - { - - RevisionAPIConfiguration config = new RevisionAPIConfiguration(); - - config.setHost("localhost"); - config.setDatabase("en_wiki"); - config.setUser("root"); - config.setPassword("1234"); - - config.setCharacterSet("UTF-8"); - config.setBufferSize(10000); - config.setMaxAllowedPacket(1024 * 1023); - config.setChronoStorageSpace(400 * 1024 * 1024); - - long count = 1; - long last = 0, now, start = System.currentTimeMillis(); - - Revision rev; - ChronoRevisionIterator it = new ChronoRevisionIterator(config); - - System.out.println(Time.toClock(System.currentTimeMillis() - start)); - - while (it.hasNext()) { - rev = it.next(); - - if (count++ % 1000 == 0) { - - now = System.currentTimeMillis() - start; - if (it.chronoIterator != null) { - System.out.println(it.chronoIterator.getStorageSize()); - } - if (rev != null) { - System.out.println(rev); - } - System.out.println(Time.toClock(now) + "\t" + (now - last) - + "\tREBUILDING " + count); - last = now; - } - } - - System.out.println(Time.toClock(System.currentTimeMillis() - start)); - } + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); + } + } + + /** + * Returns the next revision. + * + * @return Revision + */ + public Revision next() { + try { + switch (modus) { + case INIT: + return init(); + + case ITERATE_WITH_MAPPING: + return chronoIterator.next(); + + // revisionEncoder.getRevision(currentArticleID, revisionIndex); + + case ITERATE_WITHOUT_MAPPING: + return revisionIterator.next(); + + default: + throw new RuntimeException("Illegal mode"); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Returns whether another revision is available or not. 
+ * + * @return TRUE or FALSE + */ + public boolean hasNext() { + + try { + switch (modus) { + case INIT: + return queryArticle(); + + case ITERATE_WITH_MAPPING: + if (chronoIterator.hasNext()) { + return true; + } + + reset(); + + if (resultArticles.next()) { + + this.currentArticleID = resultArticles.getInt(1); + return (this.lastArticleID == -1) + || (this.currentArticleID <= this.lastArticleID); + } + + resultArticles.close(); + return queryArticle(); + + case ITERATE_WITHOUT_MAPPING: + + if (revisionIterator.hasNext()) { + return true; + } + + reset(); + + if (resultArticles.next()) { + + this.currentArticleID = resultArticles.getInt(1); + return (this.lastArticleID == -1) + || (this.currentArticleID <= this.lastArticleID); + } + + resultArticles.close(); + return queryArticle(); + + default: + throw new RuntimeException("Illegal mode"); + } + + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + /** + * This method is unsupported. + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + public void remove() { + throw new UnsupportedOperationException(); + } + + /** + * This method closes the connection to the input component. + * + * @throws SQLException if an error occurs while closing the connection to the + * database. 
+ */ + public void close() + throws SQLException { + if (this.connection != null) { + this.connection.close(); + } + } + + public static void main(final String[] args) + throws Exception { + + RevisionAPIConfiguration config = new RevisionAPIConfiguration(); + + config.setHost("localhost"); + config.setDatabase("en_wiki"); + config.setUser("root"); + config.setPassword("1234"); + + config.setCharacterSet("UTF-8"); + config.setBufferSize(10000); + config.setMaxAllowedPacket(1024 * 1023); + config.setChronoStorageSpace(400 * 1024 * 1024); + + long count = 1; + long last = 0, now, start = System.currentTimeMillis(); + + Revision rev; + ChronoRevisionIterator it = new ChronoRevisionIterator(config); + + System.out.println(Time.toClock(System.currentTimeMillis() - start)); + + while (it.hasNext()) { + rev = it.next(); + + if (count++ % 1000 == 0) { + + now = System.currentTimeMillis() - start; + if (it.chronoIterator != null) { + System.out.println(it.chronoIterator.getStorageSize()); + } + if (rev != null) { + System.out.println(rev); + } + System.out.println(Time.toClock(now) + "\t" + (now - last) + + "\tREBUILDING " + count); + last = now; + } + } + + System.out.println(Time.toClock(System.currentTimeMillis() - start)); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Contributor.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Contributor.java index 2baf0d51..fc74d336 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Contributor.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Contributor.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,50 +22,48 @@ /** * Provides basic user/contributor information in a single object */ -public class Contributor -{ - private String name; - private Integer id; - private List<String> groups; +public class Contributor { + private String name; + private Integer id; + private List<String> groups; - public Contributor(String name){ - this.name=name; - } + public Contributor(String name) { + this.name = name; + } - public Contributor(String name, Integer id){ - this.name=name; - this.id=id; - } + public Contributor(String name, Integer id) { + this.name = name; + this.id = id; + } - public Contributor(String name, Integer id, List<String> groups){ - this.name=name; - this.id=id; - this.groups=groups; - } + public Contributor(String name, Integer id, List<String> groups) { + this.name = name; + this.id = id; + this.groups = groups; + } - public String getName() - { - return name; - } - public void setName(String aName) - { - name = aName; - } - public Integer getId() - { - return id; - } - public void setId(Integer aId) - { - id = aId; - } - public List<String> getGroups() - { - return groups; - } - public void setGroups(List<String> groups) - { - this.groups = groups; - } + public String getName() { + return name; + } + + public void setName(String aName) { + name = aName; + } + + public Integer getId() { + return id; + } + + public void setId(Integer aId) { + id = aId; + } + + public List<String> getGroups() { + return groups; + } + + public void setGroups(List<String> groups) { + this.groups = groups; + } } 
diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java index 59b591d6..2cf08d3e 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,451 +29,423 @@ import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.DiffPart; /** - * * This class contains all revision data. * <p> * The revision text is loaded upon first access (lazy loading). * When serializing a Revision, the revisionText will be loaded first. 
- * */ public class Revision - implements ISizeable, Comparable<Revision>, RevisionDataInterface, Serializable -{ - - private static final long serialVersionUID = 7955292965697731279L; - - /** ID of the article */ - private int articleID; - - /** Full Revision ID */ - private int fullRevisionID; - - /** Primary Key */ - private int primaryKey; - - /** Revision counter */ - private final int revisionCounter; - - /** ID of the revision */ - private int revisionId; - - /** Content */ - private String revisionText; - - /** Timestamp */ - private Timestamp timeStamp; - - /** Username of the contributor who created this revision */ - private String contributorName; - - /** Username of the contributor who created this revision */ - private Integer contributorId; - - /** The user comment for this revision */ - private String comment; - - /** Determine whether revision is a minor revision */ - private boolean isMinor = false; - - /** - * Determine whether the contributor was registered. True: contributorName= - * username False: contributorName= IP - */ - private boolean contributorIsRegistered; - - /** Reference to RevisionApi */ - private transient RevisionApi revisionApi; - - // TODO add fields for the revision flags - - /** - * A collection of DiffParts that make up this revision. This can be used to - * get Information about the actions that have been performed to create this - * revision - */ - private Collection<DiffPart> parts; - - /** - * (Constructor) Creates a new Revision object. - * - * @param revisionCounter - * revision counter - */ - public Revision(final int revisionCounter) - { - this.revisionCounter = revisionCounter; - } - - /** - * (Constructor) Creates a new Revision object. 
- * - * @param revisionCounter - * revision counter - * @param revisionApi - * revision API - */ - public Revision(final int revisionCounter, RevisionApi revisionApi) - { - this.revisionCounter = revisionCounter; - this.revisionApi = revisionApi; - } - - /** - * Returns the estimated number of bytes used to encode the contained - * information. - * - * @return estimated size in bytes - */ - @Override - public long byteSize() - { - if (this.revisionText == null) { - return 0; - } - return this.revisionText.length(); - } - - /** - * Returns the ID of the article. - * - * @return article ID - */ - @Override - public int getArticleID() - { - return articleID; - } - - /** - * Returns the full revision ID. - * - * @return full revision ID - */ - public int getFullRevisionID() - { - return this.fullRevisionID; - } - - /** - * Returns the primary key. - * - * @return primary key - */ - public int getPrimaryKey() - { - return primaryKey; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ - @Override - public int compareTo(final Revision r) - { - long value = this.timeStamp.getTime() - r.getTimeStamp().getTime(); - - if (value == 0) { - return this.getRevisionID() - r.getRevisionID(); - } - else if (value > 0) { - return 1; - } - else { - return -1; - } - } - - /** - * Sets the revision api - * - * @param revisionApi - * api to set - * - */ - public void setRevisionApi(RevisionApi revisionApi) - { - this.revisionApi = revisionApi; - } - - /** - * Returns the revision counter. - * - * @return revision counter - */ - @Override - public int getRevisionCounter() - { - return revisionCounter; - } - - /** - * Returns the ID of the revision. - * - * @return revision ID - */ - @Override - public int getRevisionID() - { - return revisionId; - } - - /** - * Returns the textual content of this revision. 
- * - * @return content - */ - public String getRevisionText() - { - if (this.revisionText == null) { - revisionApi.setRevisionTextAndParts(this); - } - return StringEscapeUtils.unescapeHtml4(this.revisionText); - } - - /** - * Returns the timestamp. - * - * @return timestamp - */ - @Override - public Timestamp getTimeStamp() - { - return timeStamp; - } - - /** - * Returns a collection of DiffPart objects that make up this revision - * - * @return a collection of DiffPart object that make up this revision - */ - public Collection<DiffPart> getParts() - { - if (this.parts == null) { - revisionApi.setRevisionTextAndParts(this); - } - return this.parts; - } - - /** - * Sets the ID of the article. - * - * @param articleID - * article ID - */ - public void setArticleID(final int articleID) - { - this.articleID = articleID; - } - - /** - * Set the ID of the full revision. - * - * @param fullRevisionID - * full revision ID - */ - public void setFullRevisionID(final int fullRevisionID) - { - this.fullRevisionID = fullRevisionID; - } - - /** - * Sets the primary key. - * - * @param primaryKey - * primary key - */ - public void setPrimaryKey(final int primaryKey) - { - this.primaryKey = primaryKey; - } - - /** - * Sets the ID of the revision. - * - * @param revisionId - * revision ID - */ - public void setRevisionID(final int revisionId) - { - this.revisionId = revisionId; - } - - /** - * Sets the revision text. - * - * @param revisionText - * content - */ - public void setRevisionText(final String revisionText) - { - this.revisionText = revisionText; - } - - /** - * Sets the timestamp information. - * <p> - * The input is expected to be the wikipedia version of the timestamp as - * String (YYYY-MM-DDThh-mm-ssZ). T and Z will be replaced with spaces. 
- * - * @param timeStamp - * timestamp (wikipedia version) - */ - public void setTimeStamp(final String timeStamp) - { - - String time = timeStamp.replace('T', ' '); - time = time.replace('Z', ' '); - - this.timeStamp = Timestamp.valueOf(time); - } - - /** - * Sets the timestamp information. - * - * @param timeStamp - * timestamp - */ - public void setTimeStamp(final Timestamp timeStamp) - { - - this.timeStamp = timeStamp; - } - - /** - * Sets the collection of DiffPart objects that make up this revision - * - * @param parts - * a collection of DiffPart object that make up this revision - */ - public void setParts(Collection<DiffPart> parts) - { - this.parts = parts; - } - - /** - * Returns the string representation of this object. - * - * @return (ArticleID, RevisionCounter, Timestamp, RevisionID, TextLength) - */ - @Override - public String toString() - { - - StringBuilder sRep = new StringBuilder(); - sRep.append('('); - sRep.append(articleID); - sRep.append(", "); - sRep.append(revisionCounter); - sRep.append(", "); - sRep.append(timeStamp); - sRep.append(", "); - sRep.append(revisionId); - - if (revisionText != null) { - sRep.append(", "); - sRep.append(revisionText.length()); - } - sRep.append(')'); - - return sRep.toString(); - } - - /** - * Sets the user comment for this revision - * - * @param comment - * the user comment for this revision - */ - public void setComment(String comment) - { - this.comment = comment; - } - - /** - * Returns the user comment for this revision - * - * - * @return the user comment for this revision - */ - @Override - public String getComment() - { - return comment; - } - - public void setMinor(boolean isMinor) - { - this.isMinor = isMinor; - } - - @Override - public boolean isMinor() - { - return isMinor; - } - - public void setContributorName(String contributorName) - { - this.contributorName = contributorName; - } - - @Override - public String getContributorName() - { - return contributorName; - } - - public void 
setContributorIsRegistered(boolean contributorIsRegistered) - { - this.contributorIsRegistered = contributorIsRegistered; - } - - @Override - public boolean contributorIsRegistered() - { - return contributorIsRegistered; - } - - public void setContributorId(Integer contributorId) - { - this.contributorId = contributorId; - } - - @Override - public Integer getContributorId() - { - return contributorId; - } - - private void writeObject(ObjectOutputStream out) throws IOException { - //load DiffParts before serializing - getParts(); - //load revision text before serializing - getRevisionText(); - //now we can serialize the object with the default write method - out.defaultWriteObject(); - } - - /* (non-Javadoc) - * @see java.lang.Object#equals(java.lang.Object) - * - * Revisions are equal if their ids are equal - */ - @Override - public boolean equals(Object anObject) { - - if(!(anObject instanceof Revision)){ - return false; - }else{ - Revision otherRev = (Revision)anObject; - if (this.getRevisionID()==otherRev.getRevisionID()) { - return true; - }else{ - return false; - } - } + implements ISizeable, Comparable<Revision>, RevisionDataInterface, Serializable { + + private static final long serialVersionUID = 7955292965697731279L; + + /** + * ID of the article + */ + private int articleID; + + /** + * Full Revision ID + */ + private int fullRevisionID; + + /** + * Primary Key + */ + private int primaryKey; + + /** + * Revision counter + */ + private final int revisionCounter; + + /** + * ID of the revision + */ + private int revisionId; + + /** + * Content + */ + private String revisionText; + + /** + * Timestamp + */ + private Timestamp timeStamp; + + /** + * Username of the contributor who created this revision + */ + private String contributorName; + + /** + * Username of the contributor who created this revision + */ + private Integer contributorId; + + /** + * The user comment for this revision + */ + private String comment; + + /** + * Determine whether revision 
is a minor revision + */ + private boolean isMinor = false; + + /** + * Determine whether the contributor was registered. True: contributorName= + * username False: contributorName= IP + */ + private boolean contributorIsRegistered; + + /** + * Reference to RevisionApi + */ + private transient RevisionApi revisionApi; + + // TODO add fields for the revision flags + + /** + * A collection of DiffParts that make up this revision. This can be used to + * get Information about the actions that have been performed to create this + * revision + */ + private Collection<DiffPart> parts; + + /** + * (Constructor) Creates a new Revision object. + * + * @param revisionCounter revision counter + */ + public Revision(final int revisionCounter) { + this.revisionCounter = revisionCounter; + } + + /** + * (Constructor) Creates a new Revision object. + * + * @param revisionCounter revision counter + * @param revisionApi revision API + */ + public Revision(final int revisionCounter, RevisionApi revisionApi) { + this.revisionCounter = revisionCounter; + this.revisionApi = revisionApi; + } + + /** + * Returns the estimated number of bytes used to encode the contained + * information. + * + * @return estimated size in bytes + */ + @Override + public long byteSize() { + if (this.revisionText == null) { + return 0; + } + return this.revisionText.length(); + } + + /** + * Returns the ID of the article. + * + * @return article ID + */ + @Override + public int getArticleID() { + return articleID; + } + + /** + * Returns the full revision ID. + * + * @return full revision ID + */ + public int getFullRevisionID() { + return this.fullRevisionID; + } + + /** + * Returns the primary key. 
+ * + * @return primary key + */ + public int getPrimaryKey() { + return primaryKey; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(final Revision r) { + long value = this.timeStamp.getTime() - r.getTimeStamp().getTime(); + + if (value == 0) { + return this.getRevisionID() - r.getRevisionID(); + } else if (value > 0) { + return 1; + } else { + return -1; + } + } + + /** + * Sets the revision api + * + * @param revisionApi api to set + */ + public void setRevisionApi(RevisionApi revisionApi) { + this.revisionApi = revisionApi; + } + + /** + * Returns the revision counter. + * + * @return revision counter + */ + @Override + public int getRevisionCounter() { + return revisionCounter; + } + + /** + * Returns the ID of the revision. + * + * @return revision ID + */ + @Override + public int getRevisionID() { + return revisionId; + } + + /** + * Returns the textual content of this revision. + * + * @return content + */ + public String getRevisionText() { + if (this.revisionText == null) { + revisionApi.setRevisionTextAndParts(this); + } + return StringEscapeUtils.unescapeHtml4(this.revisionText); + } + + /** + * Returns the timestamp. + * + * @return timestamp + */ + @Override + public Timestamp getTimeStamp() { + return timeStamp; + } + + /** + * Returns a collection of DiffPart objects that make up this revision + * + * @return a collection of DiffPart object that make up this revision + */ + public Collection<DiffPart> getParts() { + if (this.parts == null) { + revisionApi.setRevisionTextAndParts(this); + } + return this.parts; + } + + /** + * Sets the ID of the article. + * + * @param articleID article ID + */ + public void setArticleID(final int articleID) { + this.articleID = articleID; + } + + /** + * Set the ID of the full revision. 
+ * + * @param fullRevisionID full revision ID + */ + public void setFullRevisionID(final int fullRevisionID) { + this.fullRevisionID = fullRevisionID; + } + + /** + * Sets the primary key. + * + * @param primaryKey primary key + */ + public void setPrimaryKey(final int primaryKey) { + this.primaryKey = primaryKey; + } + + /** + * Sets the ID of the revision. + * + * @param revisionId revision ID + */ + public void setRevisionID(final int revisionId) { + this.revisionId = revisionId; + } + + /** + * Sets the revision text. + * + * @param revisionText content + */ + public void setRevisionText(final String revisionText) { + this.revisionText = revisionText; + } + + /** + * Sets the timestamp information. + * <p> + * The input is expected to be the wikipedia version of the timestamp as + * String (YYYY-MM-DDThh-mm-ssZ). T and Z will be replaced with spaces. + * + * @param timeStamp timestamp (wikipedia version) + */ + public void setTimeStamp(final String timeStamp) { + + String time = timeStamp.replace('T', ' '); + time = time.replace('Z', ' '); + + this.timeStamp = Timestamp.valueOf(time); + } + + /** + * Sets the timestamp information. + * + * @param timeStamp timestamp + */ + public void setTimeStamp(final Timestamp timeStamp) { + + this.timeStamp = timeStamp; + } + + /** + * Sets the collection of DiffPart objects that make up this revision + * + * @param parts a collection of DiffPart object that make up this revision + */ + public void setParts(Collection<DiffPart> parts) { + this.parts = parts; + } + + /** + * Returns the string representation of this object. 
+ * + * @return (ArticleID, RevisionCounter, Timestamp, RevisionID, TextLength) + */ + @Override + public String toString() { + + StringBuilder sRep = new StringBuilder(); + sRep.append('('); + sRep.append(articleID); + sRep.append(", "); + sRep.append(revisionCounter); + sRep.append(", "); + sRep.append(timeStamp); + sRep.append(", "); + sRep.append(revisionId); + + if (revisionText != null) { + sRep.append(", "); + sRep.append(revisionText.length()); + } + sRep.append(')'); + + return sRep.toString(); + } + + /** + * Sets the user comment for this revision + * + * @param comment the user comment for this revision + */ + public void setComment(String comment) { + this.comment = comment; + } + + /** + * Returns the user comment for this revision + * + * @return the user comment for this revision + */ + @Override + public String getComment() { + return comment; + } + + public void setMinor(boolean isMinor) { + this.isMinor = isMinor; + } + + @Override + public boolean isMinor() { + return isMinor; + } + + public void setContributorName(String contributorName) { + this.contributorName = contributorName; + } + + @Override + public String getContributorName() { + return contributorName; + } + + public void setContributorIsRegistered(boolean contributorIsRegistered) { + this.contributorIsRegistered = contributorIsRegistered; + } + + @Override + public boolean contributorIsRegistered() { + return contributorIsRegistered; + } + + public void setContributorId(Integer contributorId) { + this.contributorId = contributorId; + } + + @Override + public Integer getContributorId() { + return contributorId; + } + + private void writeObject(ObjectOutputStream out) throws IOException { + //load DiffParts before serializing + getParts(); + //load revision text before serializing + getRevisionText(); + //now we can serialize the object with the default write method + out.defaultWriteObject(); + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + * + * Revisions 
are equal if their ids are equal + */ + @Override + public boolean equals(Object anObject) { + + if (!(anObject instanceof Revision)) { + return false; + } else { + Revision otherRev = (Revision) anObject; + if (this.getRevisionID() == otherRev.getRevisionID()) { + return true; + } else { + return false; + } } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionAPIConfiguration.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionAPIConfiguration.java index 1ae32abe..8a4370b4 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionAPIConfiguration.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionAPIConfiguration.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,204 +23,192 @@ /** * This class contains the additional parameters for the {@link RevisionApi}. 
*/ -public class RevisionAPIConfiguration extends DatabaseConfiguration -{ - - /** Number of maximum size of an result set */ - private int bufferSize; - - /** Character encoding */ - private String characterSet; - - /** Memory size for the storage of revisions for the chonological iteration */ - private long chronoStorageSpace; - - /** - * MAX_ALLOWED_PACKET - parameter of the MySQL Server This value indicates - * the maximum size of an sql query. - */ - private long maxAllowedPacket; - - /** Path for the IndexGenerator output */ - private String outputPath; - - /** Type of the IndexGenerator output */ - private OutputTypes outputType; - - /** - * <p>(Constructor) Creates the default configuration.</p> - * OutputType: UNCOMPRESSED (revisionIndex.sql)<br> - * - */ - public RevisionAPIConfiguration() - { - - super(); - this.setHost("localhost"); - - characterSet = "UTF-8"; - maxAllowedPacket = 1024 * 1023; - bufferSize = 10000; - - chronoStorageSpace = 100 * 1024 * 1024; - - outputPath = "revisionIndex.sql"; - outputType = OutputTypes.SQL; - } - - /** - * <p>Creates a (default) RevisionAPIConfiguration from an existing - * DatabaseConfiguration.</p> - * - * OutputType: DATABASE<br> - */ - public RevisionAPIConfiguration(DatabaseConfiguration existingWikiConfig) - { - - super(); - - characterSet = "UTF-8"; - maxAllowedPacket = 1024 * 1023; - bufferSize = 10000; - - chronoStorageSpace = 100 * 1024 * 1024; - - outputType = OutputTypes.DATABASE; - - setHost(existingWikiConfig.getHost()); - setDatabase(existingWikiConfig.getDatabase()); - setDatabaseDriver(existingWikiConfig.getDatabaseDriver()); - setJdbcURL(existingWikiConfig.getJdbcURL()); - setUser(existingWikiConfig.getUser()); - setPassword(existingWikiConfig.getPassword()); - setLanguage(existingWikiConfig.getLanguage()); - - } - - - /** - * Returns the maximum size of a result set. 
- * - * @return maximum size of a result set - */ - public int getBufferSize() - { - return bufferSize; - } - - /** - * Returns the character encoding. - * - * @return character encoding - */ - public String getCharacterSet() - { - return characterSet; - } - - /** - * Returns the memory size used for the purpose of storing revisions. - * - * @return memory size - */ - public long getChronoStorageSpace() - { - return this.chronoStorageSpace; - } - - /** - * Returns the value of MAX_ALLOWED_PACKET parameter. - * - * @return MAX_ALLOWED_PACKET - */ - public long getMaxAllowedPacket() - { - return maxAllowedPacket; - } - - /** - * Returns the output path of the index generator. - * - * @return output path - */ - public String getOutputPath() - { - return outputPath; - } - - /** - * Returns the output type of the index generator. - * - * @return output type - */ - public OutputTypes getOutputType() - { - return outputType; - } - - /** - * Sets the maximum size of a result set. - * - * @param bufferSize - * maximum size of a result set - */ - public void setBufferSize(final int bufferSize) - { - this.bufferSize = bufferSize; - } - - /** - * Sets the character encoding. - * - * @param characterSet - * character encoding - */ - public void setCharacterSet(final String characterSet) - { - this.characterSet = characterSet; - } - - /** - * Set the memory size used for the purpose of storing revisions. - * - * @param chronoStorageSpace - * memory size result - */ - public void setChronoStorageSpace(final long chronoStorageSpace) - { - this.chronoStorageSpace = chronoStorageSpace; - } - - /** - * Sets the value of MAX_ALLOWED_PACKET parameter. - * - * @param maxAllowedPacket - * MAX_ALLOWED_PACKET - */ - public void setMaxAllowedPacket(final long maxAllowedPacket) - { - this.maxAllowedPacket = maxAllowedPacket; - } - - /** - * Sets the output path of the index generator. 
- * - * @param outputPath - * output path - */ - public void setOutputPath(final String outputPath) - { - this.outputPath = outputPath; - } - - /** - * Sets the output type of the index generator. - * - * @param outputType - * output type - */ - public void setOutputType(final OutputTypes outputType) - { - this.outputType = outputType; - } +public class RevisionAPIConfiguration extends DatabaseConfiguration { + + /** + * Number of maximum size of an result set + */ + private int bufferSize; + + /** + * Character encoding + */ + private String characterSet; + + /** + * Memory size for the storage of revisions for the chonological iteration + */ + private long chronoStorageSpace; + + /** + * MAX_ALLOWED_PACKET - parameter of the MySQL Server This value indicates + * the maximum size of an sql query. + */ + private long maxAllowedPacket; + + /** + * Path for the IndexGenerator output + */ + private String outputPath; + + /** + * Type of the IndexGenerator output + */ + private OutputTypes outputType; + + /** + * <p>(Constructor) Creates the default configuration.</p> + * OutputType: UNCOMPRESSED (revisionIndex.sql)<br> + */ + public RevisionAPIConfiguration() { + + super(); + this.setHost("localhost"); + + characterSet = "UTF-8"; + maxAllowedPacket = 1024 * 1023; + bufferSize = 10000; + + chronoStorageSpace = 100 * 1024 * 1024; + + outputPath = "revisionIndex.sql"; + outputType = OutputTypes.SQL; + } + + /** + * <p>Creates a (default) RevisionAPIConfiguration from an existing + * DatabaseConfiguration.</p> + * <p> + * OutputType: DATABASE<br> + */ + public RevisionAPIConfiguration(DatabaseConfiguration existingWikiConfig) { + + super(); + + characterSet = "UTF-8"; + maxAllowedPacket = 1024 * 1023; + bufferSize = 10000; + + chronoStorageSpace = 100 * 1024 * 1024; + + outputType = OutputTypes.DATABASE; + + setHost(existingWikiConfig.getHost()); + setDatabase(existingWikiConfig.getDatabase()); + setDatabaseDriver(existingWikiConfig.getDatabaseDriver()); + 
setJdbcURL(existingWikiConfig.getJdbcURL()); + setUser(existingWikiConfig.getUser()); + setPassword(existingWikiConfig.getPassword()); + setLanguage(existingWikiConfig.getLanguage()); + + } + + + /** + * Returns the maximum size of a result set. + * + * @return maximum size of a result set + */ + public int getBufferSize() { + return bufferSize; + } + + /** + * Returns the character encoding. + * + * @return character encoding + */ + public String getCharacterSet() { + return characterSet; + } + + /** + * Returns the memory size used for the purpose of storing revisions. + * + * @return memory size + */ + public long getChronoStorageSpace() { + return this.chronoStorageSpace; + } + + /** + * Returns the value of MAX_ALLOWED_PACKET parameter. + * + * @return MAX_ALLOWED_PACKET + */ + public long getMaxAllowedPacket() { + return maxAllowedPacket; + } + + /** + * Returns the output path of the index generator. + * + * @return output path + */ + public String getOutputPath() { + return outputPath; + } + + /** + * Returns the output type of the index generator. + * + * @return output type + */ + public OutputTypes getOutputType() { + return outputType; + } + + /** + * Sets the maximum size of a result set. + * + * @param bufferSize maximum size of a result set + */ + public void setBufferSize(final int bufferSize) { + this.bufferSize = bufferSize; + } + + /** + * Sets the character encoding. + * + * @param characterSet character encoding + */ + public void setCharacterSet(final String characterSet) { + this.characterSet = characterSet; + } + + /** + * Set the memory size used for the purpose of storing revisions. + * + * @param chronoStorageSpace memory size result + */ + public void setChronoStorageSpace(final long chronoStorageSpace) { + this.chronoStorageSpace = chronoStorageSpace; + } + + /** + * Sets the value of MAX_ALLOWED_PACKET parameter. 
+ * + * @param maxAllowedPacket MAX_ALLOWED_PACKET + */ + public void setMaxAllowedPacket(final long maxAllowedPacket) { + this.maxAllowedPacket = maxAllowedPacket; + } + + /** + * Sets the output path of the index generator. + * + * @param outputPath output path + */ + public void setOutputPath(final String outputPath) { + this.outputPath = outputPath; + } + + /** + * Sets the output type of the index generator. + * + * @param outputType output type + */ + public void setOutputType(final OutputTypes outputType) { + this.outputType = outputType; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java index bd820ba8..5d4c3f11 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -41,2046 +41,1814 @@ /** * This class can access the database and retrieve single revisions. 
*/ -public class RevisionApi extends AbstractRevisionService -{ - - /** - * Creates a new {@link RevisionApi} object with an existing database connection. - * - * @param config - * Reference to the configuration parameters - * @param connection - * Reference to the database connection - */ - public RevisionApi(final RevisionAPIConfiguration config, final Connection connection) - { - this.config = config; - this.connection = connection; - } +public class RevisionApi extends AbstractRevisionService { + + /** + * Creates a new {@link RevisionApi} object with an existing database connection. + * + * @param config Reference to the configuration parameters + * @param connection Reference to the database connection + */ + public RevisionApi(final RevisionAPIConfiguration config, final Connection connection) { + this.config = config; + this.connection = connection; + } + + /** + * Creates a new {@link RevisionApi} object. + * + * @param config Reference to the configuration parameters + * @throws WikiApiException if an error occurs + */ + public RevisionApi(final RevisionAPIConfiguration config) throws WikiApiException { + this.config = config; + this.connection = getConnection(config); + } + + /** + * Creates a new {@link RevisionApi} object. + * + * @param dbConfig A database configuration object + * @throws WikiApiException if an error occurs + */ + public RevisionApi(final DatabaseConfiguration dbConfig) throws WikiApiException { + RevisionAPIConfiguration config = new RevisionAPIConfiguration(dbConfig); + this.config = config; + this.connection = getConnection(config); + } + + /** + * Retrieves all article ids for articles with a specified range of revisions (incl. redirects, + * disambiguation pages). <br> + * <b>Attention</b>: When called for the first time, this query needs write-access (ALTER and + * UPDATE) to the database and might take a while to process. 
+ * + * @param minNumberRevisions the smallest number of revisions for an article to be selected + * @param maxNumberRevisions the highest number of revisions for an article to be selected (-1 for infinite) + * @return the set of selected article ids (includes redirects and disambiguation pages) + * @throws WikiApiException if an error occurs + */ + public Set<Integer> getArticleIDsWithNumberOfRevisions(final int minNumberRevisions, + int maxNumberRevisions) + throws WikiApiException { + + try { + if (minNumberRevisions < 0) { + throw new IllegalArgumentException("minNumberRevisions needs to be >= 0"); + } - /** - * Creates a new {@link RevisionApi} object. - * - * @param config - * Reference to the configuration parameters - * @throws WikiApiException - * if an error occurs - */ - public RevisionApi(final RevisionAPIConfiguration config) throws WikiApiException - { - this.config = config; - this.connection = getConnection(config); - } + PreparedStatement statement; + + // check whether the field has already been added + statement = this.connection + .prepareStatement("SELECT * FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = '" + + config.getDatabase() + + "' AND TABLE_NAME = 'index_articleID_rc_ts' AND COLUMN_NAME = 'NumberRevisions'"); + if (!statement.executeQuery().next()) { + // create new column + statement = this.connection + .prepareStatement("ALTER TABLE index_articleID_rc_ts ADD NumberRevisions INT(10) unsigned NOT NULL"); + try { + statement.execute(); + } catch (SQLException e) { + throw new WikiApiException( + "To execute this query for the first time, you need to have write permissions for the database."); + } + // fill with information extracted from RevisionCounter field + statement = this.connection + .prepareStatement("UPDATE index_articleID_rc_ts SET NumberRevisions = (SELECT SUBSTRING_INDEX(RevisionCounter,' ',-1))"); + statement.execute(); + } - /** - * Creates a new {@link RevisionApi} object. 
- * - * @param dbConfig - * A database configuration object - * @throws WikiApiException - * if an error occurs - */ - public RevisionApi(final DatabaseConfiguration dbConfig) throws WikiApiException - { - RevisionAPIConfiguration config = new RevisionAPIConfiguration(dbConfig); - this.config = config; - this.connection = getConnection(config); - } + ResultSet result = null; + HashSet<Integer> articles = new HashSet<>(); - /** - * Retrieves all article ids for articles with a specified range of revisions (incl. redirects, - * disambiguation pages). <br> - * <b>Attention</b>: When called for the first time, this query needs write-access (ALTER and - * UPDATE) to the database and might take a while to process. - * - * @param minNumberRevisions - * the smallest number of revisions for an article to be selected - * @param maxNumberRevisions - * the highest number of revisions for an article to be selected (-1 for infinite) - * @return the set of selected article ids (includes redirects and disambiguation pages) - * @throws WikiApiException if an error occurs - */ - public Set<Integer> getArticleIDsWithNumberOfRevisions(final int minNumberRevisions, - int maxNumberRevisions) - throws WikiApiException - { + // make query + try { + if (maxNumberRevisions == -1) { + statement = this.connection + .prepareStatement("SELECT ArticleID FROM index_articleID_rc_ts " + + "WHERE NumberRevisions >= ?"); + statement.setInt(1, minNumberRevisions); + } else { + statement = this.connection.prepareStatement("SELECT ArticleID FROM index_articleID_rc_ts " + + "WHERE NumberRevisions BETWEEN ? 
AND ?"); + statement.setInt(1, minNumberRevisions); + statement.setInt(2, maxNumberRevisions); + } + result = statement.executeQuery(); - try { - if (minNumberRevisions < 0) { - throw new IllegalArgumentException("minNumberRevisions needs to be >= 0"); - } - - PreparedStatement statement; - - // check whether the field has already been added - statement = this.connection - .prepareStatement("SELECT * FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = '" - + config.getDatabase() - + "' AND TABLE_NAME = 'index_articleID_rc_ts' AND COLUMN_NAME = 'NumberRevisions'"); - if (!statement.executeQuery().next()) { - // create new column - statement = this.connection - .prepareStatement("ALTER TABLE index_articleID_rc_ts ADD NumberRevisions INT(10) unsigned NOT NULL"); - try { - statement.execute(); - } - catch (SQLException e) { - throw new WikiApiException( - "To execute this query for the first time, you need to have write permissions for the database."); - } - // fill with information extracted from RevisionCounter field - statement = this.connection - .prepareStatement("UPDATE index_articleID_rc_ts SET NumberRevisions = (SELECT SUBSTRING_INDEX(RevisionCounter,' ',-1))"); - statement.execute(); - } - - ResultSet result = null; - HashSet<Integer> articles = new HashSet<>(); - - // make query - try { - if (maxNumberRevisions == -1) { - statement = this.connection - .prepareStatement("SELECT ArticleID FROM index_articleID_rc_ts " - + "WHERE NumberRevisions >= ?"); - statement.setInt(1, minNumberRevisions); - }else { - statement = this.connection.prepareStatement("SELECT ArticleID FROM index_articleID_rc_ts " - + "WHERE NumberRevisions BETWEEN ? 
AND ?"); - statement.setInt(1, minNumberRevisions); - statement.setInt(2, maxNumberRevisions); - } - result = statement.executeQuery(); - - while (result.next()) { - articles.add(result.getInt(1)); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - return articles; - } - catch (Exception e) { - throw new WikiApiException(e); + while (result.next()) { + articles.add(result.getInt(1)); + } + } finally { + if (statement != null) { + statement.close(); } + if (result != null) { + result.close(); + } + } + return articles; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the PrimaryKey for the first revision of the given article + * + * @param articleID ID of the article + * @return PK of the first revision + * @throws WikiApiException if an error occurs + */ + public int getFirstRevisionPK(final int articleID) + throws WikiApiException { + + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the PrimaryKey for the first revision of the given article - * - * @param articleID - * ID of the article - * @return PK of the first revision - * - * @throws WikiApiException - * if an error occurs - */ - public int getFirstRevisionPK(final int articleID) - throws WikiApiException - { + PreparedStatement statement = null; + ResultSet result = null; + String firstRevPK; - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } + try { + // Retrieve the fullRevisionPK and calculate the limit + statement = this.connection.prepareStatement("SELECT PrimaryKey " + + "FROM revisions " + "WHERE ArticleID=? 
AND RevisionCounter =1 LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); - PreparedStatement statement = null; - ResultSet result = null; - String firstRevPK; + if (result.next()) { - try { - // Retrieve the fullRevisionPK and calculate the limit - statement = this.connection.prepareStatement("SELECT PrimaryKey " - + "FROM revisions " + "WHERE ArticleID=? AND RevisionCounter =1 LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); + firstRevPK = result.getString(1); - if (result.next()) { + } else { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return Integer.parseInt(firstRevPK); + + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); + } + } + + + /** + * Returns the number of revisions for the specified article. + * + * @param articleID ID of the article + * @return number of revisions + * @throws WikiApiException if an error occurs + */ + public int getNumberOfRevisions(final int articleID) + throws WikiApiException { + + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } + + PreparedStatement statement = null; + ResultSet result = null; + String revCounters; - firstRevPK = result.getString(1); + try { + // Retrieve the fullRevisionPK and calculate the limit + statement = this.connection.prepareStatement("SELECT RevisionCounter " + + "FROM index_articleID_rc_ts " + "WHERE ArticleID=? 
LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); - } - else { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + if (result.next()) { - return Integer.parseInt(firstRevPK); + revCounters = result.getString(1); + } else { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); } - catch (WikiApiException e) { - throw e; + } finally { + if (statement != null) { + statement.close(); } - catch (Exception e) { - throw new WikiApiException(e); + if (result != null) { + result.close(); } - } + } + int index = revCounters.lastIndexOf(' '); + if (index == -1) { + throw new WikiApiException("Article data is inconsistent"); + } - /** - * Returns the number of revisions for the specified article. - * - * @param articleID - * ID of the article - * @return number of revisions - * - * @throws WikiApiException - * if an error occurs - */ - public int getNumberOfRevisions(final int articleID) - throws WikiApiException - { + return Integer.parseInt(revCounters.substring(index + 1, revCounters.length())); - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - PreparedStatement statement = null; - ResultSet result = null; - String revCounters; - - try { - // Retrieve the fullRevisionPK and calculate the limit - statement = this.connection.prepareStatement("SELECT RevisionCounter " - + "FROM index_articleID_rc_ts " + "WHERE ArticleID=? 
LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - if (result.next()) { - - revCounters = result.getString(1); - - } - else { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - int index = revCounters.lastIndexOf(' '); - if (index == -1) { - throw new WikiApiException("Article data is inconsistent"); - } - - return Integer.parseInt(revCounters.substring(index + 1, revCounters.length())); - - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); - } + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the timestamps of all revisions that have been made before the given revision. + * + * @param articleID ID of the article + * @return List of revisions by each corresponding {@link Timestamp}. + * @throws WikiApiException if an error occurs + */ + public List<Timestamp> getRevisionTimestampsBetweenTimestamps(int articleID, final Timestamp from, final Timestamp to) + throws WikiApiException { + List<Timestamp> timestamps = new LinkedList<>(); + + try { + PreparedStatement statement = null; + ResultSet result = null; + + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } + + statement = connection + .prepareStatement("SELECT Timestamp FROM revisions WHERE ArticleID=? AND Timestamp >= ? 
AND Timestamp <= ?"); + statement.setInt(1, articleID); + statement.setLong(2, from.getTime()); + statement.setLong(3, to.getTime()); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); + } + while (result.next()) { + timestamps.add(new Timestamp(result.getLong(1))); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - /** - * Returns the timestamps of all revisions that have been made before the given revision. - * - * @param articleID - * ID of the article - * @return List of revisions by each corresponding {@link Timestamp}. - * - * @throws WikiApiException - * if an error occurs - */ - public List<Timestamp> getRevisionTimestampsBetweenTimestamps(int articleID, final Timestamp from, final Timestamp to) - throws WikiApiException - { - List<Timestamp> timestamps = new LinkedList<>(); + return timestamps; - try { - PreparedStatement statement = null; - ResultSet result = null; - - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - statement = connection - .prepareStatement("SELECT Timestamp FROM revisions WHERE ArticleID=? AND Timestamp >= ? 
AND Timestamp <= ?"); - statement.setInt(1, articleID); - statement.setLong(2, from.getTime()); - statement.setLong(3, to.getTime()); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { - timestamps.add(new Timestamp(result.getLong(1))); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return timestamps; - - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); - } + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the timestamps of all revisions that have been made before the given revision. + * + * @param revisionId ID of the revision + * @return List of revisions by each corresponding {@link Timestamp}. + * @throws WikiApiException if an error occurs + */ + public List<Timestamp> getRevisionTimestampsBeforeRevision(final int revisionId) + throws WikiApiException { + List<Timestamp> timestamps = new LinkedList<>(); + + int articleID = getPageIdForRevisionId(revisionId); // TODO do this in the SQL query + Timestamp ts = getRevision(revisionId).getTimeStamp(); // TODO do this in the SQL query + + try { + PreparedStatement statement = null; + ResultSet result = null; + + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } + + statement = connection + .prepareStatement("SELECT Timestamp FROM revisions WHERE ArticleID=? 
AND Timestamp < ?"); + statement.setInt(1, articleID); + statement.setLong(2, ts.getTime()); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); + } + while (result.next()) { + timestamps.add(new Timestamp(result.getLong(1))); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - /** - * Returns the timestamps of all revisions that have been made before the given revision. - * - * @param revisionId - * ID of the revision - * @return List of revisions by each corresponding {@link Timestamp}. - * - * @throws WikiApiException - * if an error occurs - */ - public List<Timestamp> getRevisionTimestampsBeforeRevision(final int revisionId) - throws WikiApiException - { - List<Timestamp> timestamps = new LinkedList<>(); - - int articleID = getPageIdForRevisionId(revisionId); // TODO do this in the SQL query - Timestamp ts = getRevision(revisionId).getTimeStamp(); // TODO do this in the SQL query + return timestamps; - try { - PreparedStatement statement = null; - ResultSet result = null; - - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - statement = connection - .prepareStatement("SELECT Timestamp FROM revisions WHERE ArticleID=? 
AND Timestamp < ?"); - statement.setInt(1, articleID); - statement.setLong(2, ts.getTime()); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { - timestamps.add(new Timestamp(result.getLong(1))); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return timestamps; - - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); - } + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the timestamps of all revisions connected to the specified article. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param articleID ID of the article + * @return collection of timestampf of all revisions + * @throws WikiApiException if an error occurs + */ + public List<Timestamp> getRevisionTimestamps(final int articleID) + throws WikiApiException { + + List<Timestamp> timestamps = new LinkedList<>(); + + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the timestamps of all revisions connected to the specified article. - * - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. 
- * - * @param articleID - * ID of the article - * @return collection of timestampf of all revisions - * - * @throws WikiApiException - * if an error occurs - */ - public List<Timestamp> getRevisionTimestamps(final int articleID) - throws WikiApiException - { - - List<Timestamp> timestamps = new LinkedList<>(); + PreparedStatement statement = null; + ResultSet result = null; - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - PreparedStatement statement = null; - ResultSet result = null; - - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - statement = connection.prepareStatement("SELECT Timestamp " - + "FROM revisions WHERE ArticleID=?"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { - - timestamps.add(new Timestamp(result.getLong(1))); - - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return timestamps; - - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); } - } - /** - * Returns the number of unique contributors to an article based on the people who revised the - * article (revision contributors).<br> - * - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. 
- * - * @param articleID - * ID of the article - * @return the number of unique contributors to the article - * - * @throws WikiApiException - * if an error occurs - */ - public int getNumberOfUniqueContributors(final int articleID) - throws WikiApiException - { - return getNumberOfUniqueContributors(articleID, false); - } + statement = connection.prepareStatement("SELECT Timestamp " + + "FROM revisions WHERE ArticleID=?"); + statement.setInt(1, articleID); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); + } + while (result.next()) { - /** - * Returns the number of unique contributors to an article based on the people who revised the - * article (revision contributors). - * - * It is possible to only count the registered users, if onlyRegistered is set to true - * <br> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. 
- * - * @param articleID - * ID of the article - * @param onlyRegistered - * defines whether to count only registered users {@code true}, or all users (false) - * @return the number of unique contributors to the article - * - * @throws WikiApiException - * if an error occurs - */ - public int getNumberOfUniqueContributors(final int articleID, boolean onlyRegistered) - throws WikiApiException - { + timestamps.add(new Timestamp(result.getLong(1))); - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - int contrCount = 0; - PreparedStatement statement = null; - ResultSet result = null; - - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - StringBuffer sqlString = new StringBuffer(); - sqlString - .append("SELECT COUNT(DISTINCT ContributorName) FROM revisions WHERE ArticleID=?"); - if (onlyRegistered) { - sqlString.append(" AND ContributorIsRegistered=1"); - } - - statement = connection.prepareStatement(sqlString.toString()); - - statement.setInt(1, articleID); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - - if (result.next()) { - contrCount = result.getInt(1); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return contrCount; - - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); } - } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - /** - * Returns the number of unique contributors to an article that have contributed before the - * given revision. 
- * - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param revisionID - * revision before which to count the contributors - * @return the number of unique contributors to the article - * - * @throws WikiApiException - * if an error occurs - */ - public int getNumberOfUniqueContributorsBeforeRevision(final int revisionID) - throws WikiApiException - { - return getNumberOfUniqueContributorsBeforeRevision(revisionID, false); + return timestamps; + + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the number of unique contributors to an article based on the people who revised the + * article (revision contributors).<br> + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param articleID ID of the article + * @return the number of unique contributors to the article + * @throws WikiApiException if an error occurs + */ + public int getNumberOfUniqueContributors(final int articleID) + throws WikiApiException { + return getNumberOfUniqueContributors(articleID, false); + } + + /** + * Returns the number of unique contributors to an article based on the people who revised the + * article (revision contributors). + * <p> + * It is possible to only count the registered users, if onlyRegistered is set to true + * <br> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. 
+ * + * @param articleID ID of the article + * @param onlyRegistered defines whether to count only registered users {@code true}, or all users (false) + * @return the number of unique contributors to the article + * @throws WikiApiException if an error occurs + */ + public int getNumberOfUniqueContributors(final int articleID, boolean onlyRegistered) + throws WikiApiException { + + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the number of unique contributors to an article that have contributed before the - * given revision. - * - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param revisionID - * revision before which to count the contributors - * @param onlyRegistered - * defines whether to count only registered users {@code true}, or all users (false) - * @return the number of unique contributors to the article - * - * @throws WikiApiException - * if an error occurs - */ - public int getNumberOfUniqueContributorsBeforeRevision(final int revisionID, - boolean onlyRegistered) - throws WikiApiException - { + int contrCount = 0; + PreparedStatement statement = null; + ResultSet result = null; - try { - if (revisionID < 1) { - throw new IllegalArgumentException(); - } - - int articleID = getPageIdForRevisionId(revisionID); - Timestamp ts = getRevision(revisionID).getTimeStamp(); - - int contrCount = 0; - PreparedStatement statement = null; - ResultSet result = null; - - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - StringBuffer sqlString = new StringBuffer(); - sqlString - .append("SELECT COUNT(DISTINCT ContributorName) FROM revisions WHERE ArticleID=? 
AND Timestamp<?"); - if (onlyRegistered) { - sqlString.append(" AND ContributorIsRegistered=1"); - } - - statement = connection.prepareStatement(sqlString.toString()); - - statement.setInt(1, articleID); - statement.setLong(2, ts.getTime()); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - - if (result.next()) { - contrCount = result.getInt(1); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return contrCount; - - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); } - } - /** - * Returns a map of usernames mapped to the timestamps of their contributions. - * - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param articleID - * ID of the article - * @return map of Timestamp-DiffPart-Collection pairs - * - * @throws WikiApiException - * if an error occurs - */ - public Map<String, Timestamp> getUserContributionMap(final int articleID) - throws WikiApiException - { - return getUserContributionMap(articleID, null); - } + StringBuffer sqlString = new StringBuffer(); + sqlString + .append("SELECT COUNT(DISTINCT ContributorName) FROM revisions WHERE ArticleID=?"); + if (onlyRegistered) { + sqlString.append(" AND ContributorIsRegistered=1"); + } - /** - * Returns a map of usernames mapped to the timestamps of their contributions. - * - * Users of certain user groups (e.g. bots) can be filtered by providing the unwanted groups in - * the {@code groupFilter}. 
Nothing is filtered if the {@code groupFilter} is {@code null} or empty.<br> - * <br> - * Filtered results also include unregistered users (because they cannot be filtered using user - * groups) In order to get results containing only registered users, use {@link - * #getUserContributionMap(int, String[], boolean)} and set {@code onlyRegistered=true}.<br> - * <br> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param articleID - * ID of the article - * @param groupfilter - * a list of unwanted user groups - * @return map of Timestamp-DiffPart-Collection pairs - * - * @throws WikiApiException - * if an error occurs - */ - public Map<String, Timestamp> getUserContributionMap(final int articleID, String[] groupfilter) - throws WikiApiException - { - return getUserContributionMap(articleID, groupfilter, false); - } + statement = connection.prepareStatement(sqlString.toString()); - /** - * Returns a map of usernames mapped to the timestamps of their contributions. - * <br> - * Users of certain user groups (e.g. bots) can be filtered by providing the unwanted groups in - * the {@code groupFilter}. Nothing is filtered if the {@code groupFilter} is {@code null} or empty.<br> - * <br> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param articleID - * ID of the article - * @param groupfilter - * a list of unwanted user groups - * @param onlyRegistered - * {@code true} if result should only contain registered users. 
{@code false} otherwise - * @return map of Timestamp-DiffPart-Collection pairs - * - * @throws WikiApiException - * if an error occurs - */ - @SuppressWarnings("unused") - public Map<String, Timestamp> getUserContributionMap(final int articleID, String[] groupfilter, - boolean onlyRegistered) - throws WikiApiException - { - - Map<String, Timestamp> authorTSMap = new HashMap<>(); + statement.setInt(1, articleID); + result = statement.executeQuery(); - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - PreparedStatement statement = null; - ResultSet result = null; - - try { - - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - StringBuilder statementStr = new StringBuilder(); - - if (groupfilter == null || groupfilter.length < 1 || !tableExists("user_groups")) { - // create statement WITHOUT filter - statementStr - .append("SELECT ContributorName, Timestamp FROM revisions WHERE ArticleID=?"); - statement = connection.prepareStatement(statementStr.toString()); - statement.setInt(1, articleID); - } - else { - // create statement WITH filter - statementStr - .append("SELECT ContributorName, Timestamp FROM revisions AS rev, user_groups AS ug WHERE ArticleID=?"); - statementStr.append(" AND rev.ContributorId=ug.ug_user"); - for (String element : groupfilter) { - statementStr.append(" AND NOT ug.ug_group=?"); - } - // and combine with results from unregistered users - if (!onlyRegistered) { - statementStr.append(" UNION ( SELECT ContributorName, Timestamp FROM revisions AS rev WHERE ArticleID=? 
AND rev.ContributorId IS NULL)"); - } - - statement = connection.prepareStatement(statementStr.toString()); - // insert article id in prepared statement - statement.setInt(1, articleID); - - // insert filtered groups in prepared statement - int curPrepStatValueIdx = 2; - for (String group : groupfilter) { - statement.setString(curPrepStatValueIdx++, group); - } - if (!onlyRegistered) { - // insert article id for second select in prepared statement - statement.setInt(curPrepStatValueIdx, articleID); - } - - } - - result = statement.executeQuery(); - - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { - // Write data from current revision to Map - authorTSMap.put(result.getString(1), new Timestamp(result.getLong(2))); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return authorTSMap; - - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); } - } - /** - * Returns the group assignments of the specified user - * - * @param userID - * ID of the user (NOT THE USERNAME) - * @return collection of user groups - * - * @throws WikiApiException - * if an error occurs - */ - public List<String> getUserGroups(final int userID) - throws WikiApiException - { - - List<String> groups = new LinkedList<>(); + if (result.next()) { + contrCount = result.getInt(1); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - try { - if (userID < 1) { - throw new IllegalArgumentException(); - } + return contrCount; - if (!tableExists("user_groups")) { - throw new WikiInitializationException( - "User group assignment data is missing. 
Please download user_groups.sql for this Wikipedia from http://dumps.wikimedia.org and import the data into this database."); - } + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); + } + } + + /** + * Returns the number of unique contributors to an article that have contributed before the + * given revision. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param revisionID revision before which to count the contributors + * @return the number of unique contributors to the article + * @throws WikiApiException if an error occurs + */ + public int getNumberOfUniqueContributorsBeforeRevision(final int revisionID) + throws WikiApiException { + return getNumberOfUniqueContributorsBeforeRevision(revisionID, false); + } + + /** + * Returns the number of unique contributors to an article that have contributed before the + * given revision. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. 
+ * + * @param revisionID revision before which to count the contributors + * @param onlyRegistered defines whether to count only registered users {@code true}, or all users (false) + * @return the number of unique contributors to the article + * @throws WikiApiException if an error occurs + */ + public int getNumberOfUniqueContributorsBeforeRevision(final int revisionID, + boolean onlyRegistered) + throws WikiApiException { + + try { + if (revisionID < 1) { + throw new IllegalArgumentException(); + } - PreparedStatement statement = null; - ResultSet result = null; + int articleID = getPageIdForRevisionId(revisionID); + Timestamp ts = getRevision(revisionID).getTimeStamp(); - try { - statement = connection.prepareStatement("SELECT ug_group " - + "FROM user_groups WHERE ug_user=?"); - statement.setInt(1, userID); - result = statement.executeQuery(); + int contrCount = 0; + PreparedStatement statement = null; + ResultSet result = null; - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The user with the ID " + userID - + " was not found."); - } - while (result.next()) { + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } - groups.add(result.getString(1)); + StringBuffer sqlString = new StringBuffer(); + sqlString + .append("SELECT COUNT(DISTINCT ContributorName) FROM revisions WHERE ArticleID=? 
AND Timestamp<?"); + if (onlyRegistered) { + sqlString.append(" AND ContributorIsRegistered=1"); + } - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + statement = connection.prepareStatement(sqlString.toString()); - return groups; + statement.setInt(1, articleID); + statement.setLong(2, ts.getTime()); + result = statement.executeQuery(); + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); } - catch (WikiApiException e) { - throw e; + + if (result.next()) { + contrCount = result.getInt(1); } - catch (Exception e) { - throw new WikiApiException(e); + } finally { + if (statement != null) { + statement.close(); } + if (result != null) { + result.close(); + } + } + + return contrCount; + + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns a map of usernames mapped to the timestamps of their contributions. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param articleID ID of the article + * @return map of Timestamp-DiffPart-Collection pairs + * @throws WikiApiException if an error occurs + */ + public Map<String, Timestamp> getUserContributionMap(final int articleID) + throws WikiApiException { + return getUserContributionMap(articleID, null); + } + + /** + * Returns a map of usernames mapped to the timestamps of their contributions. + * <p> + * Users of certain user groups (e.g. bots) can be filtered by providing the unwanted groups in + * the {@code groupFilter}. 
Nothing is filtered if the {@code groupFilter} is {@code null} or empty.<br> + * <br> + * Filtered results also include unregistered users (because they cannot be filtered using user + * groups) In order to get results containing only registered users, use {@link + * #getUserContributionMap(int, String[], boolean)} and set {@code onlyRegistered=true}.<br> + * <br> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param articleID ID of the article + * @param groupfilter a list of unwanted user groups + * @return map of Timestamp-DiffPart-Collection pairs + * @throws WikiApiException if an error occurs + */ + public Map<String, Timestamp> getUserContributionMap(final int articleID, String[] groupfilter) + throws WikiApiException { + return getUserContributionMap(articleID, groupfilter, false); + } + + /** + * Returns a map of usernames mapped to the timestamps of their contributions. + * <br> + * Users of certain user groups (e.g. bots) can be filtered by providing the unwanted groups in + * the {@code groupFilter}. Nothing is filtered if the {@code groupFilter} is {@code null} or empty.<br> + * <br> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param articleID ID of the article + * @param groupfilter a list of unwanted user groups + * @param onlyRegistered {@code true} if result should only contain registered users. 
{@code false} otherwise + * @return map of Timestamp-DiffPart-Collection pairs + * @throws WikiApiException if an error occurs + */ + @SuppressWarnings("unused") + public Map<String, Timestamp> getUserContributionMap(final int articleID, String[] groupfilter, + boolean onlyRegistered) + throws WikiApiException { + + Map<String, Timestamp> authorTSMap = new HashMap<>(); + + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the revisionIds of all revisions created by given user - * - * @param userid - * id of the user (NOT USER NAME) - * @return Map of revision ids - * - * @throws WikiApiException - * if an error occurs - */ - public Map<Integer, List<Integer>> getUserRevisionIds(int userid) - throws WikiApiException - { - - Map<Integer, List<Integer>> revIds = new HashMap<>(); + PreparedStatement statement = null; + ResultSet result = null; - try { - if (userid < 1) { - throw new IllegalArgumentException(); - } - - if (!indexExists("revisions", "userids")) { - System.err.println("You should create and index for the field ContributorID: create index userids ON revisions(ContributorId(15));"); - } - - PreparedStatement statement = null; - ResultSet result = null; - - try { - statement = connection.prepareStatement("SELECT ArticleID, RevisionID " - + "FROM revisions WHERE ContributorId=?"); - statement.setInt(1, userid); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("No revisions for user " + userid); - } - while (result.next()) { - - int artId = result.getInt(1); - int revId = result.getInt(2); - - if (revIds.containsKey(artId)) { - revIds.get(artId).add(revId); - } - else { - List<Integer> revList = new ArrayList<>(); - revList.add(revId); - revIds.put(artId, revList); - } - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return revIds; - - } - catch (WikiApiException e) { - 
throw e; - } - catch (Exception e) { - throw new WikiApiException(e); + try { + + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); } - } - /** - * Returns the revisionIds of all revisions created by given user - * - * @param username - * name of the user (NOT USER ID) - * @return Map of revision ids - * - * @throws WikiApiException - * if an error occurs - */ - public Map<Integer, List<Integer>> getUserRevisionIds(String username, int limit) - throws WikiApiException - { - - Map<Integer, List<Integer>> revIds = new HashMap<>(); + StringBuilder statementStr = new StringBuilder(); + + if (groupfilter == null || groupfilter.length < 1 || !tableExists("user_groups")) { + // create statement WITHOUT filter + statementStr + .append("SELECT ContributorName, Timestamp FROM revisions WHERE ArticleID=?"); + statement = connection.prepareStatement(statementStr.toString()); + statement.setInt(1, articleID); + } else { + // create statement WITH filter + statementStr + .append("SELECT ContributorName, Timestamp FROM revisions AS rev, user_groups AS ug WHERE ArticleID=?"); + statementStr.append(" AND rev.ContributorId=ug.ug_user"); + for (String element : groupfilter) { + statementStr.append(" AND NOT ug.ug_group=?"); + } + // and combine with results from unregistered users + if (!onlyRegistered) { + statementStr.append(" UNION ( SELECT ContributorName, Timestamp FROM revisions AS rev WHERE ArticleID=? 
AND rev.ContributorId IS NULL)"); + } + + statement = connection.prepareStatement(statementStr.toString()); + // insert article id in prepared statement + statement.setInt(1, articleID); + + // insert filtered groups in prepared statement + int curPrepStatValueIdx = 2; + for (String group : groupfilter) { + statement.setString(curPrepStatValueIdx++, group); + } + if (!onlyRegistered) { + // insert article id for second select in prepared statement + statement.setInt(curPrepStatValueIdx, articleID); + } - try { - if (username == null || username.isEmpty()) { - throw new IllegalArgumentException(); - } - - if (!indexExists("revisions", "usernames")) { - System.err - .println("You should create and index for the field ContributorName: create index usernames ON revisions(ContributorName(50));"); - } - - PreparedStatement statement = null; - ResultSet result = null; - - try { - statement = connection.prepareStatement("SELECT ArticleID, RevisionID " - + "FROM revisions WHERE ContributorName=? LIMIT " + limit); - statement.setString(1, username); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("No revisions for user " + username); - } - while (result.next()) { - - int artId = result.getInt(1); - int revId = result.getInt(2); - - if (revIds.containsKey(artId)) { - revIds.get(artId).add(revId); - } - else { - List<Integer> revList = new ArrayList<>(); - revList.add(revId); - revIds.put(artId, revList); - } - - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return revIds; - - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); } - } - /** - * Returns a map of timestamps mapped on the corresponding DiffPart-Collections. Can be used to - * compile statistics over all changes that have been made in one article. 
- * - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param articleID - * ID of the article - * @return map of Timestamp-DiffPart-Collection pairs - * - * @throws WikiApiException - * if an error occurs - */ - public Map<Timestamp, Collection<DiffPart>> getTimestampToRevisionMap(final int articleID) - throws WikiApiException - { - - Map<Timestamp, Collection<DiffPart>> tsDiffPartsMap = new HashMap<>(); + result = statement.executeQuery(); - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - PreparedStatement statement = null; - ResultSet result = null; - RevisionDecoder decoder = new RevisionDecoder(config.getCharacterSet()); - - try { - - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - statement = connection.prepareStatement("SELECT Timestamp, Revision " - + "FROM revisions WHERE ArticleID=?"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { - - // Decode String and create Diff-Object - boolean binaryData = result.getMetaData().getColumnType(2) == Types.LONGVARBINARY; - if (binaryData) { - decoder.setInput(result.getBinaryStream(2), true); - } - else { - decoder.setInput(result.getString(2)); - } - Diff diff = decoder.decode(); - - // Get DiffParts from Diff Object - Collection<DiffPart> parts = new LinkedList<>(); - Iterator<DiffPart> it = diff.iterator(); - while (it.hasNext()) { - parts.add(it.next()); - } - - // Write data from current revision to Map - tsDiffPartsMap.put(new Timestamp(result.getLong(1)), parts); - - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - 
} - } - - return tsDiffPartsMap; - - } - catch (WikiApiException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); + if (result == null) { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); } - } + while (result.next()) { + // Write data from current revision to Map + authorTSMap.put(result.getString(1), new Timestamp(result.getLong(2))); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - /** - * Returns the timestamp of the first revision connected to the specified article. - * - * @param articleID - * ID of the article - * @return first date of appearance or the article does not exist - * - * @throws WikiApiException - * if an error occurs - */ - public Timestamp getFirstDateOfAppearance(final int articleID) - throws WikiApiException - { - return getDateOfAppearance(articleID, "FirstAppearance"); - } + return authorTSMap; - /** - * Returns the timestamp of the last revision connected to the specified article. 
- * - * @param articleID - * ID of the article - * @return last date of appearance or the article does not exist - * - * @throws WikiApiException - * if an error occurs - */ - public Timestamp getLastDateOfAppearance(final int articleID) - throws WikiApiException - { - return getDateOfAppearance(articleID, "LastAppearance"); + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the group assignments of the specified user + * + * @param userID ID of the user (NOT THE USERNAME) + * @return collection of user groups + * @throws WikiApiException if an error occurs + */ + public List<String> getUserGroups(final int userID) + throws WikiApiException { + + List<String> groups = new LinkedList<>(); + + try { + if (userID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the timestamp of the first or last revision connected to the specified article. - * - * @param articleID - * ID of the article - * @param firstOrLast - * <code>"FirstAppearance"</code> if first date of appearance should be returned. - * <code>"LastAppearance"</code> if last date of appearance should be returned. - * - * @return first date of appearance or the article does not exist - * - * @throws WikiApiException - * if an error occurs - */ - private Timestamp getDateOfAppearance(final int articleID, final String firstOrLast) - throws WikiApiException - { + if (!tableExists("user_groups")) { + throw new WikiInitializationException( + "User group assignment data is missing. 
Please download user_groups.sql for this Wikipedia from http://dumps.wikimedia.org and import the data into this database."); + } - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } + PreparedStatement statement = null; + ResultSet result = null; + + try { + statement = connection.prepareStatement("SELECT ug_group " + + "FROM user_groups WHERE ug_user=?"); + statement.setInt(1, userID); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("The user with the ID " + userID + + " was not found."); + } + while (result.next()) { + + groups.add(result.getString(1)); + + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return groups; - PreparedStatement statement = null; - ResultSet result = null; - long time; + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); + } + } + + /** + * Returns the revisionIds of all revisions created by given user + * + * @param userid id of the user (NOT USER NAME) + * @return Map of revision ids + * @throws WikiApiException if an error occurs + */ + public Map<Integer, List<Integer>> getUserRevisionIds(int userid) + throws WikiApiException { + + Map<Integer, List<Integer>> revIds = new HashMap<>(); + + try { + if (userid < 1) { + throw new IllegalArgumentException(); + } - try { - statement = this.connection.prepareStatement("SELECT " + firstOrLast - + " FROM index_articleID_rc_ts " + "WHERE ArticleID=? 
LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); + if (!indexExists("revisions", "userids")) { + System.err.println("You should create and index for the field ContributorID: create index userids ON revisions(ContributorId(15));"); + } - if (result.next()) { + PreparedStatement statement = null; + ResultSet result = null; - time = result.getLong(1); + try { + statement = connection.prepareStatement("SELECT ArticleID, RevisionID " + + "FROM revisions WHERE ContributorId=?"); + statement.setInt(1, userid); + result = statement.executeQuery(); - } - else { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("No revisions for user " + userid); + } + while (result.next()) { - return new Timestamp(time); + int artId = result.getInt(1); + int revId = result.getInt(2); + if (revIds.containsKey(artId)) { + revIds.get(artId).add(revId); + } else { + List<Integer> revList = new ArrayList<>(); + revList.add(revId); + revIds.put(artId, revList); + } } - catch (WikiApiException e) { - throw e; + } finally { + if (statement != null) { + statement.close(); } - catch (Exception e) { - throw new WikiApiException(e); + if (result != null) { + result.close(); } - } - - /** - * Returns the by the id specified revision. - * - * @param revisionID - * ID of the revision - * @return Revision - * - * @throws WikiApiException - * if an error occurs or the revision does not exists. 
- */ - public Revision getRevision(final int revisionID) - throws WikiApiException - { + } - try { - if (revisionID < 1) { - throw new IllegalArgumentException(); - } + return revIds; - int fullRevPK; - int limit; + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); + } + } + + /** + * Returns the revisionIds of all revisions created by given user + * + * @param username name of the user (NOT USER ID) + * @return Map of revision ids + * @throws WikiApiException if an error occurs + */ + public Map<Integer, List<Integer>> getUserRevisionIds(String username, int limit) + throws WikiApiException { + + Map<Integer, List<Integer>> revIds = new HashMap<>(); + + try { + if (username == null || username.isEmpty()) { + throw new IllegalArgumentException(); + } - PreparedStatement statement = null; - ResultSet result = null; + if (!indexExists("revisions", "usernames")) { + System.err + .println("You should create and index for the field ContributorName: create index usernames ON revisions(ContributorName(50));"); + } - try { - statement = this.connection.prepareStatement("SELECT FullRevisionPK, RevisionPK " - + "FROM index_revisionID " + "WHERE revisionID=? LIMIT 1"); - statement.setInt(1, revisionID); - result = statement.executeQuery(); + PreparedStatement statement = null; + ResultSet result = null; - if (result.next()) { - fullRevPK = result.getInt(1); - limit = (result.getInt(2) - fullRevPK) + 1; + try { + statement = connection.prepareStatement("SELECT ArticleID, RevisionID " + + "FROM revisions WHERE ContributorName=? 
LIMIT " + limit); + statement.setString(1, username); + result = statement.executeQuery(); - } - else { - throw new WikiPageNotFoundException("The revision with the ID " + revisionID - + " was not found."); - } + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("No revisions for user " + username); + } + while (result.next()) { - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + int artId = result.getInt(1); + int revId = result.getInt(2); - return buildRevisionMetaData(fullRevPK, limit); + if (revIds.containsKey(artId)) { + revIds.get(artId).add(revId); + } else { + List<Integer> revList = new ArrayList<>(); + revList.add(revId); + revIds.put(artId, revList); + } } - catch (WikiPageNotFoundException e) { - throw e; + } finally { + if (statement != null) { + statement.close(); } - catch (Exception e) { - throw new WikiApiException(e); + if (result != null) { + result.close(); } + } + + return revIds; + + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns a map of timestamps mapped on the corresponding DiffPart-Collections. Can be used to + * compile statistics over all changes that have been made in one article. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. 
+ * + * @param articleID ID of the article + * @return map of Timestamp-DiffPart-Collection pairs + * @throws WikiApiException if an error occurs + */ + public Map<Timestamp, Collection<DiffPart>> getTimestampToRevisionMap(final int articleID) + throws WikiApiException { + + Map<Timestamp, Collection<DiffPart>> tsDiffPartsMap = new HashMap<>(); + + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the pageId (ArticleId) for the given revision - * - * @param revisionID - * ID of the revision - * @return the page if for the given revision - * - * @throws WikiApiException - * if an error occurs or the revision does not exists. - */ - public int getPageIdForRevisionId(final int revisionID) - throws WikiApiException - { + PreparedStatement statement = null; + ResultSet result = null; + RevisionDecoder decoder = new RevisionDecoder(config.getCharacterSet()); - try { - if (revisionID < 1) { - throw new IllegalArgumentException(); - } + try { - int pageId; + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } - PreparedStatement statement = null; - ResultSet result = null; + statement = connection.prepareStatement("SELECT Timestamp, Revision " + + "FROM revisions WHERE ArticleID=?"); + statement.setInt(1, articleID); + result = statement.executeQuery(); - try { - statement = this.connection.prepareStatement("SELECT r.ArticleID " - + "FROM revisions as r, index_revisionID as idx " - + "WHERE idx.RevisionID=? 
AND idx.RevisionPK=r.PrimaryKey LIMIT 1"); - statement.setInt(1, revisionID); - result = statement.executeQuery(); + if (result == null) { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); + } + while (result.next()) { - if (result.next()) { - pageId = result.getInt(1); - } - else { - throw new WikiPageNotFoundException("The revision with the ID " + revisionID - + " was not found."); - } + // Decode String and create Diff-Object + boolean binaryData = result.getMetaData().getColumnType(2) == Types.LONGVARBINARY; + if (binaryData) { + decoder.setInput(result.getBinaryStream(2), true); + } else { + decoder.setInput(result.getString(2)); + } + Diff diff = decoder.decode(); - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + // Get DiffParts from Diff Object + Collection<DiffPart> parts = new LinkedList<>(); + Iterator<DiffPart> it = diff.iterator(); + while (it.hasNext()) { + parts.add(it.next()); + } - return pageId; + // Write data from current revision to Map + tsDiffPartsMap.put(new Timestamp(result.getLong(1)), parts); } - catch (WikiPageNotFoundException e) { - throw e; + } finally { + if (statement != null) { + statement.close(); } - catch (Exception e) { - throw new WikiApiException(e); + if (result != null) { + result.close(); } + } + + return tsDiffPartsMap; + + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the timestamp of the first revision connected to the specified article. 
+ * + * @param articleID ID of the article + * @return first date of appearance or the article does not exist + * @throws WikiApiException if an error occurs + */ + public Timestamp getFirstDateOfAppearance(final int articleID) + throws WikiApiException { + return getDateOfAppearance(articleID, "FirstAppearance"); + } + + /** + * Returns the timestamp of the last revision connected to the specified article. + * + * @param articleID ID of the article + * @return last date of appearance or the article does not exist + * @throws WikiApiException if an error occurs + */ + public Timestamp getLastDateOfAppearance(final int articleID) + throws WikiApiException { + return getDateOfAppearance(articleID, "LastAppearance"); + } + + /** + * Returns the timestamp of the first or last revision connected to the specified article. + * + * @param articleID ID of the article + * @param firstOrLast <code>"FirstAppearance"</code> if first date of appearance should be returned. + * <code>"LastAppearance"</code> if last date of appearance should be returned. + * @return first date of appearance or the article does not exist + * @throws WikiApiException if an error occurs + */ + private Timestamp getDateOfAppearance(final int articleID, final String firstOrLast) + throws WikiApiException { + + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the by the article ID and revisionCounter specified revision. Note that this method - * returns the revision in chronological order. - * - * @param articleID - * ID of the article - * @param revisionCounter - * number of revision - * @return Revision - * - * @throws WikiApiException - * if an error occurs or the revision does not exists. 
- */ - public Revision getRevision(final int articleID, final int revisionCounter) - throws WikiApiException - { + PreparedStatement statement = null; + ResultSet result = null; + long time; - try { - if (articleID < 1 || revisionCounter < 1) { - throw new IllegalArgumentException(); - } + try { + statement = this.connection.prepareStatement("SELECT " + firstOrLast + + " FROM index_articleID_rc_ts " + "WHERE ArticleID=? LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); + + if (result.next()) { - int revisionIndex = checkMapping(articleID, revisionCounter); - String fullRevisions, revCounters; + time = result.getLong(1); + + } else { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - PreparedStatement statement = null; - ResultSet result = null; + return new Timestamp(time); - try { - statement = this.connection - .prepareStatement("SELECT FullRevisionPKs, RevisionCounter FROM index_articleID_rc_ts WHERE ArticleID=? LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); + } catch (WikiApiException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); + } + } + + /** + * Returns the by the id specified revision. + * + * @param revisionID ID of the revision + * @return Revision + * @throws WikiApiException if an error occurs or the revision does not exists. 
+ */ + public Revision getRevision(final int revisionID) + throws WikiApiException { + + try { + if (revisionID < 1) { + throw new IllegalArgumentException(); + } - if (result.next()) { + int fullRevPK; + int limit; - fullRevisions = result.getString(1); - revCounters = result.getString(2); + PreparedStatement statement = null; + ResultSet result = null; - } - else { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + try { + statement = this.connection.prepareStatement("SELECT FullRevisionPK, RevisionPK " + + "FROM index_revisionID " + "WHERE revisionID=? LIMIT 1"); + statement.setInt(1, revisionID); + result = statement.executeQuery(); - return getReferencedRevision(articleID, revisionIndex, fullRevisions, revCounters); + if (result.next()) { + fullRevPK = result.getInt(1); + limit = (result.getInt(2) - fullRevPK) + 1; + } else { + throw new WikiPageNotFoundException("The revision with the ID " + revisionID + + " was not found."); } - catch (WikiPageNotFoundException e) { - throw e; + + } finally { + if (statement != null) { + statement.close(); } - catch (Exception e) { - throw new WikiApiException(e); + if (result != null) { + result.close(); } + } + + return buildRevisionMetaData(fullRevPK, limit); + + } catch (WikiPageNotFoundException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the pageId (ArticleId) for the given revision + * + * @param revisionID ID of the revision + * @return the page if for the given revision + * @throws WikiApiException if an error occurs or the revision does not exists. + */ + public int getPageIdForRevisionId(final int revisionID) + throws WikiApiException { + + try { + if (revisionID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the by the article ID and timestamp specified revision. 
Note that the timestamp is - * not an unique identifier of a revision related to an article. The returned revision should be - * the first revision that can be found inside the database. - * - * @param articleID - * ID of the article - * @param time - * Timestamp - * @return Revision - * - * @throws WikiApiException - * if an error occurs or the revision does not exists. - */ - public Revision getRevision(final int articleID, final Timestamp time) - throws WikiApiException - { + int pageId; - try { + PreparedStatement statement = null; + ResultSet result = null; + + try { + statement = this.connection.prepareStatement("SELECT r.ArticleID " + + "FROM revisions as r, index_revisionID as idx " + + "WHERE idx.RevisionID=? AND idx.RevisionPK=r.PrimaryKey LIMIT 1"); + statement.setInt(1, revisionID); + result = statement.executeQuery(); + + if (result.next()) { + pageId = result.getInt(1); + } else { + throw new WikiPageNotFoundException("The revision with the ID " + revisionID + + " was not found."); + } - PreparedStatement statement = null; - ResultSet result = null; - String fullRevisions; - String revisionCounters; - - if (articleID < 1 || time == null || time.getTime() <= 0) { - throw new IllegalArgumentException(); - } - - int firstPK, lastPK; - try { - statement = this.connection - .prepareStatement("SELECT FullRevisionPKs, RevisionCounter," - + " FirstAppearance " + "FROM index_articleID_rc_ts " - + "WHERE ArticleID=? 
LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - if (result.next()) { - - fullRevisions = result.getString(1); - revisionCounters = result.getString(2); - long firstDate = result.getLong(3); - - // Find first and last FullRevision PK - int max = fullRevisions.length(); - int index = fullRevisions.indexOf(' '); - if (index == -1) { - index = max; - } - - firstPK = Integer.parseInt(fullRevisions.substring(0, index)); - - index = revisionCounters.lastIndexOf(' ') + 1; - lastPK = firstPK - + Integer.parseInt(revisionCounters.substring(index, - revisionCounters.length())); - - if (time.getTime() < firstDate) { - throw new WikiPageNotFoundException("No revision before the " - + "specified date [" + time + "]"); - } - } - else { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - try { - statement = this.connection - .prepareStatement("SELECT RevisionCounter FROM revisions WHERE PrimaryKey >= ? AND PrimaryKey < ? AND Timestamp <= ? 
ORDER BY Timestamp DESC LIMIT 1"); - statement.setInt(1, firstPK); - statement.setInt(2, lastPK); - statement.setLong(3, time.getTime()); - result = statement.executeQuery(); - - if (result.next()) { - int revisionCount = result.getInt(1); - return getReferencedRevision(articleID, revisionCount, fullRevisions, - revisionCounters); - } - else { - throw new WikiPageNotFoundException( - "The revision with the specified timestamp was not found."); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - } - catch (WikiPageNotFoundException e) { - throw e; - } - catch (Exception e) { - throw new WikiApiException(e); + } finally { + if (statement != null) { + statement.close(); } + if (result != null) { + result.close(); + } + } + + return pageId; + + } catch (WikiPageNotFoundException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the by the article ID and revisionCounter specified revision. Note that this method + * returns the revision in chronological order. + * + * @param articleID ID of the article + * @param revisionCounter number of revision + * @return Revision + * @throws WikiApiException if an error occurs or the revision does not exists. + */ + public Revision getRevision(final int articleID, final int revisionCounter) + throws WikiApiException { + + try { + if (articleID < 1 || revisionCounter < 1) { + throw new IllegalArgumentException(); + } - /*--------------------------------------------------------------------------*/ - /* Internal methods */ - /*--------------------------------------------------------------------------*/ - - /** - * This method maps the chronological order to the revisionCounter. - * - * @param articleID - * ID of the article - * @param revisionCounter - * chronological position - * - * @return position in the chronological order - * - * @throws SQLException - * if an error occurs while accesing the database. 
- */ - protected int checkMapping(final int articleID, final int revisionCounter) - throws SQLException - { - - PreparedStatement statement = null; - ResultSet result = null; - - // Check for the correct revisionCounter mapping - try { - statement = this.connection.prepareStatement("SELECT Mapping " - + "FROM index_chronological " + "WHERE ArticleID=? LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); + int revisionIndex = checkMapping(articleID, revisionCounter); + String fullRevisions, revCounters; - if (result.next()) { + PreparedStatement statement = null; + ResultSet result = null; - String mapping = result.getString(1); - return getMapping(mapping, revisionCounter); + try { + statement = this.connection + .prepareStatement("SELECT FullRevisionPKs, RevisionCounter FROM index_articleID_rc_ts WHERE ArticleID=? LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); - } + if (result.next()) { + + fullRevisions = result.getString(1); + revCounters = result.getString(2); + + } else { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); + } + } finally { + if (statement != null) { + statement.close(); } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } + if (result != null) { + result.close(); } + } + + return getReferencedRevision(articleID, revisionIndex, fullRevisions, revCounters); - return revisionCounter; + } catch (WikiPageNotFoundException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the by the article ID and timestamp specified revision. Note that the timestamp is + * not an unique identifier of a revision related to an article. The returned revision should be + * the first revision that can be found inside the database. 
+ * + * @param articleID ID of the article + * @param time Timestamp + * @return Revision + * @throws WikiApiException if an error occurs or the revision does not exists. + */ + public Revision getRevision(final int articleID, final Timestamp time) + throws WikiApiException { + + try { + + PreparedStatement statement = null; + ResultSet result = null; + String fullRevisions; + String revisionCounters; + + if (articleID < 1 || time == null || time.getTime() <= 0) { + throw new IllegalArgumentException(); + } - /** - * This method maps the revisionCounter to the chronological order. - * - * @param articleID - * ID of the article - * @param revisionCounter - * chronological position - * - * @return position in the chronological order - * - * @throws SQLException - * if an error occurs while accesing the database. - * - * @deprecated this method should only be used for internal processes - */ - @Deprecated - public int checkReverseMapping(final int articleID, final int revisionCounter) - throws SQLException - { - - PreparedStatement statement = null; - ResultSet result = null; - - // Check for the correct revisionCounter mapping - try { - statement = this.connection - .prepareStatement("SELECT ReverseMapping FROM index_chronological WHERE ArticleID=? LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); + int firstPK, lastPK; + try { + statement = this.connection + .prepareStatement("SELECT FullRevisionPKs, RevisionCounter," + + " FirstAppearance " + "FROM index_articleID_rc_ts " + + "WHERE ArticleID=? 
LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); + + if (result.next()) { + + fullRevisions = result.getString(1); + revisionCounters = result.getString(2); + long firstDate = result.getLong(3); + + // Find first and last FullRevision PK + int max = fullRevisions.length(); + int index = fullRevisions.indexOf(' '); + if (index == -1) { + index = max; + } - if (result.next()) { + firstPK = Integer.parseInt(fullRevisions.substring(0, index)); - String mapping = result.getString(1); - return getMapping(mapping, revisionCounter); + index = revisionCounters.lastIndexOf(' ') + 1; + lastPK = firstPK + + Integer.parseInt(revisionCounters.substring(index, + revisionCounters.length())); - } + if (time.getTime() < firstDate) { + throw new WikiPageNotFoundException("No revision before the " + + "specified date [" + time + "]"); + } + } else { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " was not found."); + } + } finally { + if (statement != null) { + statement.close(); } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } + if (result != null) { + result.close(); } + } + try { + statement = this.connection + .prepareStatement("SELECT RevisionCounter FROM revisions WHERE PrimaryKey >= ? AND PrimaryKey < ? AND Timestamp <= ? 
ORDER BY Timestamp DESC LIMIT 1"); + statement.setInt(1, firstPK); + statement.setInt(2, lastPK); + statement.setLong(3, time.getTime()); + result = statement.executeQuery(); + + if (result.next()) { + int revisionCount = result.getInt(1); + return getReferencedRevision(articleID, revisionCount, fullRevisions, + revisionCounters); + } else { + throw new WikiPageNotFoundException( + "The revision with the specified timestamp was not found."); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - return revisionCounter; + } catch (WikiPageNotFoundException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /*--------------------------------------------------------------------------*/ + /* Internal methods */ + /*--------------------------------------------------------------------------*/ + + /** + * This method maps the chronological order to the revisionCounter. + * + * @param articleID ID of the article + * @param revisionCounter chronological position + * @return position in the chronological order + * @throws SQLException if an error occurs while accesing the database. + */ + protected int checkMapping(final int articleID, final int revisionCounter) + throws SQLException { + + PreparedStatement statement = null; + ResultSet result = null; + + // Check for the correct revisionCounter mapping + try { + statement = this.connection.prepareStatement("SELECT Mapping " + + "FROM index_chronological " + "WHERE ArticleID=? LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); + + if (result.next()) { + + String mapping = result.getString(1); + return getMapping(mapping, revisionCounter); - /** - * This method returns the correct mapping of the given input. 
- * - * @param mapping - * mapping sequence - * @param revisionCounter - * index to map - * @return mapped index - */ - private int getMapping(final String mapping, final int revisionCounter) - { + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - String tempA, tempB; + return revisionCounter; + } + + /** + * This method maps the revisionCounter to the chronological order. + * + * @param articleID ID of the article + * @param revisionCounter chronological position + * @return position in the chronological order + * @throws SQLException if an error occurs while accesing the database. + * @deprecated this method should only be used for internal processes + */ + @Deprecated + public int checkReverseMapping(final int articleID, final int revisionCounter) + throws SQLException { + + PreparedStatement statement = null; + ResultSet result = null; + + // Check for the correct revisionCounter mapping + try { + statement = this.connection + .prepareStatement("SELECT ReverseMapping FROM index_chronological WHERE ArticleID=? LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); + + if (result.next()) { + + String mapping = result.getString(1); + return getMapping(mapping, revisionCounter); - int length = 0; - int revC = -1, mapC = -1; - int index, max = mapping.length(); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - while (length < max && revC < revisionCounter) { + return revisionCounter; + } - // Read revisionCounter - index = mapping.indexOf(' ', length); - tempA = mapping.substring(length, index); - length = index + 1; + /** + * This method returns the correct mapping of the given input. 
+ * + * @param mapping mapping sequence + * @param revisionCounter index to map + * @return mapped index + */ + private int getMapping(final String mapping, final int revisionCounter) { - // Read mappedCounter - index = mapping.indexOf(' ', length); - if (index == -1) { - index = mapping.length(); - } - tempB = mapping.substring(length, index); - length = index + 1; + String tempA, tempB; - // Parse values - revC = Integer.parseInt(tempA); - mapC = Integer.parseInt(tempB); + int length = 0; + int revC = -1, mapC = -1; + int index, max = mapping.length(); - // System.out.println(revC + " -> " + mapC); - } + while (length < max && revC < revisionCounter) { - if (revC == revisionCounter) { - // System.out.println(revC + " >> " + mapC); - return mapC; - } + // Read revisionCounter + index = mapping.indexOf(' ', length); + tempA = mapping.substring(length, index); + length = index + 1; - return revisionCounter; - } + // Read mappedCounter + index = mapping.indexOf(' ', length); + if (index == -1) { + index = mapping.length(); + } + tempB = mapping.substring(length, index); + length = index + 1; - /** - * This method identifies the correct full revision and retrieves the reference revision. 
- * - * @param articleID - * ID of the article - * @param revisionIndex - * number of revision - * @param fullRevisions - * list of full revisions - * @param revCounters - * list of revision counters - * @return Revision - * - * @throws WikiApiException - * if an error occurs - */ - private Revision getReferencedRevision(final int articleID, final int revisionIndex, - final String fullRevisions, final String revCounters) - throws WikiApiException - { + // Parse values + revC = Integer.parseInt(tempA); + mapC = Integer.parseInt(tempB); - try { - int fullRevPK; - int limit; + // System.out.println(revC + " -> " + mapC); + } + + if (revC == revisionCounter) { + // System.out.println(revC + " >> " + mapC); + return mapC; + } - String fullRev = null; + return revisionCounter; + } + + /** + * This method identifies the correct full revision and retrieves the reference revision. + * + * @param articleID ID of the article + * @param revisionIndex number of revision + * @param fullRevisions list of full revisions + * @param revCounters list of revision counters + * @return Revision + * @throws WikiApiException if an error occurs + */ + private Revision getReferencedRevision(final int articleID, final int revisionIndex, + final String fullRevisions, final String revCounters) + throws WikiApiException { + + try { + int fullRevPK; + int limit; + + String fullRev = null; + + int revA = -1, revB = -1; + int lengthFR = 0; + int lengthRC = 0; + int index; + int max = fullRevisions.length(); + + while (lengthFR < max && revB < revisionIndex) { + + // Read fullRevisionPK (as string) + index = fullRevisions.indexOf(' ', lengthFR); + if (index == -1) { + index = max; + } + + fullRev = fullRevisions.substring(lengthFR, index); + lengthFR = index + 1; + + // Read start revision counter + index = revCounters.indexOf(' ', lengthRC); + revA = Integer.parseInt(revCounters.substring(lengthRC, index)); + lengthRC = index + 1; + + // Read end revision counter + index = revCounters.indexOf(' ', 
lengthRC); + if (index == -1) { + index = revCounters.length(); + } + revB = Integer.parseInt(revCounters.substring(lengthRC, index)); + lengthRC = index + 1; + } - int revA = -1, revB = -1; - int lengthFR = 0; - int lengthRC = 0; - int index; - int max = fullRevisions.length(); + if (revisionIndex > revB) { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " has no revision number " + revisionIndex); + } - while (lengthFR < max && revB < revisionIndex) { + fullRevPK = Integer.parseInt(fullRev); + limit = (revisionIndex - revA) + 1; - // Read fullRevisionPK (as string) - index = fullRevisions.indexOf(' ', lengthFR); - if (index == -1) { - index = max; - } + // Build the revision + return buildRevisionMetaData(fullRevPK, limit); - fullRev = fullRevisions.substring(lengthFR, index); - lengthFR = index + 1; + } catch (WikiPageNotFoundException e) { + throw e; + } catch (Exception e) { + throw new WikiApiException(e); + } + } - // Read start revision counter - index = revCounters.indexOf(' ', lengthRC); - revA = Integer.parseInt(revCounters.substring(lengthRC, index)); - lengthRC = index + 1; + /** + * This method queries and builds the specified revision. 
+ * + * @param revision + */ + public void setRevisionTextAndParts(Revision revision) { - // Read end revision counter - index = revCounters.indexOf(' ', lengthRC); - if (index == -1) { - index = revCounters.length(); - } - revB = Integer.parseInt(revCounters.substring(lengthRC, index)); - lengthRC = index + 1; - } + try { - if (revisionIndex > revB) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " has no revision number " + revisionIndex); - } + PreparedStatement statement = null; + ResultSet result = null; - fullRevPK = Integer.parseInt(fullRev); - limit = (revisionIndex - revA) + 1; + int fullRevPK; + int limit; + try { + statement = this.connection.prepareStatement("SELECT FullRevisionPK, RevisionPK " + + "FROM index_revisionID " + "WHERE revisionID=? LIMIT 1"); + statement.setInt(1, revision.getRevisionID()); + result = statement.executeQuery(); - // Build the revision - return buildRevisionMetaData(fullRevPK, limit); + if (result.next()) { + fullRevPK = result.getInt(1); + limit = (result.getInt(2) - fullRevPK) + 1; + } else { + throw new WikiPageNotFoundException("The revision with the ID " + + revision.getRevisionID() + " was not found."); } - catch (WikiPageNotFoundException e) { - throw e; + } finally { + if (statement != null) { + statement.close(); } - catch (Exception e) { - throw new WikiApiException(e); + if (result != null) { + result.close(); } - } + } - /** - * This method queries and builds the specified revision. - * @param revision - */ - public void setRevisionTextAndParts(Revision revision) - { + try { + statement = this.connection + .prepareStatement("SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp, Comment, Minor, ContributorName, ContributorId, ContributorIsRegistered " + + "FROM revisions " + "WHERE PrimaryKey >= ? 
LIMIT " + limit); + statement.setInt(1, fullRevPK); + result = statement.executeQuery(); - try { + String previousRevision = null, currentRevision = null; - PreparedStatement statement = null; - ResultSet result = null; - - int fullRevPK; - int limit; - try { - statement = this.connection.prepareStatement("SELECT FullRevisionPK, RevisionPK " - + "FROM index_revisionID " + "WHERE revisionID=? LIMIT 1"); - statement.setInt(1, revision.getRevisionID()); - result = statement.executeQuery(); - - if (result.next()) { - fullRevPK = result.getInt(1); - limit = (result.getInt(2) - fullRevPK) + 1; - - } - else { - throw new WikiPageNotFoundException("The revision with the ID " - + revision.getRevisionID() + " was not found."); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - try { - statement = this.connection - .prepareStatement("SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp, Comment, Minor, ContributorName, ContributorId, ContributorIsRegistered " - + "FROM revisions " + "WHERE PrimaryKey >= ? 
LIMIT " + limit); - statement.setInt(1, fullRevPK); - result = statement.executeQuery(); - - String previousRevision = null, currentRevision = null; - - Diff diff = null; - RevisionDecoder decoder; - - boolean binaryData = result.getMetaData().getColumnType(1) == Types.LONGVARBINARY; - - while (result.next()) { - - decoder = new RevisionDecoder(config.getCharacterSet()); - - if (binaryData) { - decoder.setInput(result.getBinaryStream(1), true); - } - else { - decoder.setInput(result.getString(1)); - } - - diff = decoder.decode(); - currentRevision = diff.buildRevision(previousRevision); - - previousRevision = currentRevision; - } - - Collection<DiffPart> parts = new LinkedList<>(); - Iterator<DiffPart> it = diff.iterator(); - while (it.hasNext()) { - parts.add(it.next()); - } - - revision.setParts(parts); - revision.setRevisionText(currentRevision); - - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - } - catch (WikiPageNotFoundException | DecodingException | SQLException | IOException e) { - throw new RuntimeException(e); - } - catch (RuntimeException e) { - throw e; - } + Diff diff = null; + RevisionDecoder decoder; - } + boolean binaryData = result.getMetaData().getColumnType(1) == Types.LONGVARBINARY; - /** - * This method queries and builds the specified revision. - * - * @param fullRevPK - * PK of the full revision - * @param limit - * number of revision to query - * @return Revision - * - * @throws SQLException - * if an error occurs while retrieving data from the sql database. 
- */ - private Revision buildRevisionMetaData(final int fullRevPK, final int limit) throws SQLException - { - - PreparedStatement statement = null; - ResultSet result = null; + while (result.next()) { - try { - String query = "SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp, Comment, Minor, ContributorName, ContributorId, ContributorIsRegistered " - + "FROM revisions " + "WHERE PrimaryKey >= ? LIMIT " + limit; - - /* - * As HSQL does not support ResultSet.last() per default, we have to specify these extra parameters here. - * - * With these parameters in place, the 'last()' call works as expected. - * - * See also: https://stackoverflow.com/q/19533991 - */ - statement = this.connection.prepareStatement(query, ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_READ_ONLY); - statement.setInt(1, fullRevPK); - result = statement.executeQuery(); - - Revision revision = null; - - if (result.last()) { - revision = new Revision(result.getInt(3), this); - - revision.setPrimaryKey(result.getInt(2)); - revision.setRevisionID(result.getInt(4)); - revision.setArticleID(result.getInt(5)); - revision.setTimeStamp(new Timestamp(result.getLong(6))); - revision.setComment(result.getString(7)); - revision.setMinor(result.getBoolean(8)); - revision.setContributorName(result.getString(9)); - - // we should not use getInt(), because result may be null - String contribIdString = result.getString(10); - Integer contributorId = contribIdString == null ? 
null : Integer - .parseInt(contribIdString); - revision.setContributorId(contributorId); - - revision.setContributorIsRegistered(result.getBoolean(11)); - } - return revision; - - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + decoder = new RevisionDecoder(config.getCharacterSet()); - } + if (binaryData) { + decoder.setInput(result.getBinaryStream(1), true); + } else { + decoder.setInput(result.getString(1)); + } - /** - * Checks if some index (besides the PRIMARY-Index) exists in a given table. - * - * @param table - * the table to check - * @return {@code true} if index exists, false else - * @throws SQLException - * if an error occurs connecting to or querying the db - */ - private boolean indexExists(String table) - throws SQLException - { - return indexExists(table, null); - } + diff = decoder.decode(); + currentRevision = diff.buildRevision(previousRevision); - /** - * Checks if an index with a specific name exists in a given table. - * - * @param table - * the table to check - * @param indexName - * the name of the index (may be null) - * @return {@code true} if index exists, false else - * @throws SQLException - * if an error occurs connecting to or querying the db - */ - private boolean indexExists(String table, String indexName) - throws SQLException - { - - try (PreparedStatement statement = this.connection.prepareStatement("SHOW INDEX FROM " + table - + " WHERE Key_name!= 'PRIMARY'"); ResultSet result = statement.executeQuery()) { - - // Check if an index exists (because otherwise the query would - // be awfully slow. Note that the existence of ANY index will - // suffice - we might want to check for a specific index. - if (result == null || !result.next()) { - return false; - } - - /* - * SOME INDEX EXISTS! 
We can now check for the existence of a specific index - */ - if (indexName != null) { - // go back to first result - - result.first(); - // check all existing indexes for the specific index name - boolean specificIndexExists = false; - while (result.next()) { - if (result.getString(3).equals(indexName)) { - specificIndexExists = true; - } - } - return specificIndexExists ? true : false; + previousRevision = currentRevision; + } - } else { - // we have an index, but don't want to check for an index with - // a specific name + Collection<DiffPart> parts = new LinkedList<>(); + Iterator<DiffPart> it = diff.iterator(); + while (it.hasNext()) { + parts.add(it.next()); + } - return true; + revision.setParts(parts); + revision.setRevisionText(currentRevision); + + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); } } + } catch (WikiPageNotFoundException | DecodingException | SQLException | IOException e) { + throw new RuntimeException(e); + } catch (RuntimeException e) { + throw e; + } + + } + + /** + * This method queries and builds the specified revision. + * + * @param fullRevPK PK of the full revision + * @param limit number of revision to query + * @return Revision + * @throws SQLException if an error occurs while retrieving data from the sql database. + */ + private Revision buildRevisionMetaData(final int fullRevPK, final int limit) throws SQLException { + + PreparedStatement statement = null; + ResultSet result = null; + + try { + String query = "SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp, Comment, Minor, ContributorName, ContributorId, ContributorIsRegistered " + + "FROM revisions " + "WHERE PrimaryKey >= ? LIMIT " + limit; + + /* + * As HSQL does not support ResultSet.last() per default, we have to specify these extra parameters here. + * + * With these parameters in place, the 'last()' call works as expected. 
+ * + * See also: https://stackoverflow.com/q/19533991 + */ + statement = this.connection.prepareStatement(query, ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_READ_ONLY); + statement.setInt(1, fullRevPK); + result = statement.executeQuery(); + + Revision revision = null; + + if (result.last()) { + revision = new Revision(result.getInt(3), this); + + revision.setPrimaryKey(result.getInt(2)); + revision.setRevisionID(result.getInt(4)); + revision.setArticleID(result.getInt(5)); + revision.setTimeStamp(new Timestamp(result.getLong(6))); + revision.setComment(result.getString(7)); + revision.setMinor(result.getBoolean(8)); + revision.setContributorName(result.getString(9)); + + // we should not use getInt(), because result may be null + String contribIdString = result.getString(10); + Integer contributorId = contribIdString == null ? null : Integer + .parseInt(contribIdString); + revision.setContributorId(contributorId); + + revision.setContributorIsRegistered(result.getBoolean(11)); + } + return revision; + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } } - /** - * Checks if a specific table exists - * - * @param table - * the table to check - * - * @return {@code true} if table exists, false else - * @throws SQLException - * if an error occurs connecting to or querying the db - */ - private boolean tableExists(String table) - throws SQLException - { - - try (PreparedStatement statement = this.connection.prepareStatement("SHOW TABLES;"); ResultSet result = statement.executeQuery()) { + } + + /** + * Checks if some index (besides the PRIMARY-Index) exists in a given table. 
+ * + * @param table the table to check + * @return {@code true} if index exists, false else + * @throws SQLException if an error occurs connecting to or querying the db + */ + private boolean indexExists(String table) + throws SQLException { + return indexExists(table, null); + } + + /** + * Checks if an index with a specific name exists in a given table. + * + * @param table the table to check + * @param indexName the name of the index (may be null) + * @return {@code true} if index exists, false else + * @throws SQLException if an error occurs connecting to or querying the db + */ + private boolean indexExists(String table, String indexName) + throws SQLException { + + try (PreparedStatement statement = this.connection.prepareStatement("SHOW INDEX FROM " + table + + " WHERE Key_name!= 'PRIMARY'"); ResultSet result = statement.executeQuery()) { + + // Check if an index exists (because otherwise the query would + // be awfully slow. Note that the existence of ANY index will + // suffice - we might want to check for a specific index. + if (result == null || !result.next()) { + return false; + } + + /* + * SOME INDEX EXISTS! We can now check for the existence of a specific index + */ + if (indexName != null) { + // go back to first result - if (result == null) { - return false; - } - boolean found = false; + result.first(); + // check all existing indexes for the specific index name + boolean specificIndexExists = false; while (result.next()) { - if (table.equalsIgnoreCase(result.getString(1))) { - found = true; + if (result.getString(3).equals(indexName)) { + specificIndexExists = true; } } - return found; + return specificIndexExists ? 
true : false; - } + } else { + // we have an index, but don't want to check for an index with + // a specific name + return true; + } } - public RevisionAPIConfiguration getRevisionApiConfiguration(){ - return this.config; - } + } + + /** + * Checks if a specific table exists + * + * @param table the table to check + * @return {@code true} if table exists, false else + * @throws SQLException if an error occurs connecting to or querying the db + */ + private boolean tableExists(String table) + throws SQLException { + + try (PreparedStatement statement = this.connection.prepareStatement("SHOW TABLES;"); ResultSet result = statement.executeQuery()) { + + if (result == null) { + return false; + } + boolean found = false; + while (result.next()) { + if (table.equalsIgnoreCase(result.getString(1))) { + found = true; + } + } + return found; - public Connection getConnection(){ - return this.connection; } - @Deprecated // This should go into a demo or test class separated from the code here... - public static void main(String[] args) - throws Exception - { + } - RevisionAPIConfiguration config = new RevisionAPIConfiguration(); + public RevisionAPIConfiguration getRevisionApiConfiguration() { + return this.config; + } - config.setHost("localhost"); - config.setDatabase("en_wiki"); - config.setUser("root"); - config.setPassword("1234"); + public Connection getConnection() { + return this.connection; + } - config.setCharacterSet("UTF-8"); - config.setBufferSize(20000); - config.setMaxAllowedPacket(1024 * 1024); + @Deprecated // This should go into a demo or test class separated from the code here... 
+ public static void main(String[] args) + throws Exception { - RevisionApi rev = new RevisionApi(config); + RevisionAPIConfiguration config = new RevisionAPIConfiguration(); - Revision r; + config.setHost("localhost"); + config.setDatabase("en_wiki"); + config.setUser("root"); + config.setPassword("1234"); - // System.out.println(rev.getNumberOfRevisions(12)); - // System.out.println(rev.getFirstDateOfAppearance(12)); - // System.out.println(rev.getLastDateOfAppearance(12)); + config.setCharacterSet("UTF-8"); + config.setBufferSize(20000); + config.setMaxAllowedPacket(1024 * 1024); - // r = rev.getRevision(31596, new Timestamp(1011743960000l)); - r = rev.getRevision(233181); + RevisionApi rev = new RevisionApi(config); - System.out.println(r.toString() + "\t" + r.getRevisionText()); - // System.out.println(rev.getRevision(979005).getRevisionText()); - // System.out.println(rev.getRevision(2, new - // Timestamp(1216747716000l)).getRevisionText()); + Revision r; - } + // System.out.println(rev.getNumberOfRevisions(12)); + // System.out.println(rev.getFirstDateOfAppearance(12)); + // System.out.println(rev.getLastDateOfAppearance(12)); + + // r = rev.getRevision(31596, new Timestamp(1011743960000l)); + r = rev.getRevision(233181); + + System.out.println(r.toString() + "\t" + r.getRevisionText()); + // System.out.println(rev.getRevision(979005).getRevisionText()); + // System.out.println(rev.getRevision(2, new + // Timestamp(1216747716000l)).getRevisionText()); + + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionDataInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionDataInterface.java index 7f9aab34..534cc71b 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionDataInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionDataInterface.java @@ -2,13 +2,13 @@ * Licensed to the 
Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,76 +21,71 @@ /** * This interface contains method to access the additional data of a revision. - * - * - * */ -public interface RevisionDataInterface -{ +public interface RevisionDataInterface { - /** - * Returns the ID of the article. - * - * @return ID of the article - */ + /** + * Returns the ID of the article. + * + * @return ID of the article + */ int getArticleID(); - /** - * Returns the ID of the revision. - * - * @return ID of the revision - */ + /** + * Returns the ID of the revision. + * + * @return ID of the revision + */ int getRevisionID(); - /** - * Returns the timestamp - * - * @return timestamp - */ + /** + * Returns the timestamp + * + * @return timestamp + */ Timestamp getTimeStamp(); - /** - * Returns the revision counter - * - * @return revision counter - */ + /** + * Returns the revision counter + * + * @return revision counter + */ int getRevisionCounter(); - /** - * Returns the user comment for this revision - * - * - * @return the user comment for this revision - */ + /** + * Returns the user comment for this revision + * + * @return the user comment for this revision + */ String getComment(); - /** - * Returns true if revision is a minor revision. 
- * - * @return true if revision is a minor revision, false else - */ + /** + * Returns true if revision is a minor revision. + * + * @return true if revision is a minor revision, false else + */ boolean isMinor(); - /** - * Returns the contributorID of the revision contributor - * Unregistered users do not have an id, so the return value might be null. - * - * @return the contributorID of the revision contributor or null, if user does not have an id (= is not registered) - */ + /** + * Returns the contributorID of the revision contributor + * Unregistered users do not have an id, so the return value might be null. + * + * @return the contributorID of the revision contributor or null, if user does not have an id (= is not registered) + */ Integer getContributorId(); - /** - * Returns the contributorName of the revision contributor - * - * @return the contributorName of the revision contributor - */ + /** + * Returns the contributorName of the revision contributor + * + * @return the contributorName of the revision contributor + */ String getContributorName(); - /** - * Returns true, if the contributor is a registered user - * - * @return true, if the contributor is a registered user, false else - */ + /** + * Returns true, if the contributor is a registered user + * + * @return true, if the contributor is a registered user, false else + */ boolean contributorIsRegistered(); } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIterator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIterator.java index d61acb95..a347ac38 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIterator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIterator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -39,445 +39,420 @@ * Part of the JWPL Revision API * <p> * This class represents the interface to iterate through multiple revisions. - * */ -public class RevisionIterator extends AbstractRevisionService implements RevisionIteratorInterface -{ - - private static final Logger logger = LoggerFactory.getLogger(RevisionIterator.class); - - /** Reference to the ResultSet */ - private ResultSet result; - - /** Reference to the Statement */ - private PreparedStatement statement; - - /** Binary Data Flag */ - private boolean binaryData; - - /** Text of the previous revision */ - private String previousRevision; - - /** Current primary key */ - private int primaryKey; - - /** Primary key indicating the end of the data */ - private int endPK; - - /** ID of the current article */ - private int currentArticleID; - - /** The last known revision counter */ - private int currentRevCounter; - - /** Configuration parameter - indicates the maximum size of a querry. */ - private final int MAX_NUMBER_RESULTS; - - /** Should load revision text? 
*/ - private boolean shouldLoadRevisionText; - - /** - * The revisionapi for this iterator - used by the Revision object - * in case of lazy loading - */ - private RevisionApi revApi= null; - - public boolean shouldLoadRevisionText() - { - return shouldLoadRevisionText; - } - - public void setShouldLoadRevisionText(boolean shouldLoadRevisionText) - { - this.shouldLoadRevisionText = shouldLoadRevisionText; - } - - /** - * (Constructor) Creates a new RevisionIterator object. - * - * @param config - * Reference to the configuration object - * @param startPK - * Start index - * @param endPK - * End index - * @param connection - * Reference to the connection - * - * @throws WikiApiException - * if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config, - final int startPK, final int endPK, final Connection connection) - throws WikiApiException - { - - if (startPK < 0 || endPK < 0 || startPK > endPK || connection == null) { - throw new IllegalArgumentException("Illegal argument"); - } - - this.primaryKey = startPK - 1; - this.endPK = endPK; - this.config = config; - - this.currentArticleID = -1; - this.currentRevCounter = -1; - - MAX_NUMBER_RESULTS = config.getBufferSize(); - - this.connection = connection; - } - - /** - * (Constructor) Creates a new RevisionIterator object. - * - * @param config - * Reference to the configuration object - * @param startPK - * Start index - * - * @throws WikiApiException - * if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config, final int startPK) - throws WikiApiException - { - - this(config); - - if (startPK < 0) { - throw new IllegalArgumentException("Illegal argument"); - } - - this.primaryKey = startPK - 1; - } - - /** - * (Constructor) Creates a new RevisionIterator object. 
- * - * @param config - * Reference to the configuration object - * @param startPK - * Start index - * @param endPK - * End index - * - * @throws WikiApiException - * if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config, - final int startPK, final int endPK) - throws WikiApiException - { - - this(config, startPK); - - if (endPK < 0 || startPK > endPK) { - throw new IllegalArgumentException("Illegal argument"); - } - - this.endPK = endPK; - } - - /** - * (Constructor) Creates a new RevisionIterator object. - * - * @param config - * Reference to the configuration object - * - * @throws WikiApiException - * if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config) - throws WikiApiException - { - - this.config = config; - this.primaryKey = -1; - this.endPK = Integer.MAX_VALUE; - - this.statement = null; - this.result = null; - this.previousRevision = null; - MAX_NUMBER_RESULTS = config.getBufferSize(); - - connection = getConnection(config); - } - - /** - * (Constructor) Creates a new RevisionIterator object. 
- * - * @param config - * Reference to the configuration object - * @param shouldLoadRevisionText - * should load revision text - * @throws WikiApiException - * if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config, - boolean shouldLoadRevisionText) - throws WikiApiException - { - this(config); - this.shouldLoadRevisionText = shouldLoadRevisionText; - } - - public RevisionIterator(final DatabaseConfiguration db) - throws WikiApiException - { - this(getRevisionAPIConfig(db)); - } - - private static RevisionAPIConfiguration getRevisionAPIConfig( - final DatabaseConfiguration db) - { - RevisionAPIConfiguration revAPIConfig = new RevisionAPIConfiguration(); - - revAPIConfig.setHost(db.getHost()); - revAPIConfig.setDatabase(db.getDatabase()); - revAPIConfig.setDatabaseDriver(db.getDatabaseDriver()); - revAPIConfig.setJdbcURL(db.getJdbcURL()); - revAPIConfig.setUser(db.getUser()); - revAPIConfig.setPassword(db.getPassword()); - revAPIConfig.setLanguage(db.getLanguage()); - - return revAPIConfig; - } - - /** - * Sends the query to the database and stores the result. The {@link java.sql.Statement} and - * {@link ResultSet} connection will not be closed. - * - * @return {@code true}, if the result set has another element {@code false}, otherwise - * - * @throws SQLException - * if an error occurs while accessing the database. - */ - private boolean query() - throws SQLException - { - String query = "SELECT PrimaryKey, Revision, RevisionCounter," - + " RevisionID, ArticleID, Timestamp, FullRevisionID, ContributorName, ContributorId, Comment, Minor, ContributorIsRegistered " - + "FROM revisions"; - - if (primaryKey > 0) { - query += " WHERE PrimaryKey > " + primaryKey; - } - - if (MAX_NUMBER_RESULTS > 0) { - query += " LIMIT "; - - if (primaryKey + MAX_NUMBER_RESULTS > endPK) { - query += (endPK - primaryKey + 1); // TODO: +1 ? 
- } - else { - query += MAX_NUMBER_RESULTS; - } - - } - else if (endPK != Integer.MAX_VALUE) { - query += " LIMIT " + (endPK - primaryKey + 1); - } - - try{ - statement=this.connection.prepareStatement(query); - result = statement.executeQuery(); - }catch(Exception e){ - logger.error(e.getLocalizedMessage(), e); - try { - boolean connectionReady = !connection.isClosed() && connection.isValid(5); - logger.debug("Connection ready: {}", connectionReady); - if(!connectionReady) { - connection = getConnection(config); - } - statement = this.connection.prepareStatement(query); - result = statement.executeQuery(query); - } catch (WikiApiException wae) { - logger.error(wae.getLocalizedMessage(), wae); - } - } - - - if (result.next()) { - binaryData = result.getMetaData().getColumnType(2) == Types.LONGVARBINARY; - return true; - } - - return false; - } - - /** - * Returns the next revision. - * - * @return next revision - */ - @Override - public Revision next() - { - try { - - int revCount, articleID; - - revCount = result.getInt(3); - articleID = result.getInt(5); - - if (articleID != this.currentArticleID) { - this.currentRevCounter = 0; - this.currentArticleID = articleID; - } - - if (revCount - 1 != this.currentRevCounter) { - - logger.error("Invalid RevCounter -" + " [ArticleId " - + articleID + ", RevisionId " + result.getInt(4) - + ", RevisionCounter " +revCount + "] - Expected: " - + (this.currentRevCounter + 1)); - - this.currentRevCounter = revCount; - this.previousRevision = null; - - return null; - } - - this.currentRevCounter = revCount; - this.primaryKey = result.getInt(1); - - Revision revision = new Revision(revCount); - revision.setPrimaryKey(this.primaryKey); - if (!shouldLoadRevisionText) { - String currentRevision; - - Diff diff; - RevisionDecoder decoder = new RevisionDecoder( - config.getCharacterSet()); - - if (binaryData) { - decoder.setInput(result.getBinaryStream(2), true); - } - else { - decoder.setInput(result.getString(2)); - } - diff = 
decoder.decode(); - - try { - currentRevision = diff.buildRevision(previousRevision); - } - catch (Exception e) { - this.previousRevision = null; - logger.error("Reconstruction failed -" - + " [ArticleId " + result.getInt(5) - + ", RevisionId " + result.getInt(4) - + ", RevisionCounter " + result.getInt(3) + "]"); - return null; - } - - previousRevision = currentRevision; - revision.setRevisionText(currentRevision); - } else { - if(revApi==null){ - revApi = new RevisionApi(config); - } - revision.setRevisionApi(revApi); - } - - revision.setRevisionID(result.getInt(4)); - revision.setArticleID(articleID); - revision.setTimeStamp(new Timestamp(result.getLong(6))); - revision.setFullRevisionID(result.getInt(7)); - revision.setContributorName(result.getString(8)); - revision.setContributorId(result.getInt(9)); - revision.setComment(result.getString(10)); - revision.setMinor(result.getBoolean(11)); - revision.setContributorIsRegistered(result.getBoolean(12)); - - return revision; - - } - catch (DecodingException | SQLException | IOException | WikiApiException e) { - throw new RuntimeException(e); - } - } - - /** - * Returns whether another revision is available or not. - */ - @Override - public boolean hasNext() - { - try { - if (result != null && result.next()) { - return true; - } - - // Close old queries - if (this.statement != null) { - this.statement.close(); - } - if (this.result != null) { - this.result.close(); - } - - if (primaryKey <= endPK) { // TODO: <= ? - return query(); - } - - return false; - - } - catch (SQLException e) { - throw new RuntimeException(e); - } - } - - /** - * This method is unsupported and will result in a {@link UnsupportedOperationException}. - * - * @deprecated - */ - @Override - @Deprecated - public void remove() - { - throw new UnsupportedOperationException(); - } - - - // TODO This should go into a demo or test class separated from the code here... 
- @Deprecated - public static void main(final String[] args) - throws Exception - { - - RevisionAPIConfiguration config = new RevisionAPIConfiguration(); - config.setHost("localhost"); - config.setDatabase("en_wiki"); - config.setUser("root"); - config.setPassword("1234"); - - config.setCharacterSet("UTF-8"); - config.setBufferSize(20000); - config.setMaxAllowedPacket(16 * 1024 * 1023); - - long count = 1; - long start = System.currentTimeMillis(); - - Revision rev; - Iterator<Revision> it = new RevisionIterator(config); - - System.out.println(Time.toClock(System.currentTimeMillis() - start)); - - while (it.hasNext()) { - rev = it.next(); - - if (count++ % 10000 == 0) { - - if (rev != null) { - System.out.println(rev); - } - } - } - - // w.close(); - System.out.println(Time.toClock(System.currentTimeMillis() - start)); - } +public class RevisionIterator extends AbstractRevisionService implements RevisionIteratorInterface { + + private static final Logger logger = LoggerFactory.getLogger(RevisionIterator.class); + + /** + * Reference to the ResultSet + */ + private ResultSet result; + + /** + * Reference to the Statement + */ + private PreparedStatement statement; + + /** + * Binary Data Flag + */ + private boolean binaryData; + + /** + * Text of the previous revision + */ + private String previousRevision; + + /** + * Current primary key + */ + private int primaryKey; + + /** + * Primary key indicating the end of the data + */ + private int endPK; + + /** + * ID of the current article + */ + private int currentArticleID; + + /** + * The last known revision counter + */ + private int currentRevCounter; + + /** + * Configuration parameter - indicates the maximum size of a querry. + */ + private final int MAX_NUMBER_RESULTS; + + /** + * Should load revision text? 
+ */ + private boolean shouldLoadRevisionText; + + /** + * The revisionapi for this iterator - used by the Revision object + * in case of lazy loading + */ + private RevisionApi revApi = null; + + public boolean shouldLoadRevisionText() { + return shouldLoadRevisionText; + } + + public void setShouldLoadRevisionText(boolean shouldLoadRevisionText) { + this.shouldLoadRevisionText = shouldLoadRevisionText; + } + + /** + * (Constructor) Creates a new RevisionIterator object. + * + * @param config Reference to the configuration object + * @param startPK Start index + * @param endPK End index + * @param connection Reference to the connection + * @throws WikiApiException if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config, + final int startPK, final int endPK, final Connection connection) + throws WikiApiException { + + if (startPK < 0 || endPK < 0 || startPK > endPK || connection == null) { + throw new IllegalArgumentException("Illegal argument"); + } + + this.primaryKey = startPK - 1; + this.endPK = endPK; + this.config = config; + + this.currentArticleID = -1; + this.currentRevCounter = -1; + + MAX_NUMBER_RESULTS = config.getBufferSize(); + + this.connection = connection; + } + + /** + * (Constructor) Creates a new RevisionIterator object. + * + * @param config Reference to the configuration object + * @param startPK Start index + * @throws WikiApiException if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config, final int startPK) + throws WikiApiException { + + this(config); + + if (startPK < 0) { + throw new IllegalArgumentException("Illegal argument"); + } + + this.primaryKey = startPK - 1; + } + + /** + * (Constructor) Creates a new RevisionIterator object. 
+ * + * @param config Reference to the configuration object + * @param startPK Start index + * @param endPK End index + * @throws WikiApiException if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config, + final int startPK, final int endPK) + throws WikiApiException { + + this(config, startPK); + + if (endPK < 0 || startPK > endPK) { + throw new IllegalArgumentException("Illegal argument"); + } + + this.endPK = endPK; + } + + /** + * (Constructor) Creates a new RevisionIterator object. + * + * @param config Reference to the configuration object + * @throws WikiApiException if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config) + throws WikiApiException { + + this.config = config; + this.primaryKey = -1; + this.endPK = Integer.MAX_VALUE; + + this.statement = null; + this.result = null; + this.previousRevision = null; + MAX_NUMBER_RESULTS = config.getBufferSize(); + + connection = getConnection(config); + } + + /** + * (Constructor) Creates a new RevisionIterator object. 
+ * + * @param config Reference to the configuration object + * @param shouldLoadRevisionText should load revision text + * @throws WikiApiException if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config, + boolean shouldLoadRevisionText) + throws WikiApiException { + this(config); + this.shouldLoadRevisionText = shouldLoadRevisionText; + } + + public RevisionIterator(final DatabaseConfiguration db) + throws WikiApiException { + this(getRevisionAPIConfig(db)); + } + + private static RevisionAPIConfiguration getRevisionAPIConfig( + final DatabaseConfiguration db) { + RevisionAPIConfiguration revAPIConfig = new RevisionAPIConfiguration(); + + revAPIConfig.setHost(db.getHost()); + revAPIConfig.setDatabase(db.getDatabase()); + revAPIConfig.setDatabaseDriver(db.getDatabaseDriver()); + revAPIConfig.setJdbcURL(db.getJdbcURL()); + revAPIConfig.setUser(db.getUser()); + revAPIConfig.setPassword(db.getPassword()); + revAPIConfig.setLanguage(db.getLanguage()); + + return revAPIConfig; + } + + /** + * Sends the query to the database and stores the result. The {@link java.sql.Statement} and + * {@link ResultSet} connection will not be closed. + * + * @return {@code true}, if the result set has another element {@code false}, otherwise + * @throws SQLException if an error occurs while accessing the database. + */ + private boolean query() + throws SQLException { + String query = "SELECT PrimaryKey, Revision, RevisionCounter," + + " RevisionID, ArticleID, Timestamp, FullRevisionID, ContributorName, ContributorId, Comment, Minor, ContributorIsRegistered " + + "FROM revisions"; + + if (primaryKey > 0) { + query += " WHERE PrimaryKey > " + primaryKey; + } + + if (MAX_NUMBER_RESULTS > 0) { + query += " LIMIT "; + + if (primaryKey + MAX_NUMBER_RESULTS > endPK) { + query += (endPK - primaryKey + 1); // TODO: +1 ? 
+ } else { + query += MAX_NUMBER_RESULTS; + } + + } else if (endPK != Integer.MAX_VALUE) { + query += " LIMIT " + (endPK - primaryKey + 1); + } + + try { + statement = this.connection.prepareStatement(query); + result = statement.executeQuery(); + } catch (Exception e) { + logger.error(e.getLocalizedMessage(), e); + try { + boolean connectionReady = !connection.isClosed() && connection.isValid(5); + logger.debug("Connection ready: {}", connectionReady); + if (!connectionReady) { + connection = getConnection(config); + } + statement = this.connection.prepareStatement(query); + result = statement.executeQuery(query); + } catch (WikiApiException wae) { + logger.error(wae.getLocalizedMessage(), wae); + } + } + + + if (result.next()) { + binaryData = result.getMetaData().getColumnType(2) == Types.LONGVARBINARY; + return true; + } + + return false; + } + + /** + * Returns the next revision. + * + * @return next revision + */ + @Override + public Revision next() { + try { + + int revCount, articleID; + + revCount = result.getInt(3); + articleID = result.getInt(5); + + if (articleID != this.currentArticleID) { + this.currentRevCounter = 0; + this.currentArticleID = articleID; + } + + if (revCount - 1 != this.currentRevCounter) { + + logger.error("Invalid RevCounter -" + " [ArticleId " + + articleID + ", RevisionId " + result.getInt(4) + + ", RevisionCounter " + revCount + "] - Expected: " + + (this.currentRevCounter + 1)); + + this.currentRevCounter = revCount; + this.previousRevision = null; + + return null; + } + + this.currentRevCounter = revCount; + this.primaryKey = result.getInt(1); + + Revision revision = new Revision(revCount); + revision.setPrimaryKey(this.primaryKey); + if (!shouldLoadRevisionText) { + String currentRevision; + + Diff diff; + RevisionDecoder decoder = new RevisionDecoder( + config.getCharacterSet()); + + if (binaryData) { + decoder.setInput(result.getBinaryStream(2), true); + } else { + decoder.setInput(result.getString(2)); + } + diff = 
decoder.decode(); + + try { + currentRevision = diff.buildRevision(previousRevision); + } catch (Exception e) { + this.previousRevision = null; + logger.error("Reconstruction failed -" + + " [ArticleId " + result.getInt(5) + + ", RevisionId " + result.getInt(4) + + ", RevisionCounter " + result.getInt(3) + "]"); + return null; + } + + previousRevision = currentRevision; + revision.setRevisionText(currentRevision); + } else { + if (revApi == null) { + revApi = new RevisionApi(config); + } + revision.setRevisionApi(revApi); + } + + revision.setRevisionID(result.getInt(4)); + revision.setArticleID(articleID); + revision.setTimeStamp(new Timestamp(result.getLong(6))); + revision.setFullRevisionID(result.getInt(7)); + revision.setContributorName(result.getString(8)); + revision.setContributorId(result.getInt(9)); + revision.setComment(result.getString(10)); + revision.setMinor(result.getBoolean(11)); + revision.setContributorIsRegistered(result.getBoolean(12)); + + return revision; + + } catch (DecodingException | SQLException | IOException | WikiApiException e) { + throw new RuntimeException(e); + } + } + + /** + * Returns whether another revision is available or not. + */ + @Override + public boolean hasNext() { + try { + if (result != null && result.next()) { + return true; + } + + // Close old queries + if (this.statement != null) { + this.statement.close(); + } + if (this.result != null) { + this.result.close(); + } + + if (primaryKey <= endPK) { // TODO: <= ? + return query(); + } + + return false; + + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + /** + * This method is unsupported and will result in a {@link UnsupportedOperationException}. + * + * @deprecated + */ + @Override + @Deprecated + public void remove() { + throw new UnsupportedOperationException(); + } + + + // TODO This should go into a demo or test class separated from the code here... 
+ @Deprecated + public static void main(final String[] args) + throws Exception { + + RevisionAPIConfiguration config = new RevisionAPIConfiguration(); + config.setHost("localhost"); + config.setDatabase("en_wiki"); + config.setUser("root"); + config.setPassword("1234"); + + config.setCharacterSet("UTF-8"); + config.setBufferSize(20000); + config.setMaxAllowedPacket(16 * 1024 * 1023); + + long count = 1; + long start = System.currentTimeMillis(); + + Revision rev; + Iterator<Revision> it = new RevisionIterator(config); + + System.out.println(Time.toClock(System.currentTimeMillis() - start)); + + while (it.hasNext()) { + rev = it.next(); + + if (count++ % 10000 == 0) { + + if (rev != null) { + System.out.println(rev); + } + } + } + + // w.close(); + System.out.println(Time.toClock(System.currentTimeMillis() - start)); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIteratorInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIteratorInterface.java index 492b7481..dd7cdcd5 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIteratorInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIteratorInterface.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,20 +27,16 @@ * <p> * Since the {@link IOException} does not have inner exception in JAVA 1.5 the close * method has to throw both exception for both input components. - * */ public interface RevisionIteratorInterface - extends Iterator<Revision> -{ + extends Iterator<Revision> { - /** - * Closes the reader or connection to the input component. - * - * @throws IOException - * if an error occurs while reading from the input archive. - * @throws SQLException - * if an error occurs while accessing the sql database. - */ - void close() - throws IOException, SQLException; + /** + * Closes the reader or connection to the input component. + * + * @throws IOException if an error occurs while reading from the input archive. + * @throws SQLException if an error occurs while accessing the sql database. + */ + void close() + throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoFullRevision.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoFullRevision.java index 80e37a65..2bc68e97 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoFullRevision.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoFullRevision.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,342 +24,327 @@ /** * ChronoFullRevision - * - * + * <p> + * <p> * 1 */ -public class ChronoFullRevision -{ - - /** PrimaryKey of the full revision */ - private final int fullRevisionPK; - - /** First revision counter / revision counter of the full revision */ - private final int startRC; - - /** Last revision counter based on the full revision */ - private final int endRC; - - /** Reference to the chrono storage block */ - private ChronoStorageBlock first; - - /** Set containing the IDs of revisions that could be reconstructed */ - private final Set<Integer> set; - - /** Link to the next full revision */ - private ChronoFullRevision next; - - /** Link to the previous full revision */ - private ChronoFullRevision prev; - - /** Number of bytes contained in this object */ - private long size; - - /** - * (Constructor) Creates a new ChronoFullRevision object. - * - * @param fullRevisionPK - * primary key of a full revision - * @param startRC - * revision counter of the full revision - * @param endRC - * last revision counter based on the full revision - */ - public ChronoFullRevision(final int fullRevisionPK, final int startRC, - final int endRC) - { - - this.fullRevisionPK = fullRevisionPK; - this.startRC = startRC; - this.endRC = endRC; - - this.size = 0; - - this.set = new HashSet<>(); - for (int i = startRC; i <= endRC; i++) { - this.set.add(i); - } - } - - /** - * Returns the reference to the ChronoStorageBlock. 
- * - * @return chrono storage block - */ - public ChronoStorageBlock getFirst() - { - return this.first; - } - - /** - * Sets the reference of the ChronoStorageBlock. - * - * @param block - * chrono storage block - */ - public void setFirst(final ChronoStorageBlock block) - { - this.first = block; - } - - /** - * Adds a ChonoStorageBlock to this chrono full revision object. - * - * @param block - * reference to the chrono storage block - */ - public void add(final ChronoStorageBlock block) - { - - int revCount = block.getRevisionCounter(); - this.size += block.length(); - - if (first == null) { - first = block; - } - else { - - ChronoStorageBlock previous = null, current = first; - do { - if (revCount < current.getRevisionCounter()) { - - block.setCounterPrev(previous); - block.setCounterNext(current); - - if (previous != null) { - previous.setCounterNext(block); - } - - current.setCounterPrev(block); - - if (current == this.first) { - this.first = block; - } - - return; - } - - previous = current; - current = current.getCounterNext(); - - } - while (current != null); - - // Add to end of list - previous.setCounterNext(block); - block.setCounterPrev(previous); - } - } - - /** - * Returns the nearest available revision to the specified revision counter. - * - * @param revisionCounter - * revision counter - * @return Revision - */ - public Revision getNearest(final int revisionCounter) - { - - if (first != null) { - - ChronoStorageBlock previous = null, current = first; - while (current != null - && current.getRevisionCounter() <= revisionCounter) { - previous = current; - current = current.getCounterNext(); - } - - return previous.getRev(); - } - - return null; - } - - /** - * Removes the revision counter from the list of reconstructible revisions. 
- * - * @param revisionCounter - * revision counter - */ - public void remove(final int revisionCounter) - { - this.set.remove(revisionCounter); - if (this.set.isEmpty()) { - clean(0, 0); - } - } - - /** - * Returns whether more revisions can be reconstructed by the use of this - * chrono full revision. - * - * @return TRUE | FALSE - */ - public boolean isEmpty() - { - return this.set.isEmpty(); - } - - /** - * Returns the next chrono full revision. - * - * @return next chrono full revision - */ - public ChronoFullRevision getNext() - { - return next; - } - - /** - * Sets the link to the next chrono full revision. - * - * @param next - * next chrono full revision - */ - public void setNext(final ChronoFullRevision next) - { - this.next = next; - } - - /** - * Returns the previous chrono full revision. - * - * @return previous chrono full revision - */ - public ChronoFullRevision getPrev() - { - return prev; - } - - /** - * Sets the link to the previous chrono full revision. - * - * @param prev - * previous chrono full revision - */ - public void setPrev(final ChronoFullRevision prev) - { - this.prev = prev; - } - - /** - * Reduces the storage space. 
- * - * @param currentRevisionIndex - * index of the current revision - * @param revisionIndex - * index of the revision - * @return size of used storage - */ - public long clean(final int currentRevisionIndex, final int revisionIndex) - { - - if (first == null) { - return 0; - } - else if (this.set.isEmpty()) { - this.first = null; - this.size = 0; - return 0; - } - - ChronoStorageBlock next, prev, current = first; - boolean remove; - - do { - remove = false; - - if (current.isDelivered()) { - - next = current.getCounterNext(); - - if (next != null) { - if (current.getRevisionCounter() + 1 == next - .getRevisionCounter()) { - remove = true; - } - } - - } - else if (current.getIndexNext() == null - && current.getIndexPrev() == null) { - - remove = (current.getRevisionIndex() < currentRevisionIndex) - || (current.getRevisionIndex() == revisionIndex); - } - - if (remove) { - // System.out.println("Clearn CFR : " + - // current.getRevisionCounter()); - - prev = current.getCounterPrev(); - next = current.getCounterNext(); - - current.setCounterNext(null); - current.setCounterPrev(null); - - if (prev != null) { - prev.setCounterNext(next); - } - if (next != null) { - next.setCounterPrev(prev); - } - if (current == first) { - this.first = next; - } - - this.size -= current.length(); - current = next; - } - - if (current != null) { - current = current.getCounterNext(); - } - - } - while (current != null); - - return this.size; - } - - /** - * Returns the size of this chrono full revision. - * - * @return size - */ - public long size() - { - return this.size; - } - - /** - * Returns the last revision counter based on this full revision. - * - * @return last revision counter - */ - public int getEndRC() - { - return endRC; - } - - /** - * Returns the pk of the full revision. - * - * @return pk of the full revision - */ - public int getFullRevisionPK() - { - return fullRevisionPK; - } - - /** - * Returns the revision counter of the full revision. 
- * - * @return first revision counter - */ - public int getStartRC() - { - return startRC; - } +public class ChronoFullRevision { + + /** + * PrimaryKey of the full revision + */ + private final int fullRevisionPK; + + /** + * First revision counter / revision counter of the full revision + */ + private final int startRC; + + /** + * Last revision counter based on the full revision + */ + private final int endRC; + + /** + * Reference to the chrono storage block + */ + private ChronoStorageBlock first; + + /** + * Set containing the IDs of revisions that could be reconstructed + */ + private final Set<Integer> set; + + /** + * Link to the next full revision + */ + private ChronoFullRevision next; + + /** + * Link to the previous full revision + */ + private ChronoFullRevision prev; + + /** + * Number of bytes contained in this object + */ + private long size; + + /** + * (Constructor) Creates a new ChronoFullRevision object. + * + * @param fullRevisionPK primary key of a full revision + * @param startRC revision counter of the full revision + * @param endRC last revision counter based on the full revision + */ + public ChronoFullRevision(final int fullRevisionPK, final int startRC, + final int endRC) { + + this.fullRevisionPK = fullRevisionPK; + this.startRC = startRC; + this.endRC = endRC; + + this.size = 0; + + this.set = new HashSet<>(); + for (int i = startRC; i <= endRC; i++) { + this.set.add(i); + } + } + + /** + * Returns the reference to the ChronoStorageBlock. + * + * @return chrono storage block + */ + public ChronoStorageBlock getFirst() { + return this.first; + } + + /** + * Sets the reference of the ChronoStorageBlock. + * + * @param block chrono storage block + */ + public void setFirst(final ChronoStorageBlock block) { + this.first = block; + } + + /** + * Adds a ChonoStorageBlock to this chrono full revision object. 
+ * + * @param block reference to the chrono storage block + */ + public void add(final ChronoStorageBlock block) { + + int revCount = block.getRevisionCounter(); + this.size += block.length(); + + if (first == null) { + first = block; + } else { + + ChronoStorageBlock previous = null, current = first; + do { + if (revCount < current.getRevisionCounter()) { + + block.setCounterPrev(previous); + block.setCounterNext(current); + + if (previous != null) { + previous.setCounterNext(block); + } + + current.setCounterPrev(block); + + if (current == this.first) { + this.first = block; + } + + return; + } + + previous = current; + current = current.getCounterNext(); + + } + while (current != null); + + // Add to end of list + previous.setCounterNext(block); + block.setCounterPrev(previous); + } + } + + /** + * Returns the nearest available revision to the specified revision counter. + * + * @param revisionCounter revision counter + * @return Revision + */ + public Revision getNearest(final int revisionCounter) { + + if (first != null) { + + ChronoStorageBlock previous = null, current = first; + while (current != null + && current.getRevisionCounter() <= revisionCounter) { + previous = current; + current = current.getCounterNext(); + } + + return previous.getRev(); + } + + return null; + } + + /** + * Removes the revision counter from the list of reconstructible revisions. + * + * @param revisionCounter revision counter + */ + public void remove(final int revisionCounter) { + this.set.remove(revisionCounter); + if (this.set.isEmpty()) { + clean(0, 0); + } + } + + /** + * Returns whether more revisions can be reconstructed by the use of this + * chrono full revision. + * + * @return TRUE | FALSE + */ + public boolean isEmpty() { + return this.set.isEmpty(); + } + + /** + * Returns the next chrono full revision. + * + * @return next chrono full revision + */ + public ChronoFullRevision getNext() { + return next; + } + + /** + * Sets the link to the next chrono full revision. 
+ * + * @param next next chrono full revision + */ + public void setNext(final ChronoFullRevision next) { + this.next = next; + } + + /** + * Returns the previous chrono full revision. + * + * @return previous chrono full revision + */ + public ChronoFullRevision getPrev() { + return prev; + } + + /** + * Sets the link to the previous chrono full revision. + * + * @param prev previous chrono full revision + */ + public void setPrev(final ChronoFullRevision prev) { + this.prev = prev; + } + + /** + * Reduces the storage space. + * + * @param currentRevisionIndex index of the current revision + * @param revisionIndex index of the revision + * @return size of used storage + */ + public long clean(final int currentRevisionIndex, final int revisionIndex) { + + if (first == null) { + return 0; + } else if (this.set.isEmpty()) { + this.first = null; + this.size = 0; + return 0; + } + + ChronoStorageBlock next, prev, current = first; + boolean remove; + + do { + remove = false; + + if (current.isDelivered()) { + + next = current.getCounterNext(); + + if (next != null) { + if (current.getRevisionCounter() + 1 == next + .getRevisionCounter()) { + remove = true; + } + } + + } else if (current.getIndexNext() == null + && current.getIndexPrev() == null) { + + remove = (current.getRevisionIndex() < currentRevisionIndex) + || (current.getRevisionIndex() == revisionIndex); + } + + if (remove) { + // System.out.println("Clearn CFR : " + + // current.getRevisionCounter()); + + prev = current.getCounterPrev(); + next = current.getCounterNext(); + + current.setCounterNext(null); + current.setCounterPrev(null); + + if (prev != null) { + prev.setCounterNext(next); + } + if (next != null) { + next.setCounterPrev(prev); + } + if (current == first) { + this.first = next; + } + + this.size -= current.length(); + current = next; + } + + if (current != null) { + current = current.getCounterNext(); + } + + } + while (current != null); + + return this.size; + } + + /** + * Returns the size of 
this chrono full revision. + * + * @return size + */ + public long size() { + return this.size; + } + + /** + * Returns the last revision counter based on this full revision. + * + * @return last revision counter + */ + public int getEndRC() { + return endRC; + } + + /** + * Returns the pk of the full revision. + * + * @return pk of the full revision + */ + public int getFullRevisionPK() { + return fullRevisionPK; + } + + /** + * Returns the revision counter of the full revision. + * + * @return first revision counter + */ + public int getStartRC() { + return startRC; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoIterator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoIterator.java index 5cb0142d..178c0d64 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoIterator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoIterator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -33,225 +33,230 @@ /** * ChronoIterator Iterates articles in chronological order. 
* <p> - * + * <p> * 1 */ -public class ChronoIterator -{ +public class ChronoIterator { + + /** + * Reference to the configuration + */ + private final RevisionAPIConfiguration config; + + /** + * Reference to the database connection + */ + private final Connection connection; + + /** + * Reference to the ChronoStorage + */ + private final ChronoStorage chronoStorage; + + /** + * currently used article pk + */ + private final int articlePK; + + /** + * revision index + */ + private int revisionIndex; + + /** + * maximum revision + */ + private final int maxRevision; + + /** + * ChronoFullRevision Storage + */ + private final Map<Integer, ChronoFullRevision> fullRevStorage; + + /** + * Mapping chronological position to revision counter + */ + private final Map<Integer, Integer> mappingStorage; + + /** + * (Constructor) Create a ChronoIterator object + * + * @param config reference to the configuration + * @param connection reference to the database connection + * @param mapping mapping (chrono counter to revision counter) + * @param fullRevisionPKs space separated list of full revision pks + * @param revisionCounters space separated list of revision counter intervals + */ + public ChronoIterator(final RevisionAPIConfiguration config, + final Connection connection, final String mapping, + final String fullRevisionPKs, final String revisionCounters) { + + this.config = config; + this.connection = connection; + + int index = fullRevisionPKs.indexOf(' '); + if (index == -1) { + index = fullRevisionPKs.length(); + } + + articlePK = Integer.parseInt(fullRevisionPKs.substring(0, index)); + + index = revisionCounters.lastIndexOf(' '); + if (index == -1) { + throw new RuntimeException("Invalid revisioncounter content"); + } + + this.revisionIndex = 0; + this.maxRevision = Integer.parseInt(revisionCounters.substring( + index + 1, revisionCounters.length())); + + Map<Integer, Integer> reverseMappingStorage = new HashMap<>(); + + this.mappingStorage = new HashMap<>(); + 
this.fullRevStorage = new HashMap<>(); + + ChronoFullRevision previous = null, current, firstCFR = null; + + int length; + int revC, mapC; + + int max = mapping.length(); + length = 0; + + // Creates the mapping information for each revision + while (length < max) { + + // Read revisionCounter + index = mapping.indexOf(' ', length); + revC = Integer.parseInt(mapping.substring(length, index)); + length = index + 1; + + // Read mappedCounter + index = mapping.indexOf(' ', length); + if (index == -1) { + index = mapping.length(); + } + mapC = Integer.parseInt(mapping.substring(length, index)); + length = index + 1; + + reverseMappingStorage.put(revC, mapC); + mappingStorage.put(mapC, revC); + } + + length = 0; + max = revisionCounters.length(); + int fullRevPK, lengthFR = 0; + + // Creates the full revision blocks for each full revision + while (length < max) { + + // Read fullRevisionPK (as string) + index = fullRevisionPKs.indexOf(' ', lengthFR); + if (index == -1) { + index = fullRevisionPKs.length(); + } + + fullRevPK = Integer.parseInt(fullRevisionPKs.substring(lengthFR, + index)); + lengthFR = index + 1; + + // Read start RC + index = revisionCounters.indexOf(' ', length); + revC = Integer.parseInt(revisionCounters.substring(length, index)); + length = index + 1; + + // Read end RC + index = revisionCounters.indexOf(' ', length); + if (index == -1) { + index = revisionCounters.length(); + } + mapC = Integer.parseInt(revisionCounters.substring(length, index)); + length = index + 1; + + // Constructs a double linked list containing the full revision + current = new ChronoFullRevision(fullRevPK, revC, mapC); + if (firstCFR == null) { + firstCFR = current; + } else { + current.setPrev(previous); + previous.setNext(current); + } + + // Add index information for each revision contained in such + // a block + for (int i = revC; i <= mapC; i++) { + fullRevStorage.put(i, current); + } - /** Reference to the configuration */ - private final RevisionAPIConfiguration 
config; + previous = current; + } - /** Reference to the database connection */ - private final Connection connection; - - /** Reference to the ChronoStorage */ - private final ChronoStorage chronoStorage; - - /** currently used article pk */ - private final int articlePK; + // Create ChronoStorage object + this.chronoStorage = new ChronoStorage(config, reverseMappingStorage, + firstCFR, fullRevStorage); + } + + /** + * Returns if all revision have retrieved. + * + * @return + */ + public boolean hasNext() { + return ++revisionIndex <= maxRevision; + } + + /** + * Returns the next revision. + * + * @return next revision + */ + public Revision next() + throws Exception { + + // Checks whether the next revision has already been reconstructed. + Revision revision; + if (chronoStorage.isTop(revisionIndex)) { + + // If this is the case the revision will removed from the storage + return chronoStorage.remove(); + } - /** revision index */ - private int revisionIndex; - - /** maximum revision */ - private final int maxRevision; - - /** ChronoFullRevision Storage */ - private final Map<Integer, ChronoFullRevision> fullRevStorage; - - /** Mapping chronological position to revision counter */ - private final Map<Integer, Integer> mappingStorage; - - /** - * (Constructor) Create a ChronoIterator object - * - * @param config - * reference to the configuration - * @param connection - * reference to the database connection - * @param mapping - * mapping (chrono counter to revision counter) - * @param fullRevisionPKs - * space separated list of full revision pks - * @param revisionCounters - * space separated list of revision counter intervals - */ - public ChronoIterator(final RevisionAPIConfiguration config, - final Connection connection, final String mapping, - final String fullRevisionPKs, final String revisionCounters) - { - - this.config = config; - this.connection = connection; - - int index = fullRevisionPKs.indexOf(' '); - if (index == -1) { - index = 
fullRevisionPKs.length(); - } - - articlePK = Integer.parseInt(fullRevisionPKs.substring(0, index)); - - index = revisionCounters.lastIndexOf(' '); - if (index == -1) { - throw new RuntimeException("Invalid revisioncounter content"); - } - - this.revisionIndex = 0; - this.maxRevision = Integer.parseInt(revisionCounters.substring( - index + 1, revisionCounters.length())); - - Map<Integer, Integer> reverseMappingStorage = new HashMap<>(); - - this.mappingStorage = new HashMap<>(); - this.fullRevStorage = new HashMap<>(); - - ChronoFullRevision previous = null, current, firstCFR = null; - - int length; - int revC, mapC; - - int max = mapping.length(); - length = 0; - - // Creates the mapping information for each revision - while (length < max) { - - // Read revisionCounter - index = mapping.indexOf(' ', length); - revC = Integer.parseInt(mapping.substring(length, index)); - length = index + 1; - - // Read mappedCounter - index = mapping.indexOf(' ', length); - if (index == -1) { - index = mapping.length(); - } - mapC = Integer.parseInt(mapping.substring(length, index)); - length = index + 1; - - reverseMappingStorage.put(revC, mapC); - mappingStorage.put(mapC, revC); - } - - length = 0; - max = revisionCounters.length(); - int fullRevPK, lengthFR = 0; - - // Creates the full revision blocks for each full revision - while (length < max) { - - // Read fullRevisionPK (as string) - index = fullRevisionPKs.indexOf(' ', lengthFR); - if (index == -1) { - index = fullRevisionPKs.length(); - } - - fullRevPK = Integer.parseInt(fullRevisionPKs.substring(lengthFR, - index)); - lengthFR = index + 1; - - // Read start RC - index = revisionCounters.indexOf(' ', length); - revC = Integer.parseInt(revisionCounters.substring(length, index)); - length = index + 1; - - // Read end RC - index = revisionCounters.indexOf(' ', length); - if (index == -1) { - index = revisionCounters.length(); - } - mapC = Integer.parseInt(revisionCounters.substring(length, index)); - length = index + 1; - - 
// Constructs a double linked list containing the full revision - current = new ChronoFullRevision(fullRevPK, revC, mapC); - if (firstCFR == null) { - firstCFR = current; - } - else { - current.setPrev(previous); - previous.setNext(current); - } - - // Add index information for each revision contained in such - // a block - for (int i = revC; i <= mapC; i++) { - fullRevStorage.put(i, current); - } - - previous = current; - } - - // Create ChronoStorage object - this.chronoStorage = new ChronoStorage(config, reverseMappingStorage, - firstCFR, fullRevStorage); - } - - /** - * Returns if all revision have retrieved. - * - * @return - */ - public boolean hasNext() - { - return ++revisionIndex <= maxRevision; - } - - /** - * Returns the next revision. - * - * @return next revision - */ - public Revision next() - throws Exception - { - - // Checks whether the next revision has already been reconstructed. - Revision revision; - if (chronoStorage.isTop(revisionIndex)) { - - // If this is the case the revision will removed from the storage - return chronoStorage.remove(); - } - - // Otherwise the chronological order counter will be mapped to the - // revsision counter - int revCount = revisionIndex; - if (mappingStorage.containsKey(revisionIndex)) { - revCount = mappingStorage.get(revisionIndex); - } - - // Retrieve the related full revision block - ChronoFullRevision cfr = fullRevStorage.get(revCount); - - int queryPK, limit, previousRevisionCounter; - String previousRevision; - - // Determine the nearest revision that could be used to construct - // the specified revision - revision = cfr.getNearest(revCount); - if (revision == null) { - - // Create query bounds (all revisions from the full revision till - // now) - queryPK = articlePK + cfr.getStartRC() - 1; - limit = revCount - cfr.getStartRC() + 1; - - previousRevision = null; - previousRevisionCounter = -1; - - } - else { - - // Create query bounds (only new revisions, last known + 1 till now) - queryPK = 
revision.getPrimaryKey() + 1; - limit = revCount - revision.getRevisionCounter(); - - previousRevision = revision.getRevisionText(); - previousRevisionCounter = revision.getRevisionCounter(); - - } + // Otherwise the chronological order counter will be mapped to the + // revsision counter + int revCount = revisionIndex; + if (mappingStorage.containsKey(revisionIndex)) { + revCount = mappingStorage.get(revisionIndex); + } + + // Retrieve the related full revision block + ChronoFullRevision cfr = fullRevStorage.get(revCount); + + int queryPK, limit, previousRevisionCounter; + String previousRevision; + + // Determine the nearest revision that could be used to construct + // the specified revision + revision = cfr.getNearest(revCount); + if (revision == null) { + + // Create query bounds (all revisions from the full revision till + // now) + queryPK = articlePK + cfr.getStartRC() - 1; + limit = revCount - cfr.getStartRC() + 1; + + previousRevision = null; + previousRevisionCounter = -1; + + } else { + + // Create query bounds (only new revisions, last known + 1 till now) + queryPK = revision.getPrimaryKey() + 1; + limit = revCount - revision.getRevisionCounter(); + + previousRevision = revision.getRevisionText(); + previousRevisionCounter = revision.getRevisionCounter(); + + } revision = null; @@ -329,7 +334,7 @@ public Revision next() + ", RevisionId " + result.getInt(4) + ", RevisionCounter " + result.getInt(3) + "]"); - return null; + return null; } // Add the reconstructed revision to the storage @@ -349,15 +354,14 @@ public Revision next() } } - } - - /** - * Returns the storage size description. - * - * @return storage size description - */ - public String getStorageSize() - { - return chronoStorage.getStorageSize(); - } + } + + /** + * Returns the storage size description. 
+ * + * @return storage size description + */ + public String getStorageSize() { + return chronoStorage.getStorageSize(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorage.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorage.java index 7b7ef919..c4aff6e4 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorage.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorage.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,320 +25,318 @@ /** * This class represents the chrono storage. 
- * - * + * <p> + * <p> * 1 */ -public class ChronoStorage -{ - - /** Index of the currently used revision */ - private int revisionIndex; - - /** Reference to the first chrono storage block */ - private ChronoStorageBlock first; - - /** Reference to the last chrono storage block */ - private ChronoStorageBlock last; - - /** Map containing the chrono storage block and their index keys */ - private final Map<Integer, ChronoStorageBlock> storage; - - /** Reverse mapping */ - private final Map<Integer, Integer> mapping; - - /** - * Map containing reference to the chrono full revisions (Mapping of - * revision counter and their full revision blocks) - */ - private final Map<Integer, ChronoFullRevision> fullRevStorage; - - /** Reference to the first chrono full revision */ - private final ChronoFullRevision firstCFR; - - /** Size of the chrono storage */ - private long size; - - /** Configuration parameter - maximum size of this storage */ - private final long MAX_STORAGE_SIZE; - - /** - * (Constructor) Creates a ChronoStorage object - * - * @param config - * Reference to the configuration - * @param mapping - * Mapping information (revision counter -> chronological - * revision counter) - * @param firstCFR - * Head of the double linked list of full revisions blocks - * @param fullRevStorage - * Mapping of revision counter and their full revision blocks - */ - public ChronoStorage(final RevisionAPIConfiguration config, - final Map<Integer, Integer> mapping, - final ChronoFullRevision firstCFR, - final Map<Integer, ChronoFullRevision> fullRevStorage) - { - - this.revisionIndex = 0; - this.last = null; - this.first = null; - this.storage = new HashMap<>(); - - this.mapping = mapping; - this.fullRevStorage = fullRevStorage; - this.firstCFR = firstCFR; - - MAX_STORAGE_SIZE = config.getChronoStorageSpace(); - } - - /** - * Adds a revision to the chrono storage. 
- * - * @param rev - * reference to the revision - */ - public void add(final Revision rev) - { - - int revIndex = rev.getRevisionCounter(); - if (this.mapping.containsKey(revIndex)) { - revIndex = this.mapping.get(revIndex); - } - - // System.out.println("Store " + rev.getRevisionCounter() + " with " + - // revIndex); - - ChronoFullRevision cfr = this.fullRevStorage.get(rev - .getRevisionCounter()); - ChronoStorageBlock block = new ChronoStorageBlock(cfr, revIndex, rev); - cfr.add(block); - - if (revIndex < revisionIndex) { - // System.out.println("Revision has already been processed: " + - // revIndex); - block.setDelivered(true); - return; - } - - clean(); - - if (this.storage.containsKey(revIndex)) { - // throw new IllegalArgumentException(revisionIndex + - // "- Object already contained: " + revIndex); - return; - } - - storage.put(revIndex, block); - size += block.length(); - - if (first == null) { - first = block; - last = block; - } - else { - - ChronoStorageBlock previous = null, current = first; - do { - if (revIndex < current.getRevisionIndex()) { - - block.setIndexPrev(previous); - block.setIndexNext(current); - - if (previous != null) { - previous.setIndexNext(block); - } - current.setIndexPrev(block); - - if (current == first) { - this.first = block; - } - - return; - } - - previous = current; - current = current.getIndexNext(); - - } - while (current != null); - - // Add to end of list - previous.setIndexNext(block); - block.setIndexPrev(previous); - - this.last = block; - } - } - - /** - * Returns whether more chrono storage blocks are available. - * - * @return TRUE | FALSE - */ - public boolean hasMore() - { - return this.first != null; - } - - /** - * Removes a revision from the chrono storage. 
- * - * @return - */ - public Revision remove() - { - - ChronoStorageBlock block = first; - this.revisionIndex = block.getRevisionIndex(); - - ChronoStorageBlock next = block.getIndexNext(); - this.first = next; - - if (next != null) { - this.first.setIndexPrev(null); - } - else { - this.last = null; - } - - /* - * System.out.println("Deliver " + block.getRevisionIndex() + " RI|RC " - * + block.getRevisionCounter()); if (first != null) { - * System.out.println("OnTop: " + first.getRevisionIndex()); } - */ - block.setDelivered(true); - - // Remove from fullRevSet - ChronoFullRevision cfr = block.getChronoFullRevision(); - cfr.remove(block.getRevisionCounter()); - - if (storage.remove(block.getRevisionIndex()) == null) { - throw new RuntimeException("VALUE WAS NOT REMOVED FROM STORAGE"); - } - - // Subtract size - Revision rev = block.getRev(); - size -= rev.getRevisionText().length(); - return rev; - } - - /** - * Checks whether the specified chrono storage block is contained or not. - * - * @param revisionIndex - * chronological order index - * @return - */ - public boolean contains(final int revisionIndex) - { - return this.storage.containsKey(revisionIndex); - } - - /** - * Checks whether the chrono storage block is on top or not. - * - * @param revisionIndex - * chronological order index - * @return - */ - public boolean isTop(final int revisionIndex) - { - if (this.first != null) { - return this.first.getRevisionIndex() == revisionIndex; - } - - return false; - } - - /** - * Returns the revision of the specified chrono storage block. 
- * - * @param revisionIndex - * chronological order index - * @return - */ - public Revision get(final int revisionIndex) - { - if (this.storage.containsKey(revisionIndex)) { - - ChronoStorageBlock block = this.storage.get(revisionIndex); - return block.getRev(); - } - return null; - } - - /** Temporary variable - total size of the chrono storage */ - private long totalSize; - - /** - * Reduces the amount of used storage by discarding chrono storage blocks. - */ - public void clean() - { - - ChronoFullRevision cfr = firstCFR; - totalSize = size; - while (cfr != null) { - totalSize += cfr.size(); - cfr = cfr.getNext(); - } - - if (totalSize < MAX_STORAGE_SIZE) { - return; - } - - cfr = firstCFR; - while (cfr != null) { - totalSize += cfr.clean(revisionIndex, 0); - cfr = cfr.getNext(); - } - - ChronoStorageBlock block; - while (last != null && totalSize >= MAX_STORAGE_SIZE) { - - // System.out.println("CLEAN " + last.getRevisionIndex()); - - // Retrieve previous block - block = last.getIndexPrev(); - - // Subtract size - if (storage.remove(last.getRevisionIndex()) == null) { - throw new RuntimeException("VALUE WAS NOT REMOVED FROM STORAGE"); - } - totalSize -= last.length(); - size += last.length(); - - // Delete references - if (block != null) { - block.setIndexNext(null); - } - last.setIndexPrev(null); - - cfr = last.getChronoFullRevision(); - totalSize += cfr.size() - - cfr.clean(revisionIndex, last.getRevisionIndex()); - - if (last == first) { - first = null; - } - - // Set the new last - last = block; - } - - - } - - /** - * Returns a description of the chrono storage size. 
- * - * @return current revision index | storage size | size | total size - */ - public String getStorageSize() - { - return this.revisionIndex + " | " + this.storage.size() + " | " - + this.size + " | " + totalSize; - } +public class ChronoStorage { + + /** + * Index of the currently used revision + */ + private int revisionIndex; + + /** + * Reference to the first chrono storage block + */ + private ChronoStorageBlock first; + + /** + * Reference to the last chrono storage block + */ + private ChronoStorageBlock last; + + /** + * Map containing the chrono storage block and their index keys + */ + private final Map<Integer, ChronoStorageBlock> storage; + + /** + * Reverse mapping + */ + private final Map<Integer, Integer> mapping; + + /** + * Map containing reference to the chrono full revisions (Mapping of + * revision counter and their full revision blocks) + */ + private final Map<Integer, ChronoFullRevision> fullRevStorage; + + /** + * Reference to the first chrono full revision + */ + private final ChronoFullRevision firstCFR; + + /** + * Size of the chrono storage + */ + private long size; + + /** + * Configuration parameter - maximum size of this storage + */ + private final long MAX_STORAGE_SIZE; + + /** + * (Constructor) Creates a ChronoStorage object + * + * @param config Reference to the configuration + * @param mapping Mapping information (revision counter -> chronological + * revision counter) + * @param firstCFR Head of the double linked list of full revisions blocks + * @param fullRevStorage Mapping of revision counter and their full revision blocks + */ + public ChronoStorage(final RevisionAPIConfiguration config, + final Map<Integer, Integer> mapping, + final ChronoFullRevision firstCFR, + final Map<Integer, ChronoFullRevision> fullRevStorage) { + + this.revisionIndex = 0; + this.last = null; + this.first = null; + this.storage = new HashMap<>(); + + this.mapping = mapping; + this.fullRevStorage = fullRevStorage; + this.firstCFR = firstCFR; + + 
MAX_STORAGE_SIZE = config.getChronoStorageSpace(); + } + + /** + * Adds a revision to the chrono storage. + * + * @param rev reference to the revision + */ + public void add(final Revision rev) { + + int revIndex = rev.getRevisionCounter(); + if (this.mapping.containsKey(revIndex)) { + revIndex = this.mapping.get(revIndex); + } + + // System.out.println("Store " + rev.getRevisionCounter() + " with " + + // revIndex); + + ChronoFullRevision cfr = this.fullRevStorage.get(rev + .getRevisionCounter()); + ChronoStorageBlock block = new ChronoStorageBlock(cfr, revIndex, rev); + cfr.add(block); + + if (revIndex < revisionIndex) { + // System.out.println("Revision has already been processed: " + + // revIndex); + block.setDelivered(true); + return; + } + + clean(); + + if (this.storage.containsKey(revIndex)) { + // throw new IllegalArgumentException(revisionIndex + + // "- Object already contained: " + revIndex); + return; + } + + storage.put(revIndex, block); + size += block.length(); + + if (first == null) { + first = block; + last = block; + } else { + + ChronoStorageBlock previous = null, current = first; + do { + if (revIndex < current.getRevisionIndex()) { + + block.setIndexPrev(previous); + block.setIndexNext(current); + + if (previous != null) { + previous.setIndexNext(block); + } + current.setIndexPrev(block); + + if (current == first) { + this.first = block; + } + + return; + } + + previous = current; + current = current.getIndexNext(); + + } + while (current != null); + + // Add to end of list + previous.setIndexNext(block); + block.setIndexPrev(previous); + + this.last = block; + } + } + + /** + * Returns whether more chrono storage blocks are available. + * + * @return TRUE | FALSE + */ + public boolean hasMore() { + return this.first != null; + } + + /** + * Removes a revision from the chrono storage. 
+ * + * @return + */ + public Revision remove() { + + ChronoStorageBlock block = first; + this.revisionIndex = block.getRevisionIndex(); + + ChronoStorageBlock next = block.getIndexNext(); + this.first = next; + + if (next != null) { + this.first.setIndexPrev(null); + } else { + this.last = null; + } + + /* + * System.out.println("Deliver " + block.getRevisionIndex() + " RI|RC " + * + block.getRevisionCounter()); if (first != null) { + * System.out.println("OnTop: " + first.getRevisionIndex()); } + */ + block.setDelivered(true); + + // Remove from fullRevSet + ChronoFullRevision cfr = block.getChronoFullRevision(); + cfr.remove(block.getRevisionCounter()); + + if (storage.remove(block.getRevisionIndex()) == null) { + throw new RuntimeException("VALUE WAS NOT REMOVED FROM STORAGE"); + } + + // Subtract size + Revision rev = block.getRev(); + size -= rev.getRevisionText().length(); + return rev; + } + + /** + * Checks whether the specified chrono storage block is contained or not. + * + * @param revisionIndex chronological order index + * @return + */ + public boolean contains(final int revisionIndex) { + return this.storage.containsKey(revisionIndex); + } + + /** + * Checks whether the chrono storage block is on top or not. + * + * @param revisionIndex chronological order index + * @return + */ + public boolean isTop(final int revisionIndex) { + if (this.first != null) { + return this.first.getRevisionIndex() == revisionIndex; + } + + return false; + } + + /** + * Returns the revision of the specified chrono storage block. 
+ * + * @param revisionIndex chronological order index + * @return + */ + public Revision get(final int revisionIndex) { + if (this.storage.containsKey(revisionIndex)) { + + ChronoStorageBlock block = this.storage.get(revisionIndex); + return block.getRev(); + } + return null; + } + + /** + * Temporary variable - total size of the chrono storage + */ + private long totalSize; + + /** + * Reduces the amount of used storage by discarding chrono storage blocks. + */ + public void clean() { + + ChronoFullRevision cfr = firstCFR; + totalSize = size; + while (cfr != null) { + totalSize += cfr.size(); + cfr = cfr.getNext(); + } + + if (totalSize < MAX_STORAGE_SIZE) { + return; + } + + cfr = firstCFR; + while (cfr != null) { + totalSize += cfr.clean(revisionIndex, 0); + cfr = cfr.getNext(); + } + + ChronoStorageBlock block; + while (last != null && totalSize >= MAX_STORAGE_SIZE) { + + // System.out.println("CLEAN " + last.getRevisionIndex()); + + // Retrieve previous block + block = last.getIndexPrev(); + + // Subtract size + if (storage.remove(last.getRevisionIndex()) == null) { + throw new RuntimeException("VALUE WAS NOT REMOVED FROM STORAGE"); + } + totalSize -= last.length(); + size += last.length(); + + // Delete references + if (block != null) { + block.setIndexNext(null); + } + last.setIndexPrev(null); + + cfr = last.getChronoFullRevision(); + totalSize += cfr.size() + - cfr.clean(revisionIndex, last.getRevisionIndex()); + + if (last == first) { + first = null; + } + + // Set the new last + last = block; + } + + + } + + /** + * Returns a description of the chrono storage size. 
+ * + * @return current revision index | storage size | size | total size + */ + public String getStorageSize() { + return this.revisionIndex + " | " + this.storage.size() + " | " + + this.size + " | " + totalSize; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorageBlock.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorageBlock.java index 3e647d2e..f3d6d1bd 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorageBlock.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorageBlock.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -30,199 +30,190 @@ * - links to the previous and next counter block an counter reference describes * the normal order * <p> - * + * <p> * 1 */ -public class ChronoStorageBlock -{ - - /** Reference to the chrono full revision */ - private final ChronoFullRevision cfr; - - /** Index of the revision */ - private final int revisionIndex; - - /** Revision */ - private final Revision rev; - - /** Flag, indicating whether the revision was already returned or not */ - private boolean delivered; - - /** Reference to the previous index block */ - private ChronoStorageBlock indexPrev; - - /** Reference to the next index block */ - private ChronoStorageBlock indexNext; - - /** Reference to the previous counter block */ - private ChronoStorageBlock counterPrev; - - /** Reference to the next counter block */ - private ChronoStorageBlock counterNext; - - /** - * Returns the related chrono full revision. - * - * @return chrono full revision - */ - public ChronoFullRevision getChronoFullRevision() - { - return this.cfr; - } - - /** - * Returns the next counter block. - * - * @return next counter block - */ - public ChronoStorageBlock getCounterNext() - { - return counterNext; - } - - /** - * Sets the next counter block. - * - * @param counterNext - * next counter block - */ - public void setCounterNext(final ChronoStorageBlock counterNext) - { - this.counterNext = counterNext; - } - - /** - * Returns the previous counter block. - * - * @return previous counter block - */ - public ChronoStorageBlock getCounterPrev() - { - return counterPrev; - } - - /** - * Sets the previous counter block. - * - * @param counterPrev - * previous counter block - */ - public void setCounterPrev(final ChronoStorageBlock counterPrev) - { - this.counterPrev = counterPrev; - } - - /** - * Returns the next index block. - * - * @return next index block - */ - public ChronoStorageBlock getIndexNext() - { - return indexNext; - } - - /** - * Sets the next index block. 
- * - * @param indexNext - * next index block - */ - public void setIndexNext(final ChronoStorageBlock indexNext) - { - this.indexNext = indexNext; - } - - /** - * Returns the previous index block. - * - * @return previous index block - */ - public ChronoStorageBlock getIndexPrev() - { - return indexPrev; - } - - /** - * Sets the previous index block. - * - * @param indexPrev - * previous counter block - */ - public void setIndexPrev(final ChronoStorageBlock indexPrev) - { - this.indexPrev = indexPrev; - } - - /** - * (Constructor) Creates a new ChronoStorageBlock. - * - * @param cfr - * Reference to the chrono full revision - * @param revisionIndex - * Index of this revision - * @param rev - * Reference to the revision - */ - public ChronoStorageBlock(final ChronoFullRevision cfr, - final int revisionIndex, final Revision rev) - { - - this.cfr = cfr; - - this.revisionIndex = revisionIndex; - this.rev = rev; - this.delivered = false; - } - - public Revision getRev() - { - return rev; - } - - /** - * Returns whether this revision was already returned or not. - * - * @return flag - */ - public boolean isDelivered() - { - return delivered; - } - - /** - * Sets whether this revision was already returned or not. - * - * @param delivered - * flag - */ - public void setDelivered(final boolean delivered) - { - this.delivered = delivered; - } - - /** - * Returns the revision index. - * - * @return revision index - */ - public int getRevisionIndex() - { - return revisionIndex; - } - - /** - * Returns the revision counter. 
- * - * @return revision counter - */ - public int getRevisionCounter() - { - return this.rev.getRevisionCounter(); - } - - public int length() - { - return this.rev.getRevisionText().length(); - } +public class ChronoStorageBlock { + + /** + * Reference to the chrono full revision + */ + private final ChronoFullRevision cfr; + + /** + * Index of the revision + */ + private final int revisionIndex; + + /** + * Revision + */ + private final Revision rev; + + /** + * Flag, indicating whether the revision was already returned or not + */ + private boolean delivered; + + /** + * Reference to the previous index block + */ + private ChronoStorageBlock indexPrev; + + /** + * Reference to the next index block + */ + private ChronoStorageBlock indexNext; + + /** + * Reference to the previous counter block + */ + private ChronoStorageBlock counterPrev; + + /** + * Reference to the next counter block + */ + private ChronoStorageBlock counterNext; + + /** + * Returns the related chrono full revision. + * + * @return chrono full revision + */ + public ChronoFullRevision getChronoFullRevision() { + return this.cfr; + } + + /** + * Returns the next counter block. + * + * @return next counter block + */ + public ChronoStorageBlock getCounterNext() { + return counterNext; + } + + /** + * Sets the next counter block. + * + * @param counterNext next counter block + */ + public void setCounterNext(final ChronoStorageBlock counterNext) { + this.counterNext = counterNext; + } + + /** + * Returns the previous counter block. + * + * @return previous counter block + */ + public ChronoStorageBlock getCounterPrev() { + return counterPrev; + } + + /** + * Sets the previous counter block. + * + * @param counterPrev previous counter block + */ + public void setCounterPrev(final ChronoStorageBlock counterPrev) { + this.counterPrev = counterPrev; + } + + /** + * Returns the next index block. 
+ * + * @return next index block + */ + public ChronoStorageBlock getIndexNext() { + return indexNext; + } + + /** + * Sets the next index block. + * + * @param indexNext next index block + */ + public void setIndexNext(final ChronoStorageBlock indexNext) { + this.indexNext = indexNext; + } + + /** + * Returns the previous index block. + * + * @return previous index block + */ + public ChronoStorageBlock getIndexPrev() { + return indexPrev; + } + + /** + * Sets the previous index block. + * + * @param indexPrev previous counter block + */ + public void setIndexPrev(final ChronoStorageBlock indexPrev) { + this.indexPrev = indexPrev; + } + + /** + * (Constructor) Creates a new ChronoStorageBlock. + * + * @param cfr Reference to the chrono full revision + * @param revisionIndex Index of this revision + * @param rev Reference to the revision + */ + public ChronoStorageBlock(final ChronoFullRevision cfr, + final int revisionIndex, final Revision rev) { + + this.cfr = cfr; + + this.revisionIndex = revisionIndex; + this.rev = rev; + this.delivered = false; + } + + public Revision getRev() { + return rev; + } + + /** + * Returns whether this revision was already returned or not. + * + * @return flag + */ + public boolean isDelivered() { + return delivered; + } + + /** + * Sets whether this revision was already returned or not. + * + * @param delivered flag + */ + public void setDelivered(final boolean delivered) { + this.delivered = delivered; + } + + /** + * Returns the revision index. + * + * @return revision index + */ + public int getRevisionIndex() { + return revisionIndex; + } + + /** + * Returns the revision counter. 
+ * + * @return revision counter + */ + public int getRevisionCounter() { + return this.rev.getRevisionCounter(); + } + + public int length() { + return this.rev.getRevisionText().length(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java index e4a7e7a1..4b212920 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -33,163 +33,150 @@ /** * Class provides basic bzip2 compression/decompression functionality - * - * */ -public class Bzip2Archiver -{ - - // Size to write in memory while compressing (in bytes) - private static final int COMPRESSION_CACHE = 10000000; - - // Size to write in memory while decompressing (in bytes) - private static final int DECOMPRESSION_CACHE = 10000000; - - /** - * Creates bz2 archive file from file in path - * - * @param path - * to file to compress - */ - public void compress(String path) - { - try { - - File fileToArchive = new File(path); - - BufferedInputStream input = new BufferedInputStream(new FileInputStream(fileToArchive)); - - File archivedFile = new File(fileToArchive.getName() + ".bz2"); - archivedFile.createNewFile(); - - FileOutputStream fos = new FileOutputStream(archivedFile); - BufferedOutputStream bufStr = new BufferedOutputStream(fos); - // added bzip2 prefix - fos.write("BZ".getBytes()); - BZip2CompressorOutputStream bzip2 = new BZip2CompressorOutputStream(bufStr); - - while (input.available() > 0) { - int size = COMPRESSION_CACHE; - - if (input.available() < COMPRESSION_CACHE) { - size = input.available(); - } - byte[] bytes = new byte[size]; - - input.read(bytes); +public class Bzip2Archiver { + + // Size to write in memory while compressing (in bytes) + private static final int COMPRESSION_CACHE = 10000000; + + // Size to write in memory while decompressing (in bytes) + private static final int DECOMPRESSION_CACHE = 10000000; + + /** + * Creates bz2 archive file from file in path + * + * @param path to file to compress + */ + public void compress(String path) { + try { + + File fileToArchive = new File(path); + + BufferedInputStream input = new BufferedInputStream(new FileInputStream(fileToArchive)); + + File archivedFile = new File(fileToArchive.getName() + ".bz2"); + archivedFile.createNewFile(); + + FileOutputStream fos = new FileOutputStream(archivedFile); + BufferedOutputStream bufStr = new BufferedOutputStream(fos); 
+ // added bzip2 prefix + fos.write("BZ".getBytes()); + BZip2CompressorOutputStream bzip2 = new BZip2CompressorOutputStream(bufStr); + + while (input.available() > 0) { + int size = COMPRESSION_CACHE; + + if (input.available() < COMPRESSION_CACHE) { + size = input.available(); + } + byte[] bytes = new byte[size]; + + input.read(bytes); + + bzip2.write(bytes); + } + bzip2.close(); + bufStr.close(); + fos.close(); + input.close(); + + } catch (IOException e) { + e.printStackTrace(); + } + + } + + /** + * Creates stream for compression + * + * @param path path to file to compress + * @return compression stream + * @throws IOException + */ + public OutputStream getCompressionStream(String path) + throws IOException { + File archivedFile = new File(path); + + archivedFile.createNewFile(); + + FileOutputStream fos = new FileOutputStream(archivedFile); + + BufferedOutputStream bufStr = new BufferedOutputStream(fos); + // added bzip2 prefix + fos.write("BZ".getBytes()); + + BZip2CompressorOutputStream bzip2 = new BZip2CompressorOutputStream(bufStr); + return bzip2; + } + + /** + * Creates Stream for decompression + * + * @param path path to file to uncompress + * @param encoding ecoding to use + * @return decompression stream + * @throws IOException + */ + public InputStreamReader getDecompressionStream(String path, String encoding) + throws IOException { + File fileToUncompress = new File(path); + + BufferedInputStream fileStream = new BufferedInputStream(new FileInputStream(fileToUncompress)); + + // read bzip2 prefix: BZ + fileStream.read(); + fileStream.read(); + + BufferedInputStream bufferedStream = new BufferedInputStream(fileStream); + + BZip2CompressorInputStream input = new BZip2CompressorInputStream(bufferedStream); + + return new InputStreamReader(input, encoding); + + } + + /** + * Uncompress bz2 file + * + * @param path path to file to uncompress + * @throws IOException + */ + public void decompress(String path) + throws IOException { + File bzip2 = new 
File(path); + + // + File unarchived = new File(bzip2.getName().replace(".bz2", "")); + + unarchived.createNewFile(); + + BufferedInputStream inputStr = new BufferedInputStream(new FileInputStream(bzip2)); + + // read bzip2 prefix + inputStr.read(); + inputStr.read(); + + BufferedInputStream buffStr = new BufferedInputStream(inputStr); + + BZip2CompressorInputStream input = new BZip2CompressorInputStream(buffStr); + + FileOutputStream outStr = new FileOutputStream(unarchived); + + while (true) { + byte[] compressedBytes = new byte[DECOMPRESSION_CACHE]; + + int byteRead = input.read(compressedBytes); + + outStr.write(compressedBytes, 0, byteRead); + if (byteRead != DECOMPRESSION_CACHE) { + break; + } + } - bzip2.write(bytes); - } - bzip2.close(); - bufStr.close(); - fos.close(); - input.close(); - - } - catch (IOException e) { - e.printStackTrace(); - } - - } - - /** - * Creates stream for compression - * - * @param path - * path to file to compress - * @return compression stream - * @throws IOException - */ - public OutputStream getCompressionStream(String path) - throws IOException - { - File archivedFile = new File(path); - - archivedFile.createNewFile(); - - FileOutputStream fos = new FileOutputStream(archivedFile); - - BufferedOutputStream bufStr = new BufferedOutputStream(fos); - // added bzip2 prefix - fos.write("BZ".getBytes()); - - BZip2CompressorOutputStream bzip2 = new BZip2CompressorOutputStream(bufStr); - return bzip2; - } - - /** - * Creates Stream for decompression - * - * @param path - * path to file to uncompress - * @param encoding - * ecoding to use - * @return decompression stream - * @throws IOException - */ - public InputStreamReader getDecompressionStream(String path, String encoding) - throws IOException - { - File fileToUncompress = new File(path); - - BufferedInputStream fileStream = new BufferedInputStream(new FileInputStream(fileToUncompress)); - - // read bzip2 prefix: BZ - fileStream.read(); - fileStream.read(); - - BufferedInputStream 
bufferedStream = new BufferedInputStream(fileStream); - - BZip2CompressorInputStream input = new BZip2CompressorInputStream(bufferedStream); - - return new InputStreamReader(input, encoding); - - } - - /** - * Uncompress bz2 file - * - * @param path - * path to file to uncompress - * @throws IOException - */ - public void decompress(String path) - throws IOException - { - File bzip2 = new File(path); - - // - File unarchived = new File(bzip2.getName().replace(".bz2", "")); - - unarchived.createNewFile(); - - BufferedInputStream inputStr = new BufferedInputStream(new FileInputStream(bzip2)); - - // read bzip2 prefix - inputStr.read(); - inputStr.read(); - - BufferedInputStream buffStr = new BufferedInputStream(inputStr); - - BZip2CompressorInputStream input = new BZip2CompressorInputStream(buffStr); - - FileOutputStream outStr = new FileOutputStream(unarchived); - - while (true) { - byte[] compressedBytes = new byte[DECOMPRESSION_CACHE]; - - int byteRead = input.read(compressedBytes); - - outStr.write(compressedBytes, 0, byteRead); - if (byteRead != DECOMPRESSION_CACHE) { - break; - } - } - - input.close(); - buffStr.close(); - inputStr.close(); - outStr.close(); - } + input.close(); + buffStr.close(); + inputStr.close(); + outStr.close(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ArticleReaderException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ArticleReaderException.java index 4d16f226..69fd52a1 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ArticleReaderException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ArticleReaderException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,41 +23,33 @@ */ @SuppressWarnings("serial") public class ArticleReaderException - extends Exception -{ + extends Exception { - /** - * (Constructor) Creates a new ArticleReaderException. - * - * @param description - * message - */ - public ArticleReaderException(final String description) - { - super(description); - } + /** + * (Constructor) Creates a new ArticleReaderException. + * + * @param description message + */ + public ArticleReaderException(final String description) { + super(description); + } - /** - * (Constructor) Creates a new ArticleReaderException. - * - * @param e - * inner exception - */ - public ArticleReaderException(final Exception e) - { - super(e); - } + /** + * (Constructor) Creates a new ArticleReaderException. + * + * @param e inner exception + */ + public ArticleReaderException(final Exception e) { + super(e); + } - /** - * (Constructor) Creates a new ArticleReaderException. - * - * @param description - * message - * @param e - * inner exception - */ - public ArticleReaderException(final String description, final Exception e) - { - super(description, e); - } + /** + * (Constructor) Creates a new ArticleReaderException. 
+ * + * @param description message + * @param e inner exception + */ + public ArticleReaderException(final String description, final Exception e) { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ConfigurationException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ConfigurationException.java index 7fdeb121..73c34f0c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ConfigurationException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ConfigurationException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,74 +20,61 @@ /** * ConfigurationException Describes an exception that occurred while accessing * the configuration. - * - * - * */ @SuppressWarnings("serial") public class ConfigurationException - extends Exception -{ + extends Exception { - /** Reference to the error key */ - private ErrorKeys key; + /** + * Reference to the error key + */ + private ErrorKeys key; - /** - * (Constructor) Creates a new ConfigurationException. 
- * - * @param description - * message - */ - public ConfigurationException(final String description) - { - super(description); - } + /** + * (Constructor) Creates a new ConfigurationException. + * + * @param description message + */ + public ConfigurationException(final String description) { + super(description); + } - /** - * (Constructor) Creates a new ConfigurationException. - * - * @param e - * inner exception - */ - public ConfigurationException(final Exception e) - { - super(e); - } + /** + * (Constructor) Creates a new ConfigurationException. + * + * @param e inner exception + */ + public ConfigurationException(final Exception e) { + super(e); + } - /** - * (Constructor) Creates a new ConfigurationException. - * - * @param description - * message - * @param e - * inner exception - */ - public ConfigurationException(final String description, final Exception e) - { - super(description, e); - } + /** + * (Constructor) Creates a new ConfigurationException. + * + * @param description message + * @param e inner exception + */ + public ConfigurationException(final String description, final Exception e) { + super(description, e); + } - /** - * (Constructor) Creates a new ConfigurationException. - * - * @param key - * error key - * @param description - * message - */ - public ConfigurationException(final ErrorKeys key, final String description) - { - super(description); - this.key = key; - } + /** + * (Constructor) Creates a new ConfigurationException. + * + * @param key error key + * @param description message + */ + public ConfigurationException(final ErrorKeys key, final String description) { + super(description); + this.key = key; + } - /** - * Returns the error key. - * - * @return error key - */ - public ErrorKeys getKey() - { - return this.key; - } + /** + * Returns the error key. 
+ * + * @return error key + */ + public ErrorKeys getKey() { + return this.key; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DecodingException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DecodingException.java index 49150dda..980bd446 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DecodingException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DecodingException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,47 +20,36 @@ /** * DecodingException Describes an exception that occurred while decoding the * diff information. - * - * - * */ @SuppressWarnings("serial") public class DecodingException - extends Exception -{ + extends Exception { - /** - * (Constructor) Creates a new DecodingException. - * - * @param description - * message - */ - public DecodingException(final String description) - { - super(description); - } + /** + * (Constructor) Creates a new DecodingException. 
+ * + * @param description message + */ + public DecodingException(final String description) { + super(description); + } - /** - * (Constructor) Creates a new DecodingException. - * - * @param e - * inner exception - */ - public DecodingException(final Exception e) - { - super(e); - } + /** + * (Constructor) Creates a new DecodingException. + * + * @param e inner exception + */ + public DecodingException(final Exception e) { + super(e); + } - /** - * (Constructor) Creates a new DecodingException. - * - * @param description - * message - * @param e - * inner exception - */ - public DecodingException(final String description, final Exception e) - { - super(description, e); - } + /** + * (Constructor) Creates a new DecodingException. + * + * @param description message + * @param e inner exception + */ + public DecodingException(final String description, final Exception e) { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DiffException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DiffException.java index 792d62f1..66e363f8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DiffException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DiffException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,47 +20,36 @@ /** * DiffException Describes an exception that occurred while calculating the * diff. - * - * - * */ @SuppressWarnings("serial") public class DiffException - extends Exception -{ + extends Exception { - /** - * (Constructor) Creates a new DiffException. - * - * @param description - * message - */ - public DiffException(final String description) - { - super(description); - } + /** + * (Constructor) Creates a new DiffException. + * + * @param description message + */ + public DiffException(final String description) { + super(description); + } - /** - * (Constructor) Creates a new DiffException. - * - * @param e - * inner exception - */ - public DiffException(final Exception e) - { - super(e); - } + /** + * (Constructor) Creates a new DiffException. + * + * @param e inner exception + */ + public DiffException(final Exception e) { + super(e); + } - /** - * (Constructor) Creates a new DiffException. - * - * @param description - * message - * @param e - * inner exception - */ - public DiffException(final String description, final Exception e) - { - super(description, e); - } + /** + * (Constructor) Creates a new DiffException. 
+ * + * @param description message + * @param e inner exception + */ + public DiffException(final String description, final Exception e) { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/EncodingException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/EncodingException.java index ef548cca..e696bb41 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/EncodingException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/EncodingException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,47 +20,36 @@ /** * DecodingException Describes an exception that occurred while encoding the * diff information. - * - * - * */ @SuppressWarnings("serial") public class EncodingException - extends Exception -{ + extends Exception { - /** - * (Constructor) Creates a new EncodingException. - * - * @param description - * message - */ - public EncodingException(final String description) - { - super(description); - } + /** + * (Constructor) Creates a new EncodingException. 
+ * + * @param description message + */ + public EncodingException(final String description) { + super(description); + } - /** - * (Constructor) Creates a new EncodingException. - * - * @param e - * inner exception - */ - public EncodingException(final Exception e) - { - super(e); - } + /** + * (Constructor) Creates a new EncodingException. + * + * @param e inner exception + */ + public EncodingException(final Exception e) { + super(e); + } - /** - * (Constructor) Creates a new EncodingException. - * - * @param description - * message - * @param e - * inner exception - */ - public EncodingException(final String description, final Exception e) - { - super(description, e); - } + /** + * (Constructor) Creates a new EncodingException. + * + * @param description message + * @param e inner exception + */ + public EncodingException(final String description, final Exception e) { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorFactory.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorFactory.java index f7c56491..5a0ea675 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorFactory.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorFactory.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,339 +19,284 @@ /** * This utility class contains method two create exceptions. - * - * - * */ -public final class ErrorFactory -{ - - /** No object - Utility class */ - private ErrorFactory() - { - } - - /* - * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a RuntimeException object. - * - * @param errorId - * reference to the error identifier - * @return RuntimeException - */ - public static RuntimeException createRuntimeException( - final ErrorKeys errorId) - { - - return new RuntimeException(errorId.toString()); - } - - /* - * +ARTICLE+READER+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a ArticleReaderException object. - * - * @param errorId - * reference to the error identifier - * @return ArticleReaderException - */ - public static ArticleReaderException createArticleReaderException( - final ErrorKeys errorId) - { - - return new ArticleReaderException(errorId.toString()); - } - - /* - * +CONFIGURATION+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a ConfigurationException object. - * - * @param errorId - * reference to the error identifier - * @return ConfigurationException - */ - public static ConfigurationException createConfigurationException( - final ErrorKeys errorId) - { - - return new ConfigurationException(errorId.toString()); - } - - /** - * Creates a ConfigurationException object. 
- * - * @param errorId - * reference to the error identifier - * @param message - * additional error message - * @return ConfigurationException - */ - public static ConfigurationException createConfigurationException( - final ErrorKeys errorId, final String message) - { - - return new ConfigurationException(errorId.toString() + ":\r\n" - + message); - } - - /* - * +TIMEOUT+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a TimeoutException object. - * - * @param errorId - * reference to the error identifier - * @param sleepPeriod - * time value - * @return TimeoutException - */ - public static TimeoutException createTimeoutException( - final ErrorKeys errorId, final long sleepPeriod) - { - - return new TimeoutException(errorId.toString() + "\r\n" - + "Timeout after " + sleepPeriod + " miliseconds."); - } - - /* - * +LOGGING+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a LoggingException object. - * - * @param errorId - * reference to the error identifier - * @return LoggingException - */ - public static LoggingException createLoggingException( - final ErrorKeys errorId) - { - - return new LoggingException(errorId.toString()); - } - - /** - * Creates a LoggingException object. - * - * @param errorId - * reference to the error identifier - * @param e - * inner exception - * @return LoggingException - */ - public static LoggingException createLoggingException( - final ErrorKeys errorId, final Exception e) - { - - return new LoggingException(errorId.toString(), e); - } - - /* - * +DIFF+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a DiffException object. 
- * - * @param errorId - * reference to the error identifier - * @param message - * additional message - * @return DiffException - */ - public static DiffException createDiffException(final ErrorKeys errorId, - final String message) - { - - return new DiffException(errorId.toString() + ":\r\n" + message); - } - - /** - * Creates a DiffException object. - * - * @param errorId - * reference to the error identifier - * @param message - * additional message - * @param e - * inner exception - * @return DiffException - */ - public static DiffException createDiffException(final ErrorKeys errorId, - final String message, final Exception e) - { - - return new DiffException(errorId.toString() + ":\r\n" + message, e); - } - - /* - * +ENCODING+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates an EncodingException object. - * - * @param errorId - * reference to the error identifier - * @return EncodingException - */ - public static EncodingException createEncodingException( - final ErrorKeys errorId) - { - - return new EncodingException(errorId.toString()); - } - - /** - * Creates an EncodingException object. - * - * @param errorId - * reference to the error identifier - * @param message - * additional message - * @return EncodingException - */ - public static EncodingException createEncodingException( - final ErrorKeys errorId, final String message) - { - - return new EncodingException(errorId.toString() + ":\r\n" + message); - } - - /** - * Creates an EncodingException object. 
- * - * @param errorId - * reference to the error identifier - * @param message - * additional message - * @param e - * inner exception - * @return EncodingException - */ - public static EncodingException createEncodingException( - final ErrorKeys errorId, final String message, final Exception e) - { - - return new EncodingException(errorId.toString() + ":\r\n" + message, e); - } - - /* - * +DECODING+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a DecodingException object. - * - * @param errorId - * reference to the error identifier - * @return DecodingException - */ - public static DecodingException createDecodingException( - final ErrorKeys errorId) - { - - return new DecodingException(errorId.toString()); - } - - /** - * Creates a DecodingException object. - * - * @param errorId - * reference to the error identifier - * @param message - * additional message - * @return DecodingException - */ - public static DecodingException createDecodingException( - final ErrorKeys errorId, final String message) - { - - return new DecodingException(errorId.toString() + ":\r\n" + message); - } - - /** - * Creates a DecodingException object. - * - * @param errorId - * reference to the error identifier - * @param message - * additional message - * @param e - * inner exception - * @return DecodingException - */ - public static DecodingException createDecodingException( - final ErrorKeys errorId, final String message, final Exception e) - { - - return new DecodingException(errorId.toString() + ":\r\n" + message, e); - } - - /* - * +UNCOMPRESSED+CONSUMER+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++ - * + - */ - - /** - * Creates a SQLConsumerException object. 
- * - * @param errorId - * reference to the error identifier - * @param e - * inner exception - * @return SQLConsumerException - */ - public static SQLConsumerException createSQLConsumerException( - final ErrorKeys errorId, final Exception e) - { - - return new SQLConsumerException(errorId.toString(), e); - } - - /** - * Creates a SQLConsumerException object. - * - * @param errorId - * reference to the error identifier - * @param message - * additional message - * @return SQLConsumerException - */ - public static SQLConsumerException createSQLConsumerException( - final ErrorKeys errorId, final String message) - { - - return new SQLConsumerException(errorId.toString() + ":\r\n" + message); - } - - /** - * Creates a SQLConsumerException object. - * - * @param errorId - * reference to the error identifier - * @param message - * additional message - * @param e - * inner exception - * @return SQLConsumerException - */ - public static SQLConsumerException createSQLConsumerException( - final ErrorKeys errorId, final String message, final Exception e) - { - - return new SQLConsumerException(errorId.toString() + ":\r\n" + message, - e); - } +public final class ErrorFactory { + + /** + * No object - Utility class + */ + private ErrorFactory() { + } + + /* + * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a RuntimeException object. + * + * @param errorId reference to the error identifier + * @return RuntimeException + */ + public static RuntimeException createRuntimeException( + final ErrorKeys errorId) { + + return new RuntimeException(errorId.toString()); + } + + /* + * +ARTICLE+READER+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a ArticleReaderException object. 
+ * + * @param errorId reference to the error identifier + * @return ArticleReaderException + */ + public static ArticleReaderException createArticleReaderException( + final ErrorKeys errorId) { + + return new ArticleReaderException(errorId.toString()); + } + + /* + * +CONFIGURATION+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a ConfigurationException object. + * + * @param errorId reference to the error identifier + * @return ConfigurationException + */ + public static ConfigurationException createConfigurationException( + final ErrorKeys errorId) { + + return new ConfigurationException(errorId.toString()); + } + + /** + * Creates a ConfigurationException object. + * + * @param errorId reference to the error identifier + * @param message additional error message + * @return ConfigurationException + */ + public static ConfigurationException createConfigurationException( + final ErrorKeys errorId, final String message) { + + return new ConfigurationException(errorId.toString() + ":\r\n" + + message); + } + + /* + * +TIMEOUT+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a TimeoutException object. + * + * @param errorId reference to the error identifier + * @param sleepPeriod time value + * @return TimeoutException + */ + public static TimeoutException createTimeoutException( + final ErrorKeys errorId, final long sleepPeriod) { + + return new TimeoutException(errorId.toString() + "\r\n" + + "Timeout after " + sleepPeriod + " miliseconds."); + } + + /* + * +LOGGING+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a LoggingException object. + * + * @param errorId reference to the error identifier + * @return LoggingException + */ + public static LoggingException createLoggingException( + final ErrorKeys errorId) { + + return new LoggingException(errorId.toString()); + } + + /** + * Creates a LoggingException object. 
+ * + * @param errorId reference to the error identifier + * @param e inner exception + * @return LoggingException + */ + public static LoggingException createLoggingException( + final ErrorKeys errorId, final Exception e) { + + return new LoggingException(errorId.toString(), e); + } + + /* + * +DIFF+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a DiffException object. + * + * @param errorId reference to the error identifier + * @param message additional message + * @return DiffException + */ + public static DiffException createDiffException(final ErrorKeys errorId, + final String message) { + + return new DiffException(errorId.toString() + ":\r\n" + message); + } + + /** + * Creates a DiffException object. + * + * @param errorId reference to the error identifier + * @param message additional message + * @param e inner exception + * @return DiffException + */ + public static DiffException createDiffException(final ErrorKeys errorId, + final String message, final Exception e) { + + return new DiffException(errorId.toString() + ":\r\n" + message, e); + } + + /* + * +ENCODING+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates an EncodingException object. + * + * @param errorId reference to the error identifier + * @return EncodingException + */ + public static EncodingException createEncodingException( + final ErrorKeys errorId) { + + return new EncodingException(errorId.toString()); + } + + /** + * Creates an EncodingException object. + * + * @param errorId reference to the error identifier + * @param message additional message + * @return EncodingException + */ + public static EncodingException createEncodingException( + final ErrorKeys errorId, final String message) { + + return new EncodingException(errorId.toString() + ":\r\n" + message); + } + + /** + * Creates an EncodingException object. 
+ * + * @param errorId reference to the error identifier + * @param message additional message + * @param e inner exception + * @return EncodingException + */ + public static EncodingException createEncodingException( + final ErrorKeys errorId, final String message, final Exception e) { + + return new EncodingException(errorId.toString() + ":\r\n" + message, e); + } + + /* + * +DECODING+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a DecodingException object. + * + * @param errorId reference to the error identifier + * @return DecodingException + */ + public static DecodingException createDecodingException( + final ErrorKeys errorId) { + + return new DecodingException(errorId.toString()); + } + + /** + * Creates a DecodingException object. + * + * @param errorId reference to the error identifier + * @param message additional message + * @return DecodingException + */ + public static DecodingException createDecodingException( + final ErrorKeys errorId, final String message) { + + return new DecodingException(errorId.toString() + ":\r\n" + message); + } + + /** + * Creates a DecodingException object. + * + * @param errorId reference to the error identifier + * @param message additional message + * @param e inner exception + * @return DecodingException + */ + public static DecodingException createDecodingException( + final ErrorKeys errorId, final String message, final Exception e) { + + return new DecodingException(errorId.toString() + ":\r\n" + message, e); + } + + /* + * +UNCOMPRESSED+CONSUMER+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++ + * + + */ + + /** + * Creates a SQLConsumerException object. 
+ * + * @param errorId reference to the error identifier + * @param e inner exception + * @return SQLConsumerException + */ + public static SQLConsumerException createSQLConsumerException( + final ErrorKeys errorId, final Exception e) { + + return new SQLConsumerException(errorId.toString(), e); + } + + /** + * Creates a SQLConsumerException object. + * + * @param errorId reference to the error identifier + * @param message additional message + * @return SQLConsumerException + */ + public static SQLConsumerException createSQLConsumerException( + final ErrorKeys errorId, final String message) { + + return new SQLConsumerException(errorId.toString() + ":\r\n" + message); + } + + /** + * Creates a SQLConsumerException object. + * + * @param errorId reference to the error identifier + * @param message additional message + * @param e inner exception + * @return SQLConsumerException + */ + public static SQLConsumerException createSQLConsumerException( + final ErrorKeys errorId, final String message, final Exception e) { + + return new SQLConsumerException(errorId.toString() + ":\r\n" + message, + e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorKeys.java index f5ceae99..b9360a0d 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorKeys.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,50 +19,56 @@ /** * This class contains an enumeration of the possible error sources. - * - * - * */ -public enum ErrorKeys -{ +public enum ErrorKeys { - /** The configuration manager has not been created */ - CONFIGURATION_CONFIGURATIONMANAGER_NOT_INITIALIZED, + /** + * The configuration manager has not been created + */ + CONFIGURATION_CONFIGURATIONMANAGER_NOT_INITIALIZED, - /** An unknown configuration parameter was requested */ - CONFIGURATION_CONFIGURATIONMANAGER_UNKNOWN_CONFIG_PARAMETER, + /** + * An unknown configuration parameter was requested + */ + CONFIGURATION_CONFIGURATIONMANAGER_UNKNOWN_CONFIG_PARAMETER, - /** An undefined parameter was requested */ - CONFIGURATION_PARAMETER_UNDEFINED, + /** + * An undefined parameter was requested + */ + CONFIGURATION_PARAMETER_UNDEFINED, - /** An IOException occurred while parsing the xml input */ - DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_IOEXCEPTION, + /** + * An IOException occurred while parsing the xml input + */ + DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_IOEXCEPTION, - /** An keyword was found were it was not supposed to be */ - DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD, + /** + * An keyword was found were it was not supposed to be + */ + DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD, - /** - * The end of the file was reached, but the parsing process was not finished - */ - DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE, + /** + * The end of the file was reached, but the 
parsing process was not finished + */ + DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE, - DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE, + DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE, - DELTA_CONSUMERS_SQL_CODEC_BITREADER_READ_OPERATION_OUT_OF_RANGE, DELTA_CONSUMERS_SQL_CODEC_BITREADER_READ_OPERATION_AFTER_END_OF_STREAM, + DELTA_CONSUMERS_SQL_CODEC_BITREADER_READ_OPERATION_OUT_OF_RANGE, DELTA_CONSUMERS_SQL_CODEC_BITREADER_READ_OPERATION_AFTER_END_OF_STREAM, - DELTA_CONSUMERS_SQL_CODEC_BITWRITER_WRITE_OPERATOR_OUT_OF_RANGE, DELTA_CONSUMERS_SQL_CODEC_BITWRITER_INVALID_WRITE_OPERATION, + DELTA_CONSUMERS_SQL_CODEC_BITWRITER_WRITE_OPERATOR_OUT_OF_RANGE, DELTA_CONSUMERS_SQL_CODEC_BITWRITER_INVALID_WRITE_OPERATION, - DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE, + DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE, - DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, + DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, - DIFFTOOL_SQLCONSUMER_ENCODING_VERIFICATION_FAILED, DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, + DIFFTOOL_SQLCONSUMER_ENCODING_VERIFICATION_FAILED, DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, - DIFFTOOL_ENCODING_INVALID_VALUE, DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, + DIFFTOOL_ENCODING_INVALID_VALUE, DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, - DIFFTOOL_DECODING_INVALID_VALUE, DIFFTOOL_DECODING_VALUE_OUT_OF_RANGE, DIFFTOOL_DECODING_UNEXPECTED_END_OF_STREAM, + DIFFTOOL_DECODING_INVALID_VALUE, DIFFTOOL_DECODING_VALUE_OUT_OF_RANGE, DIFFTOOL_DECODING_UNEXPECTED_END_OF_STREAM, - LOGGING_LOGGER_INITIALIZISATION_FAILED, LOGGING_LOGGINGFACTORY_NO_SUCH_LOGGER, LOGGING_LOGGINGFACTORY_LOGGER_ALREADY_EXIST, + LOGGING_LOGGER_INITIALIZISATION_FAILED, LOGGING_LOGGINGFACTORY_NO_SUCH_LOGGER, LOGGING_LOGGINGFACTORY_LOGGER_ALREADY_EXIST, - ABSTRACT_CONSUMER_TIMEOUT + ABSTRACT_CONSUMER_TIMEOUT } diff 
--git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/LoggingException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/LoggingException.java index 1178914d..9e445856 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/LoggingException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/LoggingException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,47 +19,36 @@ /** * LoggingException Describes an exception that occurred during the logging. - * - * - * */ @SuppressWarnings("serial") public class LoggingException - extends Exception -{ + extends Exception { - /** - * (Constructor) Creates a new LoggingException. - * - * @param description - * message - */ - public LoggingException(final String description) - { - super(description); - } + /** + * (Constructor) Creates a new LoggingException. + * + * @param description message + */ + public LoggingException(final String description) { + super(description); + } - /** - * (Constructor) Creates a new LoggingException. 
- * - * @param e - * inner exception - */ - public LoggingException(final Exception e) - { - super(e); - } + /** + * (Constructor) Creates a new LoggingException. + * + * @param e inner exception + */ + public LoggingException(final Exception e) { + super(e); + } - /** - * (Constructor) Creates a new LoggingException. - * - * @param description - * message - * @param e - * inner exception - */ - public LoggingException(final String description, final Exception e) - { - super(description, e); - } + /** + * (Constructor) Creates a new LoggingException. + * + * @param description message + * @param e inner exception + */ + public LoggingException(final String description, final Exception e) { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/SQLConsumerException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/SQLConsumerException.java index 58bda2fe..79991183 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/SQLConsumerException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/SQLConsumerException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,47 +20,36 @@ /** * SQLConsumerException Describes an exception that occurred while accessing the * database. - * - * - * */ @SuppressWarnings("serial") public class SQLConsumerException - extends Exception -{ + extends Exception { - /** - * (Constructor) Creates a new SQLConsumerException. - * - * @param description - * message - */ - public SQLConsumerException(final String description) - { - super(description); - } + /** + * (Constructor) Creates a new SQLConsumerException. + * + * @param description message + */ + public SQLConsumerException(final String description) { + super(description); + } - /** - * (Constructor) Creates a new SQLConsumerException. - * - * @param e - * inner exception - */ - public SQLConsumerException(final Exception e) - { - super(e); - } + /** + * (Constructor) Creates a new SQLConsumerException. + * + * @param e inner exception + */ + public SQLConsumerException(final Exception e) { + super(e); + } - /** - * (Constructor) Creates a new SQLConsumerException. - * - * @param description - * message - * @param e - * inner exception - */ - public SQLConsumerException(final String description, final Exception e) - { - super(description, e); - } + /** + * (Constructor) Creates a new SQLConsumerException. 
+ * + * @param description message + * @param e inner exception + */ + public SQLConsumerException(final String description, final Exception e) { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/TimeoutException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/TimeoutException.java index 93b2e1b8..ea5c7cc2 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/TimeoutException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/TimeoutException.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,47 +20,36 @@ /** * TimeoutException Describes an exception that occurred because of a timeout * event. - * - * - * */ @SuppressWarnings("serial") public class TimeoutException - extends Exception -{ + extends Exception { - /** - * (Constructor) Creates a new TimeoutException. - * - * @param description - * message - */ - public TimeoutException(final String description) - { - super(description); - } + /** + * (Constructor) Creates a new TimeoutException. 
+ * + * @param description message + */ + public TimeoutException(final String description) { + super(description); + } - /** - * (Constructor) Creates a new TimeoutException. - * - * @param e - * inner exception - */ - public TimeoutException(final Exception e) - { - super(e); - } + /** + * (Constructor) Creates a new TimeoutException. + * + * @param e inner exception + */ + public TimeoutException(final Exception e) { + super(e); + } - /** - * (Constructor) Creates a new TimeoutException. - * - * @param description - * message - * @param e - * inner exception - */ - public TimeoutException(final String description, final Exception e) - { - super(description, e); - } + /** + * (Constructor) Creates a new TimeoutException. + * + * @param description message + * @param e inner exception + */ + public TimeoutException(final String description, final Exception e) { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/Logger.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/Logger.java index c0414d86..b056d1a5 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/Logger.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/Logger.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,256 +29,231 @@ /** * DiffTool Logger class - * */ -public class Logger -{ - - /** Name of the logger */ - private final String consumerName; - - /** Reference to level of the logging */ - private final Level logLevel; - - /** Type of the logger */ - private final LoggerType type; - - /** Reference to the output writer */ - private final FileWriter writer; - - /** - * Creates a new logger. - * - * @param type - * type - * @param consumerName - * name - * @throws LoggingException - * if an error occurred - */ - public Logger(final LoggerType type, final String consumerName) - throws LoggingException - { - - try { - this.type = type; - this.consumerName = consumerName; - - ConfigurationManager config = ConfigurationManager.getInstance(); - String path = (String) config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); - - switch (type) { - case ARTICLE_OUTPUT: - logLevel = Level.INFO; - break; - case DIFF_TOOL_ERROR: - logLevel = Level.ERROR; - break; - case DIFF_TOOL: - logLevel = (Level) config - .getConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); - break; - default: - throw ErrorFactory - .createLoggingException(ErrorKeys.LOGGING_LOGGER_INITIALIZISATION_FAILED); - } - - this.writer = new FileWriter(path + consumerName + ".log"); - - } - catch (Exception e) { - throw ErrorFactory.createLoggingException( - ErrorKeys.LOGGING_LOGGER_INITIALIZISATION_FAILED, e); - } - } - - /** - * Closes the output writer. - */ - public synchronized void close() - { - try { - writer.close(); - } - catch (IOException ioe) { - ioe.printStackTrace(); - } - } - - /** - * Flushes the buffered output of the writer to the file. 
- */ - public synchronized void flush() - { - try { - writer.flush(); - } - catch (IOException ioe) { - ioe.printStackTrace(); - } - } - - /** - * Returns the log level. - * - * @return log level - */ - public Level getLogLevel() - { - return this.logLevel; - } - - /** - * Writes the given text to the output file. - * - * @param text - * log message - */ - private synchronized void log(final String text) - { - - try { - this.writer.write(text); - } - catch (IOException ioe) { - ioe.printStackTrace(); - } - } - - /** - * The occurred error with the related log level and message has to be given - * to this method. - * <p> - * This method will verify if the message should be logged or not. - * - * @param level - * log level - * @param message - * message - * @param e - * Error - */ - public void logError(final Level level, final String message, final Error e) - { - try { - Logger errors = LoggingFactory - .getLogger(LoggingFactory.NAME_ERROR_LOGGER); - - errors.logThrowable(level, message, e); - - } - catch (LoggingException ex) { - ex.printStackTrace(); - } - - if (logLevel.toInt() > level.toInt()) { - return; - } - - logThrowable(level, message, e); - } - - /** - * The occurred exception with the related log level and message has to be - * given to this method. - * <p> - * This method will verify if the message should be logged or not. - * - * @param level - * log level - * @param message - * message - * @param e - * Exception - */ - public void logException(final Level level, final String message, - final Exception e) - { - - try { - Logger errors = LoggingFactory - .getLogger(LoggingFactory.NAME_ERROR_LOGGER); - - errors.logThrowable(level, message, e); - - } - catch (LoggingException ex) { - ex.printStackTrace(); - } - - if (logLevel.toInt() > level.toInt()) { - return; - } - - logThrowable(level, message, e); - } - - /** - * This method will be called with a message and the related log level. It - * be verified if the message should be logged or not. 
- * <p> - * The format of the logged message is: \t consumerName [ Type of Logger ] - * \t message \r\n - * - * @param level - * level - * @param message - * message - */ - public synchronized void logMessage(final Level level, final String message) - { - - if (logLevel.toInt() > level.toInt()) { - return; - } - - try { - this.writer.write(System.currentTimeMillis() + "\t" + consumerName - + " [" + type.toString() + "] " + "\t" + message + "\r\n"); - this.writer.flush(); - } - catch (IOException ioe) { - ioe.printStackTrace(); - } - } - - /** - * The occurred error or exception with the related log level and message - * will be logged by this method. - * - * @param level - * log level - * @param message - * message - * @param t - * Throwable - */ - private synchronized void logThrowable(final Level level, - final String message, final Throwable t) - { - - if (t != null) { - log("\r\n[" + System.currentTimeMillis() + "]\t" + message); - log("\r\n" + t); - log("\r\n"); - - for (StackTraceElement st : t.getStackTrace()) { - log("\t" + st.toString() + "\r\n"); - } - - Throwable c = t.getCause(); - if (c != null) { - - log("Caused by:\t" + c + "\r\n"); - - for (StackTraceElement st : c.getStackTrace()) { - log("\t" + st.toString() + "\r\n"); - } - } - - log("\r\n"); - this.flush(); - } - } +public class Logger { + + /** + * Name of the logger + */ + private final String consumerName; + + /** + * Reference to level of the logging + */ + private final Level logLevel; + + /** + * Type of the logger + */ + private final LoggerType type; + + /** + * Reference to the output writer + */ + private final FileWriter writer; + + /** + * Creates a new logger. 
+ * + * @param type type + * @param consumerName name + * @throws LoggingException if an error occurred + */ + public Logger(final LoggerType type, final String consumerName) + throws LoggingException { + + try { + this.type = type; + this.consumerName = consumerName; + + ConfigurationManager config = ConfigurationManager.getInstance(); + String path = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); + + switch (type) { + case ARTICLE_OUTPUT: + logLevel = Level.INFO; + break; + case DIFF_TOOL_ERROR: + logLevel = Level.ERROR; + break; + case DIFF_TOOL: + logLevel = (Level) config + .getConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); + break; + default: + throw ErrorFactory + .createLoggingException(ErrorKeys.LOGGING_LOGGER_INITIALIZISATION_FAILED); + } + + this.writer = new FileWriter(path + consumerName + ".log"); + + } catch (Exception e) { + throw ErrorFactory.createLoggingException( + ErrorKeys.LOGGING_LOGGER_INITIALIZISATION_FAILED, e); + } + } + + /** + * Closes the output writer. + */ + public synchronized void close() { + try { + writer.close(); + } catch (IOException ioe) { + ioe.printStackTrace(); + } + } + + /** + * Flushes the buffered output of the writer to the file. + */ + public synchronized void flush() { + try { + writer.flush(); + } catch (IOException ioe) { + ioe.printStackTrace(); + } + } + + /** + * Returns the log level. + * + * @return log level + */ + public Level getLogLevel() { + return this.logLevel; + } + + /** + * Writes the given text to the output file. + * + * @param text log message + */ + private synchronized void log(final String text) { + + try { + this.writer.write(text); + } catch (IOException ioe) { + ioe.printStackTrace(); + } + } + + /** + * The occurred error with the related log level and message has to be given + * to this method. + * <p> + * This method will verify if the message should be logged or not. 
+ * + * @param level log level + * @param message message + * @param e Error + */ + public void logError(final Level level, final String message, final Error e) { + try { + Logger errors = LoggingFactory + .getLogger(LoggingFactory.NAME_ERROR_LOGGER); + + errors.logThrowable(level, message, e); + + } catch (LoggingException ex) { + ex.printStackTrace(); + } + + if (logLevel.toInt() > level.toInt()) { + return; + } + + logThrowable(level, message, e); + } + + /** + * The occurred exception with the related log level and message has to be + * given to this method. + * <p> + * This method will verify if the message should be logged or not. + * + * @param level log level + * @param message message + * @param e Exception + */ + public void logException(final Level level, final String message, + final Exception e) { + + try { + Logger errors = LoggingFactory + .getLogger(LoggingFactory.NAME_ERROR_LOGGER); + + errors.logThrowable(level, message, e); + + } catch (LoggingException ex) { + ex.printStackTrace(); + } + + if (logLevel.toInt() > level.toInt()) { + return; + } + + logThrowable(level, message, e); + } + + /** + * This method will be called with a message and the related log level. It + * will be verified if the message should be logged or not. + * <p> + * The format of the logged message is: timestamp \t consumerName [ Type of Logger ] + * \t message \r\n + * + * @param level level + * @param message message + */ + public synchronized void logMessage(final Level level, final String message) { + + if (logLevel.toInt() > level.toInt()) { + return; + } + + try { + this.writer.write(System.currentTimeMillis() + "\t" + consumerName + + " [" + type.toString() + "] " + "\t" + message + "\r\n"); + this.writer.flush(); + } catch (IOException ioe) { + ioe.printStackTrace(); + } + } + + /** + * The occurred error or exception with the related log level and message + * will be logged by this method.
+ * + * @param level log level + * @param message message + * @param t Throwable + */ + private synchronized void logThrowable(final Level level, + final String message, final Throwable t) { + + if (t != null) { + log("\r\n[" + System.currentTimeMillis() + "]\t" + message); + log("\r\n" + t); + log("\r\n"); + + for (StackTraceElement st : t.getStackTrace()) { + log("\t" + st.toString() + "\r\n"); + } + + Throwable c = t.getCause(); + if (c != null) { + + log("Caused by:\t" + c + "\r\n"); + + for (StackTraceElement st : c.getStackTrace()) { + log("\t" + st.toString() + "\r\n"); + } + } + + log("\r\n"); + this.flush(); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggerType.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggerType.java index 715d7272..e62dea34 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggerType.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggerType.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,38 +19,56 @@ /** * This class contains all keys for diff tool loggers. 
- * */ -public enum LoggerType -{ +public enum LoggerType { - /** DiffTool Error Logger */ - DIFF_TOOL_ERROR, + /** + * DiffTool Error Logger + */ + DIFF_TOOL_ERROR, - /** DiffTool Logger */ - DIFF_TOOL, + /** + * DiffTool Logger + */ + DIFF_TOOL, - /** Article Output Logger */ - ARTICLE_OUTPUT, + /** + * Article Output Logger + */ + ARTICLE_OUTPUT, - /** UNCOMPRESSED Consumer Logger */ - CONSUMER_SQL, + /** + * UNCOMPRESSED Consumer Logger + */ + CONSUMER_SQL, - /** Diff Consumer Logger */ - CONSUMER_DIFF, + /** + * Diff Consumer Logger + */ + CONSUMER_DIFF, - /** Task Consumer Logger */ - CONSUMER_TASK, + /** + * Task Consumer Logger + */ + CONSUMER_TASK, - /** Artcile Producer Logger */ - PRODUCER_ARTICLES, + /** + * Article Producer Logger + */ + PRODUCER_ARTICLES, - /** Producer Archives Logger */ - PRODUCER_ARCHIVES, + /** + * Producer Archives Logger + */ + PRODUCER_ARCHIVES, - /** Diff Producer Logger */ - PRODUCER_DIFFS, + /** + * Diff Producer Logger + */ + PRODUCER_DIFFS, - /** Consumer Producer Logger */ - PRODUCER_CONSUMERS + /** + * Consumer Producer Logger + */ + PRODUCER_CONSUMERS } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggingFactory.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggingFactory.java index e5efea3c..1568b1ac 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggingFactory.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggingFactory.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership.
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,80 +25,78 @@ /** * The static references in this 'class' creates and controls all loggers. - * */ -public class LoggingFactory -{ +public class LoggingFactory { - /** Reference Map Consumer(-Name) -> Logger */ - private static final HashMap<String, Logger> consumerLoggingIndex; + /** + * Reference Map Consumer(-Name) -> Logger + */ + private static final HashMap<String, Logger> consumerLoggingIndex; - /** Name for the DiffTool Output Logger */ - public final static String NAME_ARTICLE_OUTPUT_LOGGER = "DiffToolOutput"; + /** + * Name for the DiffTool Output Logger + */ + public final static String NAME_ARTICLE_OUTPUT_LOGGER = "DiffToolOutput"; - /** Name for the DiffTool Error Logger */ - public final static String NAME_ERROR_LOGGER = "DiffToolErrors"; + /** + * Name for the DiffTool Error Logger + */ + public final static String NAME_ERROR_LOGGER = "DiffToolErrors"; - /* Creates the static logging factory components */ - static { - consumerLoggingIndex = new HashMap<>(); + /* Creates the static logging factory components */ + static { + consumerLoggingIndex = new HashMap<>(); - try { - createLogger(LoggerType.DIFF_TOOL_ERROR, NAME_ERROR_LOGGER); - createLogger(LoggerType.ARTICLE_OUTPUT, NAME_ARTICLE_OUTPUT_LOGGER); - } - catch (LoggingException e) { - e.printStackTrace(); - System.exit(-1); - } - } + try { + createLogger(LoggerType.DIFF_TOOL_ERROR, NAME_ERROR_LOGGER); + createLogger(LoggerType.ARTICLE_OUTPUT, NAME_ARTICLE_OUTPUT_LOGGER); + } catch (LoggingException e) { + e.printStackTrace(); + 
System.exit(-1); + } + } - /** No class */ - private LoggingFactory() - { - } + /** + * No class + */ + private LoggingFactory() { + } - /** - * Creates a new Logger. - * - * @param consumerName - * Consumer Name - * @return The referenced Logger - * - * @throws LoggingException - */ - public static Logger createLogger(final LoggerType type, final String consumerName) - throws LoggingException - { + /** + * Creates a new Logger. + * + * @param consumerName Consumer Name + * @return The referenced Logger + * @throws LoggingException + */ + public static Logger createLogger(final LoggerType type, final String consumerName) + throws LoggingException { - Logger log = new Logger(type, consumerName); - if (consumerLoggingIndex.put(consumerName, log) != null) { - throw ErrorFactory - .createLoggingException(ErrorKeys.LOGGING_LOGGINGFACTORY_LOGGER_ALREADY_EXIST); - } + Logger log = new Logger(type, consumerName); + if (consumerLoggingIndex.put(consumerName, log) != null) { + throw ErrorFactory + .createLoggingException(ErrorKeys.LOGGING_LOGGINGFACTORY_LOGGER_ALREADY_EXIST); + } - return log; - } + return log; + } - /** - * Returns an already created Logger. - * - * @param consumerName - * Consumer Name - * @return The referenced Logger - * - * @throws LoggingException - */ - public static Logger getLogger(final String consumerName) - throws LoggingException - { + /** + * Returns an already created Logger. 
+ * + * @param consumerName Consumer Name + * @return The referenced Logger + * @throws LoggingException + */ + public static Logger getLogger(final String consumerName) + throws LoggingException { - Logger log = consumerLoggingIndex.get(consumerName); - if (log == null) { - throw ErrorFactory - .createLoggingException(ErrorKeys.LOGGING_LOGGINGFACTORY_NO_SUCH_LOGGER); - } + Logger log = consumerLoggingIndex.get(consumerName); + if (log == null) { + throw ErrorFactory + .createLoggingException(ErrorKeys.LOGGING_LOGGINGFACTORY_NO_SUCH_LOGGER); + } - return log; - } + return log; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/DiffToolLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/DiffToolLogMessages.java index 3a0be8b4..81c03487 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/DiffToolLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/DiffToolLogMessages.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,98 +26,78 @@ * <p> * TODO: This file should be replaced with resource files. 
*/ -public class DiffToolLogMessages -{ +public class DiffToolLogMessages { - /** No object - utility class */ - private DiffToolLogMessages() - { - } + /** + * No object - utility class + */ + private DiffToolLogMessages() { + } - /** - * Logs the start of the diff tool. - * - * @param logger - * reference to the logger - */ - public static void logInitialization(final Logger logger) - { - logger.logMessage(Level.INFO, "DiffTool initialized [LogLevel: " - + logger.getLogLevel() + "]"); - } + /** + * Logs the start of the diff tool. + * + * @param logger reference to the logger + */ + public static void logInitialization(final Logger logger) { + logger.logMessage(Level.INFO, "DiffTool initialized [LogLevel: " + + logger.getLogLevel() + "]"); + } - /** - * Logs the status of the diff tool. - * - * @param logger - * reference to the logger - * @param time - * time since start - * @param articleConsumer - * number of active article consumers - * @param diffConsumer - * number of active diff consumers - * @param sqlConsumer - * number of active sql consumers - * @param archiveState - * state of the arcive producer - * @param articleState - * state of the article producer - * @param diffState - * state of the diff producer - */ - public static void logStatus(final Logger logger, final long time, - final int articleConsumer, final int diffConsumer, - final int sqlConsumer, final boolean archiveState, - final boolean articleState, final boolean diffState) - { + /** + * Logs the status of the diff tool. 
+ * + * @param logger reference to the logger + * @param time time since start + * @param articleConsumer number of active article consumers + * @param diffConsumer number of active diff consumers + * @param sqlConsumer number of active sql consumers + * @param archiveState state of the archive producer + * @param articleState state of the article producer + * @param diffState state of the diff producer + */ + public static void logStatus(final Logger logger, final long time, + final int articleConsumer, final int diffConsumer, + final int sqlConsumer, final boolean archiveState, + final boolean articleState, final boolean diffState) { - logger.logMessage(Level.INFO, - "\r\nDiffTool-Status-Report [" + Time.toClock(time) + "]" - + "\r\nConsumerProducer \t[" + articleConsumer + " | " - + diffConsumer + " | " + sqlConsumer + "]" - + "\r\nArchiveProducer\t" + archiveState - + "\r\nArticleProducer\t" + articleState - + "\r\nDiffProducer \t" + diffState + "\r\n"); - } + logger.logMessage(Level.INFO, + "\r\nDiffTool-Status-Report [" + Time.toClock(time) + "]" + + "\r\nConsumerProducer \t[" + articleConsumer + " | " + + diffConsumer + " | " + sqlConsumer + "]" + + "\r\nArchiveProducer\t" + archiveState + + "\r\nArticleProducer\t" + articleState + + "\r\nDiffProducer \t" + diffState + "\r\n"); + } - /** - * Logs an exception. - * - * @param logger - * reference to the logger - * @param e - * reference to the exception - */ - public static void logException(final Logger logger, final Exception e) - { - logger.logException(Level.ERROR, "Unexpected Exception", e); - } + /** + * Logs an exception. + * + * @param logger reference to the logger + * @param e reference to the exception + */ + public static void logException(final Logger logger, final Exception e) { + logger.logException(Level.ERROR, "Unexpected Exception", e); + } - /** - * Logs an error.
- * - * @param logger - * reference to the logger - * @param e - * reference to the error - */ - public static void logError(final Logger logger, final Error e) - { - logger.logError(Level.ERROR, "Unexpected Error", e); - } + /** + * Logs an error. + * + * @param logger reference to the logger + * @param e reference to the error + */ + public static void logError(final Logger logger, final Error e) { + logger.logError(Level.ERROR, "Unexpected Error", e); + } - /** - * Logs the shutdown of the logger. - * - * @param logger - * reference to the logger - * @param endTime - * time since start - */ - public static void logShutdown(final Logger logger, final long endTime) - { - logger.logMessage(Level.INFO, - "DiffTool initiates SHUTDOWN\t" + Time.toClock(endTime)); - } + /** + * Logs the shutdown of the logger. + * + * @param logger reference to the logger + * @param endTime time since start + */ + public static void logShutdown(final Logger logger, final long endTime) { + logger.logMessage(Level.INFO, + "DiffTool initiates SHUTDOWN\t" + Time.toClock(endTime)); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ArticleConsumerLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ArticleConsumerLogMessages.java index bb21d8b7..3a745271 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ArticleConsumerLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ArticleConsumerLogMessages.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,260 +30,208 @@ /** * This class contains the english localized log messages for ArticleConsumers. - * + * <p> * TODO: This file should be replaced with resource files. - * - * - * */ -public final class ArticleConsumerLogMessages -{ - - /** - * Logs the retrieval of an archive descriptor. - * - * @param logger - * reference to the logger - * @param archive - * reference to the archive descriptor - */ - public static void logArchiveRetrieved(final Logger logger, - final ArchiveDescription archive) - { - - logger.logMessage(Level.INFO, "Retrieved archive " + archive.toString() - + " successfully"); - } - - /** - * Logs the reading of an revision task. - * - * @param logger - * reference to the logger - * @param article - * reference to the revision task - * @param time - * time needed for the operation - */ - public static void logArticleRead(final Logger logger, - final Task<Revision> article, final long time) - { - - logger.logMessage(Level.INFO, "Read article\t" + Time.toClock(time) - + "\t" + article.toString()); - } - - /** - * Logs the reading of an revision task. 
- * - * @param logger - * reference to the logger - * @param article - * reference to the revision task - * @param time - * time needed for the operation - * @param position - * input file position - */ - public static void logArticleRead(final Logger logger, - final Task<Revision> article, final long time, final long position) - { - - logger.logMessage(Level.INFO, "Read article\t" + Time.toClock(time) - + "\t" + article.toString() + "\t" + position); - } - - /** - * Logs the occurance of an error while retrieving the input file. - * - * @param logger - * reference to the logger - * @param archive - * reference to the archive - * @param e - * reference to the error - */ - public static void logErrorRetrieveArchive(final Logger logger, - final ArchiveDescription archive, final Error e) - { - - logger.logError(Level.ERROR, "Error while accessing archive " - + archive.toString(), e); - } - - /** - * Logs the occurance of an exception while retrieving the input file. - * - * @param logger - * reference to the logger - * @param archive - * reference to the archive - * @param e - * reference to the exception - */ - public static void logExceptionRetrieveArchive(final Logger logger, - final ArchiveDescription archive, final Exception e) - { - - logger.logException(Level.ERROR, "Exception while accessing archive " - + archive.toString(), e); - } - - /** - * Logs the occurance of an invalid task type. - * - * @param logger - * reference to the logger - * @param type - * type of task - */ - public static void logInvalidTaskType(final Logger logger, - final TaskTypes type) - { - - logger.logMessage(Level.INFO, "Invalid TaskType: " + type); - } - - /** - * Logs that no more archives are available. - * - * @param logger - * reference to the logger - */ - public static void logNoMoreArchives(final Logger logger) - { - - logger.logMessage(Level.INFO, - "Consumer initiates SHUTDOWN: no more archives available."); - } - - /** - * Logs that no more articles are available. 
- * - * @param logger - * reference to the logger - * @param archive - * reference to the archive descriptor - */ - public static void logNoMoreArticles(final Logger logger, - final ArchiveDescription archive) - { - - logger.logMessage(Level.INFO, "Archive " + archive.toString() - + " contains no more articles"); - } - - /** - * Logs an occurance of an exception while reading a task. - * - * @param logger - * reference to the logger - * @param task - * reference to the task - * @param e - * reference to the exception - */ - public static void logReadTaskException(final Logger logger, - final Task<Revision> task, final Exception e) - { - - if (task != null) { - logger.logException(Level.ERROR, "Error while reading a task: " - + task, e); - } - else { - logger.logException(Level.ERROR, - "Error while reading an unknown task", e); - } - } - - /** - * Logs an occurance of an OutOfMemoryError while reading a task. - * - * @param logger - * reference to the logger - * @param task - * reference to the task - * @param e - * reference to the error - */ - public static void logReadTaskOutOfMemoryError(final Logger logger, - final Task<Revision> task, final OutOfMemoryError e) - { - - if (task != null) { - logger.logError(Level.WARN, "Error while reading a task: " - + task, e); - } - else { - logger.logError(Level.WARN, - "Error while reading an unknown task", e); - } - } - - /** - * Logs the failed retrieval of an archive descriptor. - * - * @param logger - * reference to the logger - */ - public static void logRetrieveArchiveFailed(final Logger logger) - { - - logger.logMessage(Level.WARN, "Consumer failed to obtain an archive"); - } - - /** - * Logs the status of the article consumer. 
- * - * @param logger - * reference to the logger - * @param articleReader - * reference to the ArticleReader - * @param startTime - * start time - * @param sleepingTime - * time the consumer has slept - * @param workingTime - * time the consumer was working - */ - public static void logStatus(final Logger logger, - final ArticleReaderInterface articleReader, final long startTime, - final long sleepingTime, final long workingTime) - { - - String message = "Consumer-Status-Report [" - + Time.toClock(System.currentTimeMillis() - startTime) + "]"; - - if (articleReader != null) { - message += "\tPOSITION <" + articleReader.getBytePosition() + ">"; - } - - message += "\tEFFICIENCY\t " - + MathUtilities.percentPlus(workingTime, sleepingTime) - + "\tWORK [" + Time.toClock(workingTime) + "]" + "\tSLEEP [" - + Time.toClock(sleepingTime) + "]"; - - logger.logMessage(Level.DEBUG, message); - } - - /** - * Logs the occurance of an ArticleReaderException. - * - * @param logger - * reference to the logger - * @param e - * reference to the exception - */ - public static void logTaskReaderException(final Logger logger, - final ArticleReaderException e) - { - - logger.logException(Level.ERROR, "TaskReaderException", e); - } - - /** No object - utility class */ - private ArticleConsumerLogMessages() - { - } +public final class ArticleConsumerLogMessages { + + /** + * Logs the retrieval of an archive descriptor. + * + * @param logger reference to the logger + * @param archive reference to the archive descriptor + */ + public static void logArchiveRetrieved(final Logger logger, + final ArchiveDescription archive) { + + logger.logMessage(Level.INFO, "Retrieved archive " + archive.toString() + + " successfully"); + } + + /** + * Logs the reading of a revision task.
+ * + * @param logger reference to the logger + * @param article reference to the revision task + * @param time time needed for the operation + */ + public static void logArticleRead(final Logger logger, + final Task<Revision> article, final long time) { + + logger.logMessage(Level.INFO, "Read article\t" + Time.toClock(time) + + "\t" + article.toString()); + } + + /** + * Logs the reading of a revision task. + * + * @param logger reference to the logger + * @param article reference to the revision task + * @param time time needed for the operation + * @param position input file position + */ + public static void logArticleRead(final Logger logger, + final Task<Revision> article, final long time, final long position) { + + logger.logMessage(Level.INFO, "Read article\t" + Time.toClock(time) + + "\t" + article.toString() + "\t" + position); + } + + /** + * Logs the occurrence of an error while retrieving the input file. + * + * @param logger reference to the logger + * @param archive reference to the archive + * @param e reference to the error + */ + public static void logErrorRetrieveArchive(final Logger logger, + final ArchiveDescription archive, final Error e) { + + logger.logError(Level.ERROR, "Error while accessing archive " + + archive.toString(), e); + } + + /** + * Logs the occurrence of an exception while retrieving the input file. + * + * @param logger reference to the logger + * @param archive reference to the archive + * @param e reference to the exception + */ + public static void logExceptionRetrieveArchive(final Logger logger, + final ArchiveDescription archive, final Exception e) { + + logger.logException(Level.ERROR, "Exception while accessing archive " + + archive.toString(), e); + } + + /** + * Logs the occurrence of an invalid task type.
+ * + * @param logger reference to the logger + * @param type type of task + */ + public static void logInvalidTaskType(final Logger logger, + final TaskTypes type) { + + logger.logMessage(Level.INFO, "Invalid TaskType: " + type); + } + + /** + * Logs that no more archives are available. + * + * @param logger reference to the logger + */ + public static void logNoMoreArchives(final Logger logger) { + + logger.logMessage(Level.INFO, + "Consumer initiates SHUTDOWN: no more archives available."); + } + + /** + * Logs that no more articles are available. + * + * @param logger reference to the logger + * @param archive reference to the archive descriptor + */ + public static void logNoMoreArticles(final Logger logger, + final ArchiveDescription archive) { + + logger.logMessage(Level.INFO, "Archive " + archive.toString() + + " contains no more articles"); + } + + /** + * Logs an occurrence of an exception while reading a task. + * + * @param logger reference to the logger + * @param task reference to the task + * @param e reference to the exception + */ + public static void logReadTaskException(final Logger logger, + final Task<Revision> task, final Exception e) { + + if (task != null) { + logger.logException(Level.ERROR, "Error while reading a task: " + + task, e); + } else { + logger.logException(Level.ERROR, + "Error while reading an unknown task", e); + } + } + + /** + * Logs an occurrence of an OutOfMemoryError while reading a task. + * + * @param logger reference to the logger + * @param task reference to the task + * @param e reference to the error + */ + public static void logReadTaskOutOfMemoryError(final Logger logger, + final Task<Revision> task, final OutOfMemoryError e) { + + if (task != null) { + logger.logError(Level.WARN, "Error while reading a task: " + + task, e); + } else { + logger.logError(Level.WARN, + "Error while reading an unknown task", e); + } + } + + /** + * Logs the failed retrieval of an archive descriptor.
+ * + * @param logger reference to the logger + */ + public static void logRetrieveArchiveFailed(final Logger logger) { + + logger.logMessage(Level.WARN, "Consumer failed to obtain an archive"); + } + + /** + * Logs the status of the article consumer. + * + * @param logger reference to the logger + * @param articleReader reference to the ArticleReader + * @param startTime start time + * @param sleepingTime time the consumer has slept + * @param workingTime time the consumer was working + */ + public static void logStatus(final Logger logger, + final ArticleReaderInterface articleReader, final long startTime, + final long sleepingTime, final long workingTime) { + + String message = "Consumer-Status-Report [" + + Time.toClock(System.currentTimeMillis() - startTime) + "]"; + + if (articleReader != null) { + message += "\tPOSITION <" + articleReader.getBytePosition() + ">"; + } + + message += "\tEFFICIENCY\t " + + MathUtilities.percentPlus(workingTime, sleepingTime) + + "\tWORK [" + Time.toClock(workingTime) + "]" + "\tSLEEP [" + + Time.toClock(sleepingTime) + "]"; + + logger.logMessage(Level.DEBUG, message); + } + + /** + * Logs the occurrence of an ArticleReaderException.
+ * + * @param logger reference to the logger + * @param e reference to the exception + */ + public static void logTaskReaderException(final Logger logger, + final ArticleReaderException e) { + + logger.logException(Level.ERROR, "TaskReaderException", e); + } + + /** + * No object - utility class + */ + private ArticleConsumerLogMessages() { + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ConsumerLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ConsumerLogMessages.java index e2047a8b..24db94e8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ConsumerLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ConsumerLogMessages.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,172 +27,140 @@ * This class contains the english localized log messages for Consumers. * <p> * TODO: This file should be replaced with resource files. - * */ -public final class ConsumerLogMessages -{ - - /** - * Logs the start of a consumer. 
- * - * @param logger - * reference to the logger - */ - public static void logConsumerRunning(final Logger logger) - { - logger.logMessage(Level.INFO, "Consumer is up and running"); - } - - /** - * Logs an error. - * - * @param logger - * reference to the logger - * @param e - * reference to the error - */ - public static void logError(final Logger logger, final Error e) - { - logger.logError(Level.ERROR, "Unexpected Error", e); - } - - /** - * Logs an exception. - * - * @param logger - * reference to the logger - * @param e - * reference to the exception - */ - public static void logException(final Logger logger, final Exception e) - { - logger.logException(Level.ERROR, "Unexpected Exception", e); - } - - /** - * Logs the initialization of a consumer. - * - * @param logger - * reference to the logger - */ - public static void logInitialization(final Logger logger) - { - logger.logMessage(Level.INFO, "Consumer initialized [LogLevel: " - + logger.getLogLevel() + "]"); - } - - /** - * Logs the receival of the kill signal. - * - * @param logger - * reference to the logger - */ - public static void logKillSignalMessage(final Logger logger) - { - logger.logMessage(Level.INFO, "Consumer received KILL Signal"); - } - - /** - * Logs the receival of the ping signal. - * - * @param logger - * reference to the logger - */ - public static void logPingSignal(final Logger logger) - { - logger.logMessage(Level.INFO, "Consumer received PING Signal"); - } - - /** - * Logs the shutdown of the consumer. - * - * @param logger - * reference to the logger - * @param endTime - * time - */ - public static void logShutdown(final Logger logger, final long endTime) - { - logger.logMessage(Level.INFO, - "Consumer initiates SHUTDOWN\t" + Time.toClock(endTime)); - } - - /** - * Logs that the consumer is sleeping. 
- * - * @param logger - * reference to the logger - */ - public static void logSleep(final Logger logger) - { - logger.logMessage(Level.DEBUG, "Consumer is sleeping"); - } - - /** - * Logs the receival of the start signal. - * - * @param logger - * reference to the logger - */ - public static void logStartSignalMessage(final Logger logger) - { - logger.logMessage(Level.INFO, "Consumer received START Signal"); - } - - /** - * Logs the status of the consumer. - * - * @param logger - * reference to the logger - * @param startTime - * start time - * @param sleepingTime - * time the consumer has slept - * @param workingTime - * time the consumer was working - */ - public static void logStatus(final Logger logger, final long startTime, - final long sleepingTime, final long workingTime) - { - - logger.logMessage( - Level.DEBUG, - "Consumer-Status-Report [" - + Time.toClock(System.currentTimeMillis() - startTime) - + "]" + "\tEFFICIENCY\t " - + MathUtilities.percentPlus(workingTime, sleepingTime) - + "\tWORK [" + Time.toClock(workingTime) + "]" - + "\tSLEEP [" + Time.toClock(sleepingTime) + "]"); - } - - /** - * Logs the receival of the stop signal. - * - * @param logger - * reference to the logger - */ - public static void logStopSignal(final Logger logger) - { - logger.logMessage(Level.INFO, "Consumer received STOP Signal"); - } - - /** - * Logs the occurrence of a TimeoutException. - * - * @param logger - * reference to the logger - * @param e - * reference to the exception - */ - public static void logTimeoutException(final Logger logger, - final TimeoutException e) - { - - logger.logException(Level.WARN, "TimeoutException", e); - } - - /** No object - utility class */ - private ConsumerLogMessages() - { - } +public final class ConsumerLogMessages { + + /** + * Logs the start of a consumer. 
+ * + * @param logger reference to the logger + */ + public static void logConsumerRunning(final Logger logger) { + logger.logMessage(Level.INFO, "Consumer is up and running"); + } + + /** + * Logs an error. + * + * @param logger reference to the logger + * @param e reference to the error + */ + public static void logError(final Logger logger, final Error e) { + logger.logError(Level.ERROR, "Unexpected Error", e); + } + + /** + * Logs an exception. + * + * @param logger reference to the logger + * @param e reference to the exception + */ + public static void logException(final Logger logger, final Exception e) { + logger.logException(Level.ERROR, "Unexpected Exception", e); + } + + /** + * Logs the initialization of a consumer. + * + * @param logger reference to the logger + */ + public static void logInitialization(final Logger logger) { + logger.logMessage(Level.INFO, "Consumer initialized [LogLevel: " + + logger.getLogLevel() + "]"); + } + + /** + * Logs the receival of the kill signal. + * + * @param logger reference to the logger + */ + public static void logKillSignalMessage(final Logger logger) { + logger.logMessage(Level.INFO, "Consumer received KILL Signal"); + } + + /** + * Logs the receival of the ping signal. + * + * @param logger reference to the logger + */ + public static void logPingSignal(final Logger logger) { + logger.logMessage(Level.INFO, "Consumer received PING Signal"); + } + + /** + * Logs the shutdown of the consumer. + * + * @param logger reference to the logger + * @param endTime time + */ + public static void logShutdown(final Logger logger, final long endTime) { + logger.logMessage(Level.INFO, + "Consumer initiates SHUTDOWN\t" + Time.toClock(endTime)); + } + + /** + * Logs that the consumer is sleeping. + * + * @param logger reference to the logger + */ + public static void logSleep(final Logger logger) { + logger.logMessage(Level.DEBUG, "Consumer is sleeping"); + } + + /** + * Logs the receival of the start signal. 
+ * + * @param logger reference to the logger + */ + public static void logStartSignalMessage(final Logger logger) { + logger.logMessage(Level.INFO, "Consumer received START Signal"); + } + + /** + * Logs the status of the consumer. + * + * @param logger reference to the logger + * @param startTime start time + * @param sleepingTime time the consumer has slept + * @param workingTime time the consumer was working + */ + public static void logStatus(final Logger logger, final long startTime, + final long sleepingTime, final long workingTime) { + + logger.logMessage( + Level.DEBUG, + "Consumer-Status-Report [" + + Time.toClock(System.currentTimeMillis() - startTime) + + "]" + "\tEFFICIENCY\t " + + MathUtilities.percentPlus(workingTime, sleepingTime) + + "\tWORK [" + Time.toClock(workingTime) + "]" + + "\tSLEEP [" + Time.toClock(sleepingTime) + "]"); + } + + /** + * Logs the receival of the stop signal. + * + * @param logger reference to the logger + */ + public static void logStopSignal(final Logger logger) { + logger.logMessage(Level.INFO, "Consumer received STOP Signal"); + } + + /** + * Logs the occurrence of a TimeoutException. 
+ * + * @param logger reference to the logger + * @param e reference to the exception + */ + public static void logTimeoutException(final Logger logger, + final TimeoutException e) { + + logger.logException(Level.WARN, "TimeoutException", e); + } + + /** + * No object - utility class + */ + private ConsumerLogMessages() { + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/DiffConsumerLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/DiffConsumerLogMessages.java index 521acf43..ebcbfb57 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/DiffConsumerLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/DiffConsumerLogMessages.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,142 +29,114 @@ * This class contains the english localized log messages for DiffConsumers. * <p> * TODO: This file should be replaced with resource files. - * */ -public class DiffConsumerLogMessages -{ - - /** - * Logs the processing of a revision task. 
- * - * @param logger - * reference to the logger - * @param article - * reference to the revision task - * @param time - * time - */ - public static void logArticleProcessed(final Logger logger, - final Task<Revision> article, long time) - { - - logger.logMessage(Level.INFO, "Generated Diff\t" + Time.toClock(time) - + "\t" + article.toString()); - } - - /** - * Logs the processing of a revision task. - * - * @param logger - * reference to the logger - * @param article - * reference to the revision task - * @param time - * time - * @param transmittingTime - * time that the transfer of data to the producer needed - */ - public static void logArticleProcessed(final Logger logger, - final Task<Revision> article, long time, long transmittingTime) - { - - logger.logMessage( - Level.INFO, - "Generated Diff\t" + Time.toClock(time) + "\t" - + Time.toClock(transmittingTime) + "\t" - + article.toString()); - } - - /** - * Logs the occurance of a DiffException. - * - * @param logger - * reference to the logger - * @param e - * reference to the exception - */ - public static void logDiffException(final Logger logger, - final DiffException e) - { - - logger.logException(Level.ERROR, "DiffException", e); - } - - /** - * Logs the receival of an end task. - * - * @param logger - * reference to the logger - */ - public static void logEndTaskReceived(final Logger logger) - { - - logger.logMessage(Level.INFO, - "Consumer initiates SHUTDOWN: EndTask received"); - } - - /** - * Logs the occurance of an invalid task type. - * - * @param logger - * reference to the logger - * @param type - * type of task - */ - public static void logInvalidTaskType(final Logger logger, - final TaskTypes type) - { - - logger.logMessage(Level.INFO, "Invalid TaskType: " + type); - } - - /** - * Logs the occurance of an TaskOutOfMemoryError while reading a revision - * task. 
- * - * @param logger - * reference to the logger - * @param task - * reference to the revision task - * @param e - * reference to the error - */ - public static void logReadTaskOutOfMemoryError(final Logger logger, - final Task<Revision> task, final OutOfMemoryError e) - { - - if (task != null) { - logger.logError(Level.WARN, "Error while reading a task: " - + task, e); - } - else { - logger.logError(Level.WARN, - "Error while reading an unknown task", e); - } - } - - /** - * Logs the start of the processing of an revision task. - * - * @param logger - * reference to the logger - * @param article - * reference to the revision task - * @param time - * time - * @param transmittingTime - * time that the transfer of data to the producer needed - */ - public static void logStartArticleProcessing(final Logger logger, - final Task<Revision> article, long time, long transmittingTime) - { - - logger.logMessage(Level.TRACE, - "Start Procssing Task\t" + article.toString()); - } - - /** No object - utility class */ - private DiffConsumerLogMessages() - { - } +public class DiffConsumerLogMessages { + + /** + * Logs the processing of a revision task. + * + * @param logger reference to the logger + * @param article reference to the revision task + * @param time time + */ + public static void logArticleProcessed(final Logger logger, + final Task<Revision> article, long time) { + + logger.logMessage(Level.INFO, "Generated Diff\t" + Time.toClock(time) + + "\t" + article.toString()); + } + + /** + * Logs the processing of a revision task. 
+ * + * @param logger reference to the logger + * @param article reference to the revision task + * @param time time + * @param transmittingTime time that the transfer of data to the producer needed + */ + public static void logArticleProcessed(final Logger logger, + final Task<Revision> article, long time, long transmittingTime) { + + logger.logMessage( + Level.INFO, + "Generated Diff\t" + Time.toClock(time) + "\t" + + Time.toClock(transmittingTime) + "\t" + + article.toString()); + } + + /** + * Logs the occurrence of a DiffException. + * + * @param logger reference to the logger + * @param e reference to the exception + */ + public static void logDiffException(final Logger logger, + final DiffException e) { + + logger.logException(Level.ERROR, "DiffException", e); + } + + /** + * Logs the receival of an end task. + * + * @param logger reference to the logger + */ + public static void logEndTaskReceived(final Logger logger) { + + logger.logMessage(Level.INFO, + "Consumer initiates SHUTDOWN: EndTask received"); + } + + /** + * Logs the occurrence of an invalid task type. + * + * @param logger reference to the logger + * @param type type of task + */ + public static void logInvalidTaskType(final Logger logger, + final TaskTypes type) { + + logger.logMessage(Level.INFO, "Invalid TaskType: " + type); + } + + /** + * Logs the occurrence of an OutOfMemoryError while reading a revision + * task. + * + * @param logger reference to the logger + * @param task reference to the revision task + * @param e reference to the error + */ + public static void logReadTaskOutOfMemoryError(final Logger logger, + final Task<Revision> task, final OutOfMemoryError e) { + + if (task != null) { + logger.logError(Level.WARN, "Error while reading a task: " + + task, e); + } else { + logger.logError(Level.WARN, + "Error while reading an unknown task", e); + } + } + + /** + * Logs the start of the processing of a revision task. 
+ * + * @param logger reference to the logger + * @param article reference to the revision task + * @param time time + * @param transmittingTime time that the transfer of data to the producer needed + */ + public static void logStartArticleProcessing(final Logger logger, + final Task<Revision> article, long time, long transmittingTime) { + + logger.logMessage(Level.TRACE, + "Start Procssing Task\t" + article.toString()); + } + + /** + * No object - utility class + */ + private DiffConsumerLogMessages() { + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/SQLConsumerLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/SQLConsumerLogMessages.java index ddb83f3f..bbdb770e 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/SQLConsumerLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/SQLConsumerLogMessages.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,86 +28,70 @@ * This class contains the english localized log messages for SQLConsumers. 
* <p> * TODO: This file should be replaced with resource files. - * */ -public class SQLConsumerLogMessages -{ +public class SQLConsumerLogMessages { - /** - * Logs the processing of a diff task. - * - * @param logger - * reference to the logger - * @param diff - * reference to the task - * @param time - * time - */ - public static void logDiffProcessed(final Logger logger, - final Task<Diff> diff, final long time) - { + /** + * Logs the processing of a diff task. + * + * @param logger reference to the logger + * @param diff reference to the task + * @param time time + */ + public static void logDiffProcessed(final Logger logger, + final Task<Diff> diff, final long time) { - logger.logMessage( - Level.INFO, - "Generated Entry\t" + Time.toClock(time) + "\t" - + diff.toString()); - } + logger.logMessage( + Level.INFO, + "Generated Entry\t" + Time.toClock(time) + "\t" + + diff.toString()); + } - /** - * Logs the creation of an output file. - * - * @param logger - * reference to the logger - * @param path - * path of the output file - */ - public static void logFileCreation(final Logger logger, final String path) - { + /** + * Logs the creation of an output file. + * + * @param logger reference to the logger + * @param path path of the output file + */ + public static void logFileCreation(final Logger logger, final String path) { - logger.logMessage(Level.INFO, "New File created:\t" + path); - } + logger.logMessage(Level.INFO, "New File created:\t" + path); + } - /** - * Logs the occurrence of an OutOfMemoryError while reading a task. - * - * @param logger - * reference to the logger - * @param task - * reference to the revision task - * @param e - * reference to the error - */ - public static void logReadTaskOutOfMemoryError(final Logger logger, - final Task<Diff> task, final OutOfMemoryError e) - { + /** + * Logs the occurrence of an OutOfMemoryError while reading a task. 
+ * + * @param logger reference to the logger + * @param task reference to the revision task + * @param e reference to the error + */ + public static void logReadTaskOutOfMemoryError(final Logger logger, + final Task<Diff> task, final OutOfMemoryError e) { - if (task != null) { - logger.logError(Level.WARN, "Error while reading a task: " - + task, e); - } - else { - logger.logError(Level.WARN, - "Error while reading an unknown task", e); - } - } + if (task != null) { + logger.logError(Level.WARN, "Error while reading a task: " + + task, e); + } else { + logger.logError(Level.WARN, + "Error while reading an unknown task", e); + } + } - /** - * Logs the occurrence of an SqlConsumerException. - * - * @param logger - * reference to the logger - * @param e - * reference to the exception - */ - public static void logSQLConsumerException(final Logger logger, - final SQLConsumerException e) - { + /** + * Logs the occurrence of an SqlConsumerException. + * + * @param logger reference to the logger + * @param e reference to the exception + */ + public static void logSQLConsumerException(final Logger logger, + final SQLConsumerException e) { - logger.logException(Level.ERROR, "SQLConsumerException", e); - } + logger.logException(Level.ERROR, "SQLConsumerException", e); + } - /** No object - utility class */ - private SQLConsumerLogMessages() - { - } + /** + * No object - utility class + */ + private SQLConsumerLogMessages() { + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/LetterNode.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/LetterNode.java index 41780af3..7949d0a1 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/LetterNode.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/LetterNode.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more 
contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,129 +22,119 @@ /** * LetterNode This node represents a node of the keyword tree. * - * - * - * - * @param <V> - * contained value + * @param <V> contained value */ -public class LetterNode<V> -{ - - /** Alphabetic index of successor nodes */ - private final HashMap<Character, LetterNode<V>> nodes; - - /** Flag, whether this node contains a valid key or not */ - private boolean isKeyword; - - /** Contained keyword */ - private final String word; - - /** Contained value - related to the keyword */ - private V value; - - /** - * (Constructor) Creates a empty LetterNode. - */ - public LetterNode() - { - this.nodes = new HashMap<>(); - this.isKeyword = false; - this.word = ""; - } - - /** - * (Constructor) Creates a LetterNode with a keyword. - * - * @param word - * keyword - */ - public LetterNode(final String word) - { - this.nodes = new HashMap<>(); - this.isKeyword = false; - this.word = word; - } - - /** - * Adds a word and its related value. 
- * - * @param word - * keyword - * @param value - * related value - */ - public void add(final String word, final V value) - { - - char c = word.charAt(0); - - LetterNode<V> node = get(c); - if (node == null) { - node = new LetterNode<>(this.word + c); - } - this.nodes.put(c, node); - - if (word.length() == 1) { - node.isKeyword = true; - node.value = value; - return; - } - - node.add(word.substring(1), value); - } - - /** - * Returns the keyword. - * - * @return keyword - */ - public String getWord() - { - return this.word; - } - - /** - * Returns the related value. - * - * @return related value - */ - public V getValue() - { - return this.value; - } - - /** - * Returns the specified successor node. - * - * @param c - * character - * @return successor node or NULL if the specified node is not available - */ - public LetterNode<V> get(char c) - { - return this.nodes.get(c); - } - - /** - * Checks whether the specified successor node is contained. - * - * @param c - * character - * @return TRUE | FALSE - */ - public boolean contains(char c) - { - return this.nodes.containsKey(c); - } - - /** - * Returns whether this node contains a keyword or not. - * - * @return TRUE | FALSE - */ - public boolean isKeyword() - { - return this.isKeyword; - } +public class LetterNode<V> { + + /** + * Alphabetic index of successor nodes + */ + private final HashMap<Character, LetterNode<V>> nodes; + + /** + * Flag, whether this node contains a valid key or not + */ + private boolean isKeyword; + + /** + * Contained keyword + */ + private final String word; + + /** + * Contained value - related to the keyword + */ + private V value; + + /** + * (Constructor) Creates a empty LetterNode. + */ + public LetterNode() { + this.nodes = new HashMap<>(); + this.isKeyword = false; + this.word = ""; + } + + /** + * (Constructor) Creates a LetterNode with a keyword. 
+ * + * @param word keyword + */ + public LetterNode(final String word) { + this.nodes = new HashMap<>(); + this.isKeyword = false; + this.word = word; + } + + /** + * Adds a word and its related value. + * + * @param word keyword + * @param value related value + */ + public void add(final String word, final V value) { + + char c = word.charAt(0); + + LetterNode<V> node = get(c); + if (node == null) { + node = new LetterNode<>(this.word + c); + } + this.nodes.put(c, node); + + if (word.length() == 1) { + node.isKeyword = true; + node.value = value; + return; + } + + node.add(word.substring(1), value); + } + + /** + * Returns the keyword. + * + * @return keyword + */ + public String getWord() { + return this.word; + } + + /** + * Returns the related value. + * + * @return related value + */ + public V getValue() { + return this.value; + } + + /** + * Returns the specified successor node. + * + * @param c character + * @return successor node or NULL if the specified node is not available + */ + public LetterNode<V> get(char c) { + return this.nodes.get(c); + } + + /** + * Checks whether the specified successor node is contained. + * + * @param c character + * @return TRUE | FALSE + */ + public boolean contains(char c) { + return this.nodes.containsKey(c); + } + + /** + * Returns whether this node contains a keyword or not. 
+ * + * @return TRUE | FALSE + */ + public boolean isKeyword() { + return this.isKeyword; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MathUtilities.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MathUtilities.java index c8af284c..9fbe3818 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MathUtilities.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MathUtilities.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,103 +19,86 @@ /** * Mathematic functions - * - * - * */ -public class MathUtilities -{ +public class MathUtilities { - /** No object - utility class */ - private MathUtilities() - { - } + /** + * No object - utility class + */ + private MathUtilities() { + } - /** - * Rounds the given number to a precision of two after digit numbers. - * - * @param v - * number - * @return rounded number - */ - public static double round(final double v) - { - return ((long) (v * 100.)) / 100.; - } + /** + * Rounds the given number to a precision of two after digit numbers. 
+ * + * @param v number + * @return rounded number + */ + public static double round(final double v) { + return ((long) (v * 100.)) / 100.; + } - /** - * Rounds the result of a / (a + b) to a precision of two after digit - * numbers. - * - * @param a - * value a - * @param b - * value b - * @return xx.xx - */ - public static double percentPlus(final double a, final double b) - { - return round((double) a / (double) (a + b)); - } + /** + * Rounds the result of a / (a + b) to a precision of two after digit + * numbers. + * + * @param a value a + * @param b value b + * @return xx.xx + */ + public static double percentPlus(final double a, final double b) { + return round((double) a / (double) (a + b)); + } - /** - * Rounds the result of a / (a + b) to a precision of two after digit - * numbers. - * - * @param a - * value a - * @param b - * value b - * @return xx.xx - */ - public static double percRoundPlus(final double a, final double b) - { - return ((long) ((a / (a + b)) * 10000) / 100.); - } + /** + * Rounds the result of a / (a + b) to a precision of two after digit + * numbers. + * + * @param a value a + * @param b value b + * @return xx.xx + */ + public static double percRoundPlus(final double a, final double b) { + return ((long) ((a / (a + b)) * 10000) / 100.); + } - /** - * Rounds the result of a / b to a precision of two after digit numbers. - * - * @param a - * value a - * @param b - * value b - * @return xx.xx - */ - public static double percentDiv(final double a, final double b) - { - return ((long) ((a / b) * 10000) / 100.); - } + /** + * Rounds the result of a / b to a precision of two after digit numbers. 
+ * + * @param a value a + * @param b value b + * @return xx.xx + */ + public static double percentDiv(final double a, final double b) { + return ((long) ((a / b) * 10000) / 100.); + } - /** - * Returns the result of (a / b) as a percentage string - * - * @param a - * value a - * @param b - * value b - * @return xx.xx% - */ - public static String percentFrom(final double a, final double b) - { + /** + * Returns the result of (a / b) as a percentage string + * + * @param a value a + * @param b value b + * @return xx.xx% + */ + public static String percentFrom(final double a, final double b) { - double bVal = b; + double bVal = b; - if (bVal == 0.) { - bVal = 1.; - } + if (bVal == 0.) { + bVal = 1.; + } - StringBuilder rep = new StringBuilder(); - double d = ((long) ((a / bVal) * 10000) / 100.); - if (d < 10.0) { - rep.append('0'); - } + StringBuilder rep = new StringBuilder(); + double d = ((long) ((a / bVal) * 10000) / 100.); + if (d < 10.0) { + rep.append('0'); + } - rep.append(d); - while (rep.length() < 5) { - rep.append('0'); - } + rep.append(d); + while (rep.length() < 5) { + rep.append('0'); + } - return rep + "%"; - } + return rep + "%"; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MultipleKeywordTree.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MultipleKeywordTree.java index a5bf2196..d3c71ff4 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MultipleKeywordTree.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MultipleKeywordTree.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,102 +26,95 @@ * <p> * This keyword tree can be used for overlapping keywords. * - * - * - * - * @param <V> - * related value + * @param <V> related value */ -public class MultipleKeywordTree<V> -{ - - /** Reference to the root */ - private final LetterNode<V> root; - - /** List of current nodes */ - private List<LetterNode<V>> currentList; - - /** List of successor nodes */ - private final List<LetterNode<V>> hits; - - /** - * (Constructor) Creates an empty MultipleKeywordTree object. - */ - public MultipleKeywordTree() - { - root = new LetterNode<>(); - this.currentList = new ArrayList<>(); - this.hits = new ArrayList<>(); - this.currentList.add(root); - reset(); - } - - /** - * Adds a keyword and its related value. - * - * @param s - * keyword - * @param value - * related value - */ - public void addKeyword(final String s, final V value) - { - root.add(s, value); - } - - /** - * Checks whether the character is related to one of the current nodes (the - * root node is always a current node). - * <p> - * After the comparison the list of current nodes will be replaced. 
- * - * @param c - * character - * @return TRUE if successor nodes could be identified FALSE otherwise - */ - public boolean check(final char c) - { - - List<LetterNode<V>> newList = new ArrayList<>(); - newList.add(root); - - LetterNode<V> current; - hits.clear(); - - int size = this.currentList.size(); - for (int i = 0; i < size; i++) { - current = this.currentList.get(i); - - current = current.get(c); - if (current != null) { - newList.add(current); - - if (current.isKeyword()) { - hits.add(current); - } - } - } - - this.currentList = newList; - return !hits.isEmpty(); - } - - /** - * Resets the list of current node to only contain the root node. - */ - public void reset() - { - this.currentList.clear(); - this.currentList.add(root); - } - - /** - * Returns the list of successor nodes. - * - * @return list of successor nodes - */ - public List<LetterNode<V>> getHits() - { - return this.hits; - } +public class MultipleKeywordTree<V> { + + /** + * Reference to the root + */ + private final LetterNode<V> root; + + /** + * List of current nodes + */ + private List<LetterNode<V>> currentList; + + /** + * List of successor nodes + */ + private final List<LetterNode<V>> hits; + + /** + * (Constructor) Creates an empty MultipleKeywordTree object. + */ + public MultipleKeywordTree() { + root = new LetterNode<>(); + this.currentList = new ArrayList<>(); + this.hits = new ArrayList<>(); + this.currentList.add(root); + reset(); + } + + /** + * Adds a keyword and its related value. + * + * @param s keyword + * @param value related value + */ + public void addKeyword(final String s, final V value) { + root.add(s, value); + } + + /** + * Checks whether the character is related to one of the current nodes (the + * root node is always a current node). + * <p> + * After the comparison the list of current nodes will be replaced. 
+ * + * @param c character + * @return TRUE if successor nodes could be identified FALSE otherwise + */ + public boolean check(final char c) { + + List<LetterNode<V>> newList = new ArrayList<>(); + newList.add(root); + + LetterNode<V> current; + hits.clear(); + + int size = this.currentList.size(); + for (int i = 0; i < size; i++) { + current = this.currentList.get(i); + + current = current.get(c); + if (current != null) { + newList.add(current); + + if (current.isKeyword()) { + hits.add(current); + } + } + } + + this.currentList = newList; + return !hits.isEmpty(); + } + + /** + * Resets the list of current node to only contain the root node. + */ + public void reset() { + this.currentList.clear(); + this.currentList.add(root); + } + + /** + * Returns the list of successor nodes. + * + * @return list of successor nodes + */ + public List<LetterNode<V>> getHits() { + return this.hits; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/SingleKeywordTree.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/SingleKeywordTree.java index 4700ac8b..458f4d1d 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/SingleKeywordTree.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/SingleKeywordTree.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,86 +23,76 @@ * <p> * This keyword tree can only be used for non overlapping keywords. * - * - * - * - * @param <V> - * related value + * @param <V> related value */ -public class SingleKeywordTree<V> -{ +public class SingleKeywordTree<V> { - /** Reference to the root */ - private final LetterNode<V> root; + /** + * Reference to the root + */ + private final LetterNode<V> root; - /** Reference to the current node */ - private LetterNode<V> current; + /** + * Reference to the current node + */ + private LetterNode<V> current; - /** - * (Constructor) Creates an empty SingleKeywordTree object. - */ - public SingleKeywordTree() - { - root = new LetterNode<>(); - reset(); - } + /** + * (Constructor) Creates an empty SingleKeywordTree object. + */ + public SingleKeywordTree() { + root = new LetterNode<>(); + reset(); + } - /** - * Adds a keyword and its related value. - * - * @param s - * keyword - * @param value - * related value - */ - public void addKeyword(final String s, final V value) - { - root.add(s, value); - } + /** + * Adds a keyword and its related value. + * + * @param s keyword + * @param value related value + */ + public void addKeyword(final String s, final V value) { + root.add(s, value); + } - /** - * Checks whether the character is related to the currently used node. If - * the comparison fails the keyword tree will be reseted to its root node, - * otherwise the related node will replace the current node. 
- * - * @param c - * character - * @return TRUE if the current node contains a keyword FALSE otherwise - */ - public boolean check(final char c) - { - current = current.get(c); - if (current == null) { - reset(); - } - return current.isKeyword(); - } + /** + * Checks whether the character is related to the currently used node. If + * the comparison fails the keyword tree will be reseted to its root node, + * otherwise the related node will replace the current node. + * + * @param c character + * @return TRUE if the current node contains a keyword FALSE otherwise + */ + public boolean check(final char c) { + current = current.get(c); + if (current == null) { + reset(); + } + return current.isKeyword(); + } - /** - * Resets the current node with the root node. - */ - public void reset() - { - this.current = root; - } + /** + * Resets the current node with the root node. + */ + public void reset() { + this.current = root; + } - /** - * Returns the keyword of the current node. - * - * @return keyword - */ - public String getWord() - { - return this.current.getWord(); - } + /** + * Returns the keyword of the current node. + * + * @return keyword + */ + public String getWord() { + return this.current.getWord(); + } - /** - * Returns the related value of the current node. - * - * @return related value - */ - public V getValue() - { - return this.current.getValue(); - } + /** + * Returns the related value of the current node. 
+ * + * @return related value + */ + public V getValue() { + return this.current.getValue(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Surrogates.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Surrogates.java index f14e7e71..93205652 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Surrogates.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Surrogates.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,67 +19,59 @@ /** * This utitly class contains some surrogate related methods. - * - * - * */ -public class Surrogates -{ +public class Surrogates { - /** No object - utility class */ - private Surrogates() - { - } + /** + * No object - utility class + */ + private Surrogates() { + } - /** - * Returns whether a surrogate character was contained in the specified - * input. - * - * @param input - * input - * @return if a surrogate character was contained or not - */ - public static boolean scan(final char[] input) - { + /** + * Returns whether a surrogate character was contained in the specified + * input. 
+ * + * @param input input + * @return if a surrogate character was contained or not + */ + public static boolean scan(final char[] input) { - int surLow = 0xD800; - int surHgh = 0xDFFF; + int surLow = 0xD800; + int surHgh = 0xDFFF; - int end = input.length; - for (int i = 0; i < end; i++) { - if ((int) input[i] >= surLow && input[i] <= surHgh) { - return true; - } - } + int end = input.length; + for (int i = 0; i < end; i++) { + if ((int) input[i] >= surLow && input[i] <= surHgh) { + return true; + } + } - return false; - } + return false; + } - /** - * Replaces all surrogates characters with '?'. - * - * @param input - * input - * @return input with '?' instead of surrogates characters - */ - public static char[] replace(final char[] input) - { + /** + * Replaces all surrogates characters with '?'. + * + * @param input input + * @return input with '?' instead of surrogates characters + */ + public static char[] replace(final char[] input) { - int surLow = 0xD800; - int surHgh = 0xDFFF; + int surLow = 0xD800; + int surHgh = 0xDFFF; - int end = input.length; - char[] output = new char[end]; + int end = input.length; + char[] output = new char[end]; - for (int i = 0; i < end; i++) { - if ((int) input[i] >= surLow && input[i] <= surHgh) { - output[i] = '?'; - } - else { - output[i] = input[i]; - } - } + for (int i = 0; i < end; i++) { + if ((int) input[i] >= surLow && input[i] <= surHgh) { + output[i] = '?'; + } else { + output[i] = input[i]; + } + } - return output; - } + return output; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Time.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Time.java index 668198fa..5517844f 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Time.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Time.java @@ -2,13 +2,13 @@ * Licensed to the Technische 
Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,170 +22,172 @@ * <p> * A clock representation describes the time (HH:MM:SS:sss) and is used for * measuring the processing times. - * - * - * */ -public class Time -{ - - /** Weeks */ - private final short weeks; - - /** Days */ - private final short days; - - /** Hours */ - private final short hours; - - /** Minutes */ - private final short minutes; - - /** Seconds */ - private final short seconds; - - /** Milliseconds */ - private final short milliseconds; - - /** - * (Constructor) Creates a new time information transforming the millisecond - * value into a clock representation. - * - * @param time - * milliseconds - */ - public Time(final long time) - { - - long ttime = time; - - this.milliseconds = (short) (ttime % 1000); - ttime = ttime / 1000; - - this.seconds = (short) (ttime % 60); - ttime = ttime / 60; - - this.minutes = (short) (ttime % 60); - ttime = ttime / 60; - - this.hours = (short) (ttime % 24); - ttime = ttime / 24; - - this.days = (short) (ttime % 7); - this.weeks = (short) (ttime / 7); - } - - /** - * Returns the textual description of the time value. 
- */ - public String toString() - { - StringBuilder s = new StringBuilder(); - - boolean appended = false; - if (this.weeks != 0 || appended) { - appended = true; - s.append(this.weeks + " Wochen "); - } - if (this.days != 0 || appended) { - appended = true; - s.append(this.days + " Tage "); - } - if (this.hours != 0 || appended) { - appended = true; - s.append(this.hours + " Stunden "); - } - if (this.minutes != 0 || appended) { - appended = true; - s.append(this.minutes + " Minuten "); - } - if (this.seconds != 0 || appended) { - appended = true; - s.append(this.seconds + " Sekunden "); - } - if (this.milliseconds != 0 || appended) { - s.append(this.milliseconds + " Milisekunden"); - } - - return s.toString(); - } - - /** - * Returns the clock description of the time value. - */ - public String toClock() - { - StringBuilder s = new StringBuilder(); - - s.append(((this.weeks * 7 + this.days) * 24 + this.hours) + ":"); - if (this.minutes < 10) { - s.append('0'); - } - s.append(this.minutes + ":"); - if (this.seconds < 10) { - s.append('0'); - } - s.append(this.seconds + "."); - if (this.milliseconds < 100) { - s.append('0'); - } - if (this.milliseconds < 10) { - s.append('0'); - } - s.append(this.milliseconds); - - return s.toString(); - } - - /** - * Transforms a millisecond value to the clock representation. 
- * - * @param time - * milliseconds - * @return clock representation - */ - public static String toClock(long time) - { - - long ttime = time; - - short miliseconds = (short) (ttime % 1000); - ttime = ttime / 1000; - - short seconds = (short) (ttime % 60); - ttime = ttime / 60; - - short minutes = (short) (ttime % 60); - ttime = ttime / 60; - - short hours = (short) (ttime % 24); - ttime = ttime / 24; - - short days = (short) (ttime % 7); - short weeks = (short) (ttime / 7); - - StringBuilder s = new StringBuilder(); - - s.append(((weeks * 7 + days) * 24 + hours) + ":"); - - if (minutes < 10) { - s.append('0'); - } - s.append(minutes + ":"); - - if (seconds < 10) { - s.append('0'); - } - s.append(seconds + "."); - - if (miliseconds < 100) { - s.append('0'); - } - if (miliseconds < 10) { - s.append('0'); - } - s.append(miliseconds); - - return s.toString(); - } +public class Time { + + /** + * Weeks + */ + private final short weeks; + + /** + * Days + */ + private final short days; + + /** + * Hours + */ + private final short hours; + + /** + * Minutes + */ + private final short minutes; + + /** + * Seconds + */ + private final short seconds; + + /** + * Milliseconds + */ + private final short milliseconds; + + /** + * (Constructor) Creates a new time information transforming the millisecond + * value into a clock representation. + * + * @param time milliseconds + */ + public Time(final long time) { + + long ttime = time; + + this.milliseconds = (short) (ttime % 1000); + ttime = ttime / 1000; + + this.seconds = (short) (ttime % 60); + ttime = ttime / 60; + + this.minutes = (short) (ttime % 60); + ttime = ttime / 60; + + this.hours = (short) (ttime % 24); + ttime = ttime / 24; + + this.days = (short) (ttime % 7); + this.weeks = (short) (ttime / 7); + } + + /** + * Returns the textual description of the time value. 
+ */ + public String toString() { + StringBuilder s = new StringBuilder(); + + boolean appended = false; + if (this.weeks != 0 || appended) { + appended = true; + s.append(this.weeks + " Wochen "); + } + if (this.days != 0 || appended) { + appended = true; + s.append(this.days + " Tage "); + } + if (this.hours != 0 || appended) { + appended = true; + s.append(this.hours + " Stunden "); + } + if (this.minutes != 0 || appended) { + appended = true; + s.append(this.minutes + " Minuten "); + } + if (this.seconds != 0 || appended) { + appended = true; + s.append(this.seconds + " Sekunden "); + } + if (this.milliseconds != 0 || appended) { + s.append(this.milliseconds + " Milisekunden"); + } + + return s.toString(); + } + + /** + * Returns the clock description of the time value. + */ + public String toClock() { + StringBuilder s = new StringBuilder(); + + s.append(((this.weeks * 7 + this.days) * 24 + this.hours) + ":"); + if (this.minutes < 10) { + s.append('0'); + } + s.append(this.minutes + ":"); + if (this.seconds < 10) { + s.append('0'); + } + s.append(this.seconds + "."); + if (this.milliseconds < 100) { + s.append('0'); + } + if (this.milliseconds < 10) { + s.append('0'); + } + s.append(this.milliseconds); + + return s.toString(); + } + + /** + * Transforms a millisecond value to the clock representation. 
+ * + * @param time milliseconds + * @return clock representation + */ + public static String toClock(long time) { + + long ttime = time; + + short miliseconds = (short) (ttime % 1000); + ttime = ttime / 1000; + + short seconds = (short) (ttime % 60); + ttime = ttime / 60; + + short minutes = (short) (ttime % 60); + ttime = ttime / 60; + + short hours = (short) (ttime % 24); + ttime = ttime / 24; + + short days = (short) (ttime % 7); + short weeks = (short) (ttime / 7); + + StringBuilder s = new StringBuilder(); + + s.append(((weeks * 7 + days) * 24 + hours) + ":"); + + if (minutes < 10) { + s.append('0'); + } + s.append(minutes + ":"); + + if (seconds < 10) { + s.append('0'); + } + s.append(seconds + "."); + + if (miliseconds < 100) { + s.append('0'); + } + if (miliseconds < 10) { + s.append('0'); + } + s.append(miliseconds); + + return s.toString(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLKeys.java index 2deb73bf..baa84cf8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLKeys.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,105 +19,145 @@ /** * This class contains all keys for wikipedia dump files. - * - * - * */ -public enum WikipediaXMLKeys -{ - - /** Indicates the start of a page */ - KEY_START_PAGE("<page>"), - - /** Indicates the end of a page */ - KEY_END_PAGE("</page>"), - - /** Indicates the start of a title */ - KEY_START_TITLE("<title>"), - - /** Indicates the end of a title */ - KEY_END_TITLE(""), - - /** Indicates the start of an id */ - KEY_START_ID(""), - - /** Indicates the end of an id */ - KEY_END_ID(""), - - /** Indicates the start of a revision */ - KEY_START_REVISION(""), - - /** Indicates the end of a revision */ - KEY_END_REVISION(""), - - /** Indicates the start of a comment */ - KEY_START_COMMENT(""), - - /** Indicates the end of a comment */ - KEY_END_COMMENT(""), - - /** Indicates the start of the contributor ip */ - KEY_START_IP(""), - - /** Indicates the end of the contributor ip */ - KEY_END_IP(""), - - /** Indicates the start of the the contributor username */ - KEY_START_USERNAME(""), - - /** Indicates the end of the contributor username */ - KEY_END_USERNAME(""), - - /** Indicates the start of a timestamp */ - KEY_START_TIMESTAMP(""), - - /** Indicates the end of a timestamp */ - KEY_END_TIMESTAMP(""), - - /** Indicates the start of the contributor info */ - KEY_START_CONTRIBUTOR(""), - - /** Indicates the end of the contributor info */ - KEY_END_CONTRIBUTOR(""), - - /** Indicates the start of the namespace block */ - KEY_START_NAMESPACES(""), - - /** Indicates the end of the namespace block */ - KEY_END_NAMESPACES(""), - - /** Indicates the start of a text segment */ - KEY_START_TEXT(""), - - /** Indicates the end of a text segment */ - KEY_END_TEXT(""), - - /** Indicates that 
the revision is a minor revision */ - KEY_MINOR_FLAG(""); - - - - /** Keyword related to the key */ - private final String keyword; - - /** - * Creates an enumerator with the speciefied keyword - * - * @param keyword - * keyword - */ - WikipediaXMLKeys(final String keyword) - { - this.keyword = keyword; - } - - /** - * Returns the keyword - * - * @return keyword - */ - public String getKeyword() - { - return this.keyword; - } +public enum WikipediaXMLKeys { + + /** + * Indicates the start of a page + */ + KEY_START_PAGE(""), + + /** + * Indicates the end of a page + */ + KEY_END_PAGE(""), + + /** + * Indicates the start of a title + */ + KEY_START_TITLE(""), + + /** + * Indicates the end of a title + */ + KEY_END_TITLE(""), + + /** + * Indicates the start of an id + */ + KEY_START_ID(""), + + /** + * Indicates the end of an id + */ + KEY_END_ID(""), + + /** + * Indicates the start of a revision + */ + KEY_START_REVISION(""), + + /** + * Indicates the end of a revision + */ + KEY_END_REVISION(""), + + /** + * Indicates the start of a comment + */ + KEY_START_COMMENT(""), + + /** + * Indicates the end of a comment + */ + KEY_END_COMMENT(""), + + /** + * Indicates the start of the contributor ip + */ + KEY_START_IP(""), + + /** + * Indicates the end of the contributor ip + */ + KEY_END_IP(""), + + /** + * Indicates the start of the the contributor username + */ + KEY_START_USERNAME(""), + + /** + * Indicates the end of the contributor username + */ + KEY_END_USERNAME(""), + + /** + * Indicates the start of a timestamp + */ + KEY_START_TIMESTAMP(""), + + /** + * Indicates the end of a timestamp + */ + KEY_END_TIMESTAMP(""), + + /** + * Indicates the start of the contributor info + */ + KEY_START_CONTRIBUTOR(""), + + /** + * Indicates the end of the contributor info + */ + KEY_END_CONTRIBUTOR(""), + + /** + * Indicates the start of the namespace block + */ + KEY_START_NAMESPACES(""), + + /** + * Indicates the end of the namespace block + */ + KEY_END_NAMESPACES(""), + + 
/** + * Indicates the start of a text segment + */ + KEY_START_TEXT(""), + + /** + * Indicates the end of a text segment + */ + KEY_END_TEXT(""), + + /** + * Indicates that the revision is a minor revision + */ + KEY_MINOR_FLAG(""); + + + /** + * Keyword related to the key + */ + private final String keyword; + + /** + * Creates an enumerator with the speciefied keyword + * + * @param keyword keyword + */ + WikipediaXMLKeys(final String keyword) { + this.keyword = keyword; + } + + /** + * Returns the keyword + * + * @return keyword + */ + public String getKeyword() { + return this.keyword; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLWriter.java index 55ef0cdb..913a6326 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -37,363 +37,340 @@ * output file. *

* This class is used for debug purposes. - * - * - * */ -public class WikipediaXMLWriter -{ - - /** Reference to the writer */ - private final OutputStreamWriter writer; - - /** - * (Constructor) Creates a WikipediaXMLWriter object. - * - * @param path - * path of the output file - * @throws IOException - * if an error occurs while writing the output - */ - public WikipediaXMLWriter(final String path) - throws IOException - { - this.writer = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(path)), +public class WikipediaXMLWriter { + + /** + * Reference to the writer + */ + private final OutputStreamWriter writer; + + /** + * (Constructor) Creates a WikipediaXMLWriter object. + * + * @param path path of the output file + * @throws IOException if an error occurs while writing the output + */ + public WikipediaXMLWriter(final String path) + throws IOException { + this.writer = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(path)), StandardCharsets.UTF_8); - } - - /** - * Writes the diff task to the output using wikipedia xml notation. - * - * @param diff - * Reference to a diff task - * @throws IOException - * if an error occurs while writing the output - */ - public void writeDiff(final Task diff) - throws IOException - { - writeDiff(diff, 0); - } - - /** - * Writes a part of the diff task, starting with the given element, to the - * output using wikipedia xml notation. 
- * - * @param diff - * Reference to a diff task - * @param start - * Position of the start element - * @throws IOException - * if an error occurs while writing the output - */ - public void writeDiff(final Task diff, final int start) - throws IOException - { - - int size = diff.size(); - Diff d; - String previousRevision = null, currentRevision; - - this.writer - .write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); - - ArticleInformation header = diff.getHeader(); - - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); - this.writer.write(header.getArticleName()); - this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); - - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(header.getArticleId())); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - - this.writer.write("\t"); - this.writer.write(Integer.toString(diff.getPartCounter())); - this.writer.write("\r\n"); - - for (int i = start; i < size; i++) { - d = diff.get(i); - currentRevision = d.buildRevision(previousRevision); - - this.writer - .write("\t" - + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() - + "\r\n"); - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(d.getRevisionID())); - this.writer - .write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - - this.writer.write("\t\t"); - this.writer.write(Integer.toString(d.getRevisionCounter())); - this.writer.write("\r\n"); - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); - this.writer.write(d.getTimeStamp().toString()); - this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() - + "\r\n"); - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_TEXT.getKeyword()); - if (currentRevision != null) { - this.writer.write(currentRevision); - previousRevision = currentRevision; - } - 
this.writer.write(WikipediaXMLKeys.KEY_END_TEXT.getKeyword() - + "\r\n"); - - this.writer.write("\t" - + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); - - } - - this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); - this.writer.flush(); - } - - /** - * Writes the diff task to the output using an xml representation of the - * diff information. - * - * @param diff - * Reference to a diff task - * @throws IOException - * if an error occurs while writing the output - */ - public void writeDiffFile(final Task diff) - throws IOException - { - - int partsCount; - int size = diff.size(); - Diff d; - DiffPart p; - RevisionCodecData codec; - - this.writer - .write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); - - ArticleInformation header = diff.getHeader(); - - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); - this.writer.write(header.getArticleName()); - this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); - - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(header.getArticleId())); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - - this.writer.write("\t"); - this.writer.write(Integer.toString(diff.getPartCounter())); - this.writer.write("\r\n"); - - for (int i = 0; i < size; i++) { - d = diff.get(i); - - this.writer - .write("\t" - + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() - + "\r\n"); - - codec = d.getCodecData(); - if (!codec.isConverted()) { - codec.totalSizeInBits(); - } - - this.writer.write("\t\t\r\n"); - - this.writer.write("\t\t\t" - + codec.getBlocksizeS() + "\r\n"); - this.writer.write("\t\t\t" - + codec.getBlocksizeE() + "\r\n"); - this.writer.write("\t\t\t" - + codec.getBlocksizeB() + "\r\n"); - this.writer.write("\t\t\t" - + codec.getBlocksizeL() + "\r\n"); - - this.writer.write("\t\t\r\n"); - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - 
this.writer.write(Integer.toString(d.getRevisionID())); - this.writer - .write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - - this.writer.write("\t\t"); - this.writer.write(Integer.toString(d.getRevisionCounter())); - this.writer.write("\r\n"); - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); - this.writer.write(d.getTimeStamp().toString()); - this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() - + "\r\n"); - - this.writer.write("\t\t\r\n"); - partsCount = d.size(); - for (int j = 0; j < partsCount; j++) { - - p = d.get(j); - this.writer.write("\t\t\t\r\n"); - - this.writer.write("\t\t\t\t" + p.getAction() - + "\r\n"); - this.writer.write("\t\t\t\t" - + p.getStart() + "\r\n"); - this.writer.write("\t\t\t\t" - + p.getEnd() + "\r\n"); - if (p.getText() != null) { - this.writer - .write("\t\t\t\t" - + p.getText()); - this.writer.write("\r\n"); - } - - this.writer.write("\t\t\t\r\n"); - } - - this.writer.write("\t\t\r\n"); - this.writer.write("\t" - + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); - } - - this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); - this.writer.flush(); - } - - /** - * Writes the revision task to the output using wikipedia xml notation. 
- * - * @param task - * Reference to a revision task - * @throws IOException - * if an error occurs while writing the output - */ - public void writeRevision(final Task task) - throws IOException - { - - if (task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST - || task.getTaskType() == TaskTypes.TASK_FULL) { - - this.writer.write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() - + "\r\n"); - - ArticleInformation header = task.getHeader(); - - this.writer.write("\t" - + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); - this.writer.write(header.getArticleName()); - this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() - + "\r\n"); - - this.writer - .write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(header.getArticleId())); - this.writer - .write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - } - - Revision rev; - Iterator revIt = task.iterator(); - while (revIt.hasNext()) { - - this.writer - .write("\t" - + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() - + "\r\n"); - rev = revIt.next(); - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(rev.getRevisionID())); - this.writer - .write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - - this.writer.write("\t\t"); - this.writer.write(Integer.toString(rev.getRevisionCounter())); - this.writer.write("\r\n"); - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); - this.writer.write(rev.getTimeStamp().toString()); - this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() - + "\r\n"); - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_CONTRIBUTOR.getKeyword()); - if(rev.contributorIsRegistered()){ - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_USERNAME.getKeyword()); - this.writer.write(rev.getContributorName()); - this.writer.write(WikipediaXMLKeys.KEY_END_USERNAME.getKeyword() - + "\r\n"); - - this.writer.write("\t\t" - + 
WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(rev.getContributorId()); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() - + "\r\n"); - } - else{ - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_IP.getKeyword()); - this.writer.write(rev.getContributorName()); - this.writer.write(WikipediaXMLKeys.KEY_END_IP.getKeyword() - + "\r\n"); - } - - this.writer.write(WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword() - + "\r\n"); - - if(rev.isMinor()){ - this.writer.write("\t\t"+WikipediaXMLKeys.KEY_MINOR_FLAG.getKeyword() - + "\r\n"); - } - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_COMMENT.getKeyword()); - this.writer.write(rev.getComment()); - this.writer.write(WikipediaXMLKeys.KEY_END_COMMENT.getKeyword() - + "\r\n"); - - - - - this.writer.write("\t\t" - + WikipediaXMLKeys.KEY_START_TEXT.getKeyword()); - if (rev.getRevisionText() != null) { - this.writer.write(rev.getRevisionText()); - } - this.writer.write(WikipediaXMLKeys.KEY_END_TEXT.getKeyword() - + "\r\n"); - - this.writer.write("\t" - + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); - } - - if (task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST - || task.getTaskType() == TaskTypes.TASK_FULL) { - - this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() - + "\r\n"); - } - this.writer.flush(); - } - - /** - * Closes the writer. - * - * @throws IOException - * if an error occurred while closing the writer - */ - public void close() - throws IOException - { - this.writer.close(); - } + } + + /** + * Writes the diff task to the output using wikipedia xml notation. + * + * @param diff Reference to a diff task + * @throws IOException if an error occurs while writing the output + */ + public void writeDiff(final Task diff) + throws IOException { + writeDiff(diff, 0); + } + + /** + * Writes a part of the diff task, starting with the given element, to the + * output using wikipedia xml notation. 
+ * + * @param diff Reference to a diff task + * @param start Position of the start element + * @throws IOException if an error occurs while writing the output + */ + public void writeDiff(final Task diff, final int start) + throws IOException { + + int size = diff.size(); + Diff d; + String previousRevision = null, currentRevision; + + this.writer + .write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); + + ArticleInformation header = diff.getHeader(); + + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); + this.writer.write(header.getArticleName()); + this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); + + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(header.getArticleId())); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + + this.writer.write("\t"); + this.writer.write(Integer.toString(diff.getPartCounter())); + this.writer.write("\r\n"); + + for (int i = start; i < size; i++) { + d = diff.get(i); + currentRevision = d.buildRevision(previousRevision); + + this.writer + .write("\t" + + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() + + "\r\n"); + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(d.getRevisionID())); + this.writer + .write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + + this.writer.write("\t\t"); + this.writer.write(Integer.toString(d.getRevisionCounter())); + this.writer.write("\r\n"); + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); + this.writer.write(d.getTimeStamp().toString()); + this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + + "\r\n"); + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_TEXT.getKeyword()); + if (currentRevision != null) { + this.writer.write(currentRevision); + previousRevision = currentRevision; + } + 
this.writer.write(WikipediaXMLKeys.KEY_END_TEXT.getKeyword() + + "\r\n"); + + this.writer.write("\t" + + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); + + } + + this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); + this.writer.flush(); + } + + /** + * Writes the diff task to the output using an xml representation of the + * diff information. + * + * @param diff Reference to a diff task + * @throws IOException if an error occurs while writing the output + */ + public void writeDiffFile(final Task diff) + throws IOException { + + int partsCount; + int size = diff.size(); + Diff d; + DiffPart p; + RevisionCodecData codec; + + this.writer + .write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); + + ArticleInformation header = diff.getHeader(); + + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); + this.writer.write(header.getArticleName()); + this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); + + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(header.getArticleId())); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + + this.writer.write("\t"); + this.writer.write(Integer.toString(diff.getPartCounter())); + this.writer.write("\r\n"); + + for (int i = 0; i < size; i++) { + d = diff.get(i); + + this.writer + .write("\t" + + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() + + "\r\n"); + + codec = d.getCodecData(); + if (!codec.isConverted()) { + codec.totalSizeInBits(); + } + + this.writer.write("\t\t\r\n"); + + this.writer.write("\t\t\t" + + codec.getBlocksizeS() + "\r\n"); + this.writer.write("\t\t\t" + + codec.getBlocksizeE() + "\r\n"); + this.writer.write("\t\t\t" + + codec.getBlocksizeB() + "\r\n"); + this.writer.write("\t\t\t" + + codec.getBlocksizeL() + "\r\n"); + + this.writer.write("\t\t\r\n"); + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + 
this.writer.write(Integer.toString(d.getRevisionID())); + this.writer + .write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + + this.writer.write("\t\t"); + this.writer.write(Integer.toString(d.getRevisionCounter())); + this.writer.write("\r\n"); + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); + this.writer.write(d.getTimeStamp().toString()); + this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + + "\r\n"); + + this.writer.write("\t\t\r\n"); + partsCount = d.size(); + for (int j = 0; j < partsCount; j++) { + + p = d.get(j); + this.writer.write("\t\t\t\r\n"); + + this.writer.write("\t\t\t\t" + p.getAction() + + "\r\n"); + this.writer.write("\t\t\t\t" + + p.getStart() + "\r\n"); + this.writer.write("\t\t\t\t" + + p.getEnd() + "\r\n"); + if (p.getText() != null) { + this.writer + .write("\t\t\t\t" + + p.getText()); + this.writer.write("\r\n"); + } + + this.writer.write("\t\t\t\r\n"); + } + + this.writer.write("\t\t\r\n"); + this.writer.write("\t" + + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); + } + + this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); + this.writer.flush(); + } + + /** + * Writes the revision task to the output using wikipedia xml notation. 
+ * + * @param task Reference to a revision task + * @throws IOException if an error occurs while writing the output + */ + public void writeRevision(final Task task) + throws IOException { + + if (task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST + || task.getTaskType() == TaskTypes.TASK_FULL) { + + this.writer.write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + + "\r\n"); + + ArticleInformation header = task.getHeader(); + + this.writer.write("\t" + + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); + this.writer.write(header.getArticleName()); + this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + + "\r\n"); + + this.writer + .write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(header.getArticleId())); + this.writer + .write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + } + + Revision rev; + Iterator revIt = task.iterator(); + while (revIt.hasNext()) { + + this.writer + .write("\t" + + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() + + "\r\n"); + rev = revIt.next(); + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(rev.getRevisionID())); + this.writer + .write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + + this.writer.write("\t\t"); + this.writer.write(Integer.toString(rev.getRevisionCounter())); + this.writer.write("\r\n"); + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); + this.writer.write(rev.getTimeStamp().toString()); + this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + + "\r\n"); + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_CONTRIBUTOR.getKeyword()); + if (rev.contributorIsRegistered()) { + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_USERNAME.getKeyword()); + this.writer.write(rev.getContributorName()); + this.writer.write(WikipediaXMLKeys.KEY_END_USERNAME.getKeyword() + + "\r\n"); + + this.writer.write("\t\t" + + 
WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(rev.getContributorId()); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + + "\r\n"); + } else { + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_IP.getKeyword()); + this.writer.write(rev.getContributorName()); + this.writer.write(WikipediaXMLKeys.KEY_END_IP.getKeyword() + + "\r\n"); + } + + this.writer.write(WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword() + + "\r\n"); + + if (rev.isMinor()) { + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_MINOR_FLAG.getKeyword() + + "\r\n"); + } + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_COMMENT.getKeyword()); + this.writer.write(rev.getComment()); + this.writer.write(WikipediaXMLKeys.KEY_END_COMMENT.getKeyword() + + "\r\n"); + + + this.writer.write("\t\t" + + WikipediaXMLKeys.KEY_START_TEXT.getKeyword()); + if (rev.getRevisionText() != null) { + this.writer.write(rev.getRevisionText()); + } + this.writer.write(WikipediaXMLKeys.KEY_END_TEXT.getKeyword() + + "\r\n"); + + this.writer.write("\t" + + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); + } + + if (task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST + || task.getTaskType() == TaskTypes.TASK_FULL) { + + this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + + "\r\n"); + } + this.writer.flush(); + } + + /** + * Closes the writer. 
+ * + * @throws IOException if an error occurred while closing the writer + */ + public void close() + throws IOException { + this.writer.close(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java index a94d2a97..3f12f6cd 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,63 +28,51 @@ /** * This class contains the start method for the DiffTool application. - * - * - * */ -public class DiffTool -{ +public class DiffTool { - /** - * Starts the DiffTool application. - * - * @param args - * program arguments args[0] has to be the path to the - * configuration file - */ - public static void main(final String[] args) - { + /** + * Starts the DiffTool application. 
+ * + * @param args program arguments args[0] has to be the path to the + * configuration file + */ + public static void main(final String[] args) { - if (args.length != 1) { - throw new IllegalArgumentException( - "Configuration File ist missing."); - } + if (args.length != 1) { + throw new IllegalArgumentException( + "Configuration File ist missing."); + } - try { + try { - // Reads the configuration - ConfigSettings config = readConfiguration(args[0]); - new DiffToolThread(config).run(); - } - catch (Exception e) { - e.printStackTrace(); - } - } + // Reads the configuration + ConfigSettings config = readConfiguration(args[0]); + new DiffToolThread(config).run(); + } catch (Exception e) { + e.printStackTrace(); + } + } - /** - * Reads and parses the configuration file. - * - * @param path - * path to the configuration file - * @return ConfigurationSettings - * - * @throws IOException - * if an error occurred while reading the configuration file - * @throws SAXException - * if an error occurred while using the xml parser - * @throws ParserConfigurationException - * if the initialization of the xml parser failed - */ - private static ConfigSettings readConfiguration(final String path) - throws IOException, SAXException, ParserConfigurationException - { + /** + * Reads and parses the configuration file. 
+ * + * @param path path to the configuration file + * @return ConfigurationSettings + * @throws IOException if an error occurred while reading the configuration file + * @throws SAXException if an error occurred while using the xml parser + * @throws ParserConfigurationException if the initialization of the xml parser failed + */ + private static ConfigSettings readConfiguration(final String path) + throws IOException, SAXException, ParserConfigurationException { - ConfigurationReader reader = new ConfigurationReader(path); - return reader.read(); - } + ConfigurationReader reader = new ConfigurationReader(path); + return reader.read(); + } - /** No object - Utility class */ - private DiffTool() - { - } + /** + * No object - Utility class + */ + private DiffTool() { + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffToolThread.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffToolThread.java index ad266881..17444d74 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffToolThread.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffToolThread.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -64,305 +64,291 @@ * This class represents the main method for the DiffTool application */ public class DiffToolThread - extends Thread -{ - - /** Reference to the DiffTool Logger */ - private static Logger logger; - - /** Reference to the Configuration */ - private final ConfigurationManager cconfig; - - /** Configuration Parameter - Statistical output flag */ - private boolean MODE_STATISTICAL_OUTPUT; - - /** - * (Constructor) Creates a DiffToolThread object. - * - * @param config - * Reference to the configuration - * - * @throws LoggingException - * if an error occurs while logging - */ - public DiffToolThread(final ConfigSettings config) - throws LoggingException - { - - this.cconfig = new ConfigurationManager(config); - - try { - MODE_STATISTICAL_OUTPUT = (Boolean) cconfig - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - } - catch (ConfigurationException e) { - MODE_STATISTICAL_OUTPUT=false; - } - - logger = LoggingFactory.createLogger(LoggerType.DIFF_TOOL, "DiffTool"); - } - - /** - * This class is used to receive tasks from the diff modules and transmits - * them to the sql modules. - * - */ - private class TaskTransmitter - implements TaskTransmitterInterface - { - - /** Reference to the (dump) output writer */ - private final WriterInterface dumpWriter; - - /** Configuration Parameter - Output mode */ - private final OutputType MODE_OUTPUT; - - /** Configuration Parameter - Statistical output flag */ - private final boolean MODE_STATISTICAL_OUTPUT; - - /** Configuration Parameter - Datafile output flasg */ - private final boolean MODE_DATAFILE_OUTPUT; - - /** - * (Constructor) Creates a TaskTransmitter object. 
- * - * @throws ConfigurationException - * if an error occurs while accessing the configuration - * @throws IOException - * if an error occurs while writing the output - * @throws LoggingException - * if an error occurs while logging - */ - public TaskTransmitter() - throws ConfigurationException, IOException, LoggingException - { - - ConfigurationManager config = ConfigurationManager.getInstance(); - - MODE_OUTPUT = (OutputType) config - .getConfigParameter(ConfigurationKeys.MODE_OUTPUT); - - MODE_STATISTICAL_OUTPUT = (Boolean) cconfig - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - - MODE_DATAFILE_OUTPUT = (Boolean) cconfig - .getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT); - - switch (MODE_OUTPUT) { - - case UNCOMPRESSED: - if(MODE_DATAFILE_OUTPUT){ - this.dumpWriter = new DataFileWriter("output"); - }else{ - if (MODE_STATISTICAL_OUTPUT) { - this.dumpWriter = new TimedSQLFileWriter("output", logger); - } - else { - this.dumpWriter = new SQLFileWriter("output", logger); - } - } - break; - - case SEVENZIP: - case BZIP2: - case ALTERNATE: - if(MODE_DATAFILE_OUTPUT){ - this.dumpWriter = new DataFileArchiveWriter("output"); - }else{ - if (MODE_STATISTICAL_OUTPUT) { - this.dumpWriter = new TimedSQLArchiveWriter("output", logger); - } - else { - this.dumpWriter = new SQLArchiveWriter("output", logger); - } - } - break; - - case DATABASE: - if(MODE_DATAFILE_OUTPUT){ - throw ErrorFactory - .createConfigurationException(ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); - }else{ - if (MODE_STATISTICAL_OUTPUT) { - this.dumpWriter = new TimedSQLDatabaseWriter(logger); - } - else { - this.dumpWriter = new SQLDatabaseWriter(logger); - } - } - break; - - default: - throw ErrorFactory - .createConfigurationException(ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); - } - } - - /** - * Receives a DiffTask Transmission. 
- */ - @Override - public void transmitDiff(final Task result) - { - writeOutput(result); - } - - /** - * Receives a partial DiffTask Transmission. - */ - @Override - public void transmitPartialDiff(final Task result) - { - writeOutput(result); - } - - @Override - public void close() throws IOException, SQLException { - dumpWriter.close(); - } - - /** - * Forwards the DiffTask to the encoding modules. - * - * @param result - * Reference to a DiffTask - */ - private void writeOutput(final Task result) - { - - try { - long time, start = System.currentTimeMillis(); - dumpWriter.process(result); - - time = System.currentTimeMillis() - start; - - SQLConsumerLogMessages.logDiffProcessed(logger, result, time); - - // Output Encoding Error - } - catch (SQLConsumerException e) { - - SQLConsumerLogMessages.logSQLConsumerException(logger, e); - e.printStackTrace(); - - // Critical Exceptions - } - catch (ConfigurationException | IOException e) { - throw new RuntimeException(e); - } + extends Thread { + + /** + * Reference to the DiffTool Logger + */ + private static Logger logger; + + /** + * Reference to the Configuration + */ + private final ConfigurationManager cconfig; + + /** + * Configuration Parameter - Statistical output flag + */ + private boolean MODE_STATISTICAL_OUTPUT; + + /** + * (Constructor) Creates a DiffToolThread object. + * + * @param config Reference to the configuration + * @throws LoggingException if an error occurs while logging + */ + public DiffToolThread(final ConfigSettings config) + throws LoggingException { + + this.cconfig = new ConfigurationManager(config); + + try { + MODE_STATISTICAL_OUTPUT = (Boolean) cconfig + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + } catch (ConfigurationException e) { + MODE_STATISTICAL_OUTPUT = false; } + logger = LoggingFactory.createLogger(LoggerType.DIFF_TOOL, "DiffTool"); + } + + /** + * This class is used to receive tasks from the diff modules and transmits + * them to the sql modules. 
+ */ + private class TaskTransmitter + implements TaskTransmitterInterface { + + /** + * Reference to the (dump) output writer + */ + private final WriterInterface dumpWriter; + + /** + * Configuration Parameter - Output mode + */ + private final OutputType MODE_OUTPUT; + + /** + * Configuration Parameter - Statistical output flag + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Configuration Parameter - Datafile output flasg + */ + private final boolean MODE_DATAFILE_OUTPUT; + + /** + * (Constructor) Creates a TaskTransmitter object. + * + * @throws ConfigurationException if an error occurs while accessing the configuration + * @throws IOException if an error occurs while writing the output + * @throws LoggingException if an error occurs while logging + */ + public TaskTransmitter() + throws ConfigurationException, IOException, LoggingException { + + ConfigurationManager config = ConfigurationManager.getInstance(); + + MODE_OUTPUT = (OutputType) config + .getConfigParameter(ConfigurationKeys.MODE_OUTPUT); + + MODE_STATISTICAL_OUTPUT = (Boolean) cconfig + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + + MODE_DATAFILE_OUTPUT = (Boolean) cconfig + .getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT); + + switch (MODE_OUTPUT) { + + case UNCOMPRESSED: + if (MODE_DATAFILE_OUTPUT) { + this.dumpWriter = new DataFileWriter("output"); + } else { + if (MODE_STATISTICAL_OUTPUT) { + this.dumpWriter = new TimedSQLFileWriter("output", logger); + } else { + this.dumpWriter = new SQLFileWriter("output", logger); + } + } + break; + + case SEVENZIP: + case BZIP2: + case ALTERNATE: + if (MODE_DATAFILE_OUTPUT) { + this.dumpWriter = new DataFileArchiveWriter("output"); + } else { + if (MODE_STATISTICAL_OUTPUT) { + this.dumpWriter = new TimedSQLArchiveWriter("output", logger); + } else { + this.dumpWriter = new SQLArchiveWriter("output", logger); + } + } + break; + + case DATABASE: + if (MODE_DATAFILE_OUTPUT) { + throw ErrorFactory + 
.createConfigurationException(ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); + } else { + if (MODE_STATISTICAL_OUTPUT) { + this.dumpWriter = new TimedSQLDatabaseWriter(logger); + } else { + this.dumpWriter = new SQLDatabaseWriter(logger); + } + } + break; + + default: + throw ErrorFactory + .createConfigurationException(ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); + } + } + + /** + * Receives a DiffTask Transmission. + */ + @Override + public void transmitDiff(final Task result) { + writeOutput(result); + } + + /** + * Receives a partial DiffTask Transmission. + */ + @Override + public void transmitPartialDiff(final Task result) { + writeOutput(result); + } + + @Override + public void close() throws IOException, SQLException { + dumpWriter.close(); + } + + /** + * Forwards the DiffTask to the encoding modules. + * + * @param result Reference to a DiffTask + */ + private void writeOutput(final Task result) { + + try { + long time, start = System.currentTimeMillis(); + dumpWriter.process(result); + + time = System.currentTimeMillis() - start; + + SQLConsumerLogMessages.logDiffProcessed(logger, result, time); + + // Output Encoding Error + } catch (SQLConsumerException e) { + + SQLConsumerLogMessages.logSQLConsumerException(logger, e); + e.printStackTrace(); + + // Critical Exceptions + } catch (ConfigurationException | IOException e) { + throw new RuntimeException(e); + } + } + + + } + + /** + * Runs the diff creation process + */ + @Override + public void run() { + + try { + ArchiveManager archives = new ArchiveManager(); + ArticleReaderInterface articleReader; + ArchiveDescription description = null; + Task task; + DiffCalculatorInterface diffCalc; + + if (MODE_STATISTICAL_OUTPUT) { + diffCalc = new TimedDiffCalculator(new TaskTransmitter()); + } else { + diffCalc = new DiffCalculator(new TaskTransmitter()); + } + + long start, time; - } - - /** - * Runs the diff creation process - */ - @Override - 
public void run() - { - - try { - ArchiveManager archives = new ArchiveManager(); - ArticleReaderInterface articleReader; - ArchiveDescription description = null; - Task task; - DiffCalculatorInterface diffCalc; - - if (MODE_STATISTICAL_OUTPUT) { - diffCalc = new TimedDiffCalculator(new TaskTransmitter()); - } - else { - diffCalc = new DiffCalculator(new TaskTransmitter()); - } - - long start, time; - - while (archives.hasArchive()) { - - - // Retrieve Archive - try { - description = archives.getArchive(); - - // initialize filter - ArticleFilter nameFilter = new ArticleFilter(); - - articleReader = InputFactory.getTaskReader(description, - nameFilter); - ArticleConsumerLogMessages.logArchiveRetrieved(logger, - description); - - // Exception while accessing the archive - } - catch (ArticleReaderException e) { - - articleReader = null; - ArticleConsumerLogMessages.logExceptionRetrieveArchive( - logger, description, e); - } - - // Process Archive - while (articleReader != null) { - try { - if (articleReader.hasNext()) { - - start = System.currentTimeMillis(); - //read the next article (may be null if filtered) - task = articleReader.next(); - time = System.currentTimeMillis() - start; - - // task will be null if the name filter removed that - // article - if (task == null) { - continue; - } - - ArticleConsumerLogMessages - .logArticleRead(logger, task, time, - articleReader.getBytePosition()); - - start = System.currentTimeMillis(); - //calculate the diff for this article version - diffCalc.process(task); - time = System.currentTimeMillis() - start; - - DiffConsumerLogMessages.logArticleProcessed(logger, - task, time); - - } - else { - ArticleConsumerLogMessages.logNoMoreArticles( - logger, description); - articleReader = null; - } - - // Reset current article - } - catch (ArticleReaderException e) { - - ArticleConsumerLogMessages.logTaskReaderException( - logger, e); - articleReader.resetTaskCompleted(); - - } - catch (DiffException e) { - - 
DiffConsumerLogMessages.logDiffException(logger, e); - articleReader.resetTaskCompleted(); - diffCalc.reset(); - } - } - } - diffCalc.closeTransmitter(); - - ArticleConsumerLogMessages.logNoMoreArchives(logger); - - // Critical Exceptions - } - catch (Exception e) { - DiffToolLogMessages.logException(logger, e); - throw new RuntimeException(e); - } - } + while (archives.hasArchive()) { + + + // Retrieve Archive + try { + description = archives.getArchive(); + + // initialize filter + ArticleFilter nameFilter = new ArticleFilter(); + + articleReader = InputFactory.getTaskReader(description, + nameFilter); + ArticleConsumerLogMessages.logArchiveRetrieved(logger, + description); + + // Exception while accessing the archive + } catch (ArticleReaderException e) { + + articleReader = null; + ArticleConsumerLogMessages.logExceptionRetrieveArchive( + logger, description, e); + } + + // Process Archive + while (articleReader != null) { + try { + if (articleReader.hasNext()) { + + start = System.currentTimeMillis(); + //read the next article (may be null if filtered) + task = articleReader.next(); + time = System.currentTimeMillis() - start; + + // task will be null if the name filter removed that + // article + if (task == null) { + continue; + } + + ArticleConsumerLogMessages + .logArticleRead(logger, task, time, + articleReader.getBytePosition()); + + start = System.currentTimeMillis(); + //calculate the diff for this article version + diffCalc.process(task); + time = System.currentTimeMillis() - start; + + DiffConsumerLogMessages.logArticleProcessed(logger, + task, time); + + } else { + ArticleConsumerLogMessages.logNoMoreArticles( + logger, description); + articleReader = null; + } + + // Reset current article + } catch (ArticleReaderException e) { + + ArticleConsumerLogMessages.logTaskReaderException( + logger, e); + articleReader.resetTaskCompleted(); + + } catch (DiffException e) { + + DiffConsumerLogMessages.logDiffException(logger, e); + 
articleReader.resetTaskCompleted(); + diffCalc.reset(); + } + } + } + diffCalc.closeTransmitter(); + + ArticleConsumerLogMessages.logNoMoreArchives(logger); + + // Critical Exceptions + } catch (Exception e) { + DiffToolLogMessages.logException(logger, e); + throw new RuntimeException(e); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationKeys.java index e2c00276..e2d0f1e9 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationKeys.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,282 +19,277 @@ /** * Contains all applicable keys for the configuration file. - * - * - * */ -public enum ConfigurationKeys -{ - - /* - * +DIVERSES+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - - /** - * Type: SurrogateModes Used by: DiffCalculator, RevisionApi - *

- * Description: Surrogate Mode - */ - MODE_SURROGATES, - - /** - * Type: Integer Used by: SQLEncoder - *

- * Description: MaxAllowedPacket variable of the MySQL Server - */ - LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, - - /** - * Type: OutputMode Used by: SQLConsumer - *

- * Description: Output Mode - */ - MODE_OUTPUT, - - /** - * Type: boolean Used by: RevisionApi - *

- * Description: Enables the zip compression - */ - MODE_ZIP_COMPRESSION_ENABLED, - - /** - * Type: boolean Used by: RevisionApi - *

- * Description: Enables the binary output - */ - MODE_BINARY_OUTPUT_ENABLED, - - /** - * Type: boolean Used by: All Consumers and the processing components - *

- * Description: Enables the statistical output - */ - MODE_STATISTICAL_OUTPUT, - - /** - * Type: boolean - *

- * Description: Write datafiles instead of SQL dumps - */ - MODE_DATAFILE_OUTPUT, - - /** - * Type: boolean Used by: All Consumers and the processing components - *

- * Description: Enables the debug output - */ - MODE_DEBUG_OUTPUT, - - /** - * Type: String Used by: everybody - *

- * Description: Charset name of the input data - *

- * Recommendation / Default: "UTF-8" - */ - WIKIPEDIA_ENCODING, - - /** - * Type: Integer Range: > 1 Used by: DiffConsumers - Diff Generation - *

- * Description: This number indicates which revisions should be full - * revisions. - *

- * A full revision is generated if the result of the revisionCounter of the - * revision modulo COUNTER_FULL_REVISION is 0. - *

- * Recommendation / Default: Currenty a value of 1000 is used. - *

- * Example: COUNTER_FULL_REVISION = 100 - *

- * FullRevisions are all revisions with a revisionCounter % 100 == 0 0, 100, - * 200, 300, 400, ... - */ - COUNTER_FULL_REVISION, - - /** - * Type: Integer Range: > 1 Used by: DiffConsumers - Common Longest - * Substring Search - *

- * Description: This number indicates when a matching sequence between two - * revisions is considered as sequence. - *

- * Recommendation / Default: Currently 12 Value should be greater than the - * encoded size of an operation. - */ - VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, - - - /* - * +OUTPUT+VERIFICATION++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Type: String Used by: SQLConsumer - SQLFileWriter - *

- * Description: Output-Directory for the sql files - *

- * Recommendation / Default: No default value - has to be configured! - *

- * More consumers should lead to a speed-up - */ - PATH_OUTPUT_SQL_FILES, - - /** - * Type: Long Used by: SQLConsumer - SQLFileWriter - *

- * Description: Maximum size of an sql file (in bytes) - *

- * Recommendation / Default: Currently 100 MB - */ - LIMIT_SQL_FILE_SIZE, - - /** - * Type: Long Used by: SQLConsumer - SevenZipSQLWriter - *

- * Description: Maximum size of an sql archive file (in bytes) - *

- * Recommendation / Default: Currently not implemented - */ - LIMIT_SQL_ARCHIVE_SIZE, - - /** - * Type: Boolean Used by: DiffConsumer - DiffCalculator - *

- * Description: Enabels the verification of the diff generation - *

- * Recommendation / Default: Should only be used for debug purposes - */ - VERIFICATION_DIFF, - - /** - * Type: Boolean Used by: SQLConsumer - SQLFileWriter - *

- * Description: Enables the verification of the encoded revision data - *

- * Recommendation / Default: Should only be used for debug purposes - */ - VERIFICATION_ENCODING, - - - /* - * +RESOURCE+LIMITATIONS+++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Type: Long Used by: ArticleConsumers - *

- * Description: This value indicates the maximum size of an article task (in - * bytes). If the limit is reached the task will be splitted into parts. - *

- * Recommendation / Default: Currently 10 MB - *

- * USE WITH CAUTION! Large value could lead to a memory overflow - */ - LIMIT_TASK_SIZE_REVISIONS, - - /** - * Type: Long Used by: DiffConsumers - *

- * Description: This value indicates the maximum size of a diff task (in - * bytes). If the limit is reached the task will be splitted into parts. - *

- * Recommendation / Default: Currently 10 MB - *

- * USE WITH CAUTION! Large value could lead to a memory overflow - */ - LIMIT_TASK_SIZE_DIFFS, - - - /* - * +EXTERNAL+PROGRAMS++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Type: String Used by: ArticleConsumers - ArticleReader - InputFactory - *

- * Description: If you want to use 7Zip to decompress your 7z or bz2 - * archives set the corresponding path in the config file. - *

- * Recommendation / Default: not set, faster than bzip2 - */ - PATH_PROGRAM_7ZIP, - - - /* - * +UNCOMPRESSED+SERVER+SETTINGS++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - SQL_HOST, - - /** - * Type: String Used by: SQLConsumers - SQLDatabaseWriter - *

- * Description: Name of the sql database - *

- * Recommendation / Default: currently not used - */ - SQL_DATABASE, - - /** - * Type: String Used by: SQLConsumers - SQLDatabaseWriter - *

- * Description: Username of your sql producer - *

- * Recommendation / Default: currently not used - */ - SQL_USERNAME, - - /** - * Type: String Used by: SQLConsumers - SQLDatabaseWriter - *

- * Description: Password for the corresponding username - *

- * Recommendation / Default: currently not used - */ - SQL_PASSWORD, - - /* - * +LOGGING++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Type: String Used by: All Loggers - *

- * Description: Root-Directory for all logger - *

- * Recommendation / Default: "logs/" - */ - LOGGING_PATH_DIFFTOOL, - - /** - * Type: String Used by: DiffConsumer, SQLConsumer - *

- * Description: Output directory for articles with failed verifications - *

- * Recommendation / Default: "logs/" + "debug/" - */ - LOGGING_PATH_DEBUG, - - /** - * Type: {@link org.slf4j.event.Level} Used by: DiffTool Logger - *

- * Description: Log level for the diff tool logger - *

- * Recommendation / Default: Log.INFO - *

- * Note that the corresponding output directory for the logger has to exist - * when the LogLevel is not Level.OFF - */ - LOGGING_LOGLEVEL_DIFFTOOL, - - /** - * Type: java.util.Set Used by: ArticleFilter - *

- * Description: The Set of namespaces to keep in output - * - */ - NAMESPACES_TO_KEEP +public enum ConfigurationKeys { + + /* + * +DIVERSES+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + + /** + * Type: SurrogateModes Used by: DiffCalculator, RevisionApi + *

+ * Description: Surrogate Mode + */ + MODE_SURROGATES, + + /** + * Type: Integer Used by: SQLEncoder + *

+ * Description: MaxAllowedPacket variable of the MySQL Server + */ + LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, + + /** + * Type: OutputMode Used by: SQLConsumer + *

+ * Description: Output Mode + */ + MODE_OUTPUT, + + /** + * Type: boolean Used by: RevisionApi + *

+ * Description: Enables the zip compression + */ + MODE_ZIP_COMPRESSION_ENABLED, + + /** + * Type: boolean Used by: RevisionApi + *

+ * Description: Enables the binary output + */ + MODE_BINARY_OUTPUT_ENABLED, + + /** + * Type: boolean Used by: All Consumers and the processing components + *

+ * Description: Enables the statistical output + */ + MODE_STATISTICAL_OUTPUT, + + /** + * Type: boolean + *

+ * Description: Write datafiles instead of SQL dumps + */ + MODE_DATAFILE_OUTPUT, + + /** + * Type: boolean Used by: All Consumers and the processing components + *

+ * Description: Enables the debug output + */ + MODE_DEBUG_OUTPUT, + + /** + * Type: String Used by: everybody + *

+ * Description: Charset name of the input data + *

+ * Recommendation / Default: "UTF-8" + */ + WIKIPEDIA_ENCODING, + + /** + * Type: Integer Range: > 1 Used by: DiffConsumers - Diff Generation + *

+ * Description: This number indicates which revisions should be full + * revisions. + *

+ * A full revision is generated if the result of the revisionCounter of the + * revision modulo COUNTER_FULL_REVISION is 0. + *

+ * Recommendation / Default: Currently a value of 1000 is used. + *

+ * Example: COUNTER_FULL_REVISION = 100 + *

+ * FullRevisions are all revisions with a revisionCounter % 100 == 0, i.e. 0, 100, + * 200, 300, 400, ... + */ + COUNTER_FULL_REVISION, + + /** + * Type: Integer Range: > 1 Used by: DiffConsumers - Longest Common + * Substring Search + *

+ * Description: This number indicates when a matching sequence between two + * revisions is considered as sequence. + *

+ * Recommendation / Default: Currently 12 Value should be greater than the + * encoded size of an operation. + */ + VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, + + + /* + * +OUTPUT+VERIFICATION++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type: String Used by: SQLConsumer - SQLFileWriter + *

+ * Description: Output-Directory for the sql files + *

+ * Recommendation / Default: No default value - has to be configured! + *

+ * More consumers should lead to a speed-up + */ + PATH_OUTPUT_SQL_FILES, + + /** + * Type: Long Used by: SQLConsumer - SQLFileWriter + *

+ * Description: Maximum size of an sql file (in bytes) + *

+ * Recommendation / Default: Currently 100 MB + */ + LIMIT_SQL_FILE_SIZE, + + /** + * Type: Long Used by: SQLConsumer - SevenZipSQLWriter + *

+ * Description: Maximum size of an sql archive file (in bytes) + *

+ * Recommendation / Default: Currently not implemented + */ + LIMIT_SQL_ARCHIVE_SIZE, + + /** + * Type: Boolean Used by: DiffConsumer - DiffCalculator + *

+ * Description: Enables the verification of the diff generation + *

+ * Recommendation / Default: Should only be used for debug purposes + */ + VERIFICATION_DIFF, + + /** + * Type: Boolean Used by: SQLConsumer - SQLFileWriter + *

+ * Description: Enables the verification of the encoded revision data + *

+ * Recommendation / Default: Should only be used for debug purposes + */ + VERIFICATION_ENCODING, + + + /* + * +RESOURCE+LIMITATIONS+++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type: Long Used by: ArticleConsumers + *

+ * Description: This value indicates the maximum size of an article task (in + * bytes). If the limit is reached the task will be split into parts. + *

+ * Recommendation / Default: Currently 10 MB + *

+ * USE WITH CAUTION! Large value could lead to a memory overflow + */ + LIMIT_TASK_SIZE_REVISIONS, + + /** + * Type: Long Used by: DiffConsumers + *

+ * Description: This value indicates the maximum size of a diff task (in + * bytes). If the limit is reached the task will be split into parts. + *

+ * Recommendation / Default: Currently 10 MB + *

+ * USE WITH CAUTION! Large value could lead to a memory overflow + */ + LIMIT_TASK_SIZE_DIFFS, + + + /* + * +EXTERNAL+PROGRAMS++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type: String Used by: ArticleConsumers - ArticleReader - InputFactory + *

+ * Description: If you want to use 7Zip to decompress your 7z or bz2 + * archives set the corresponding path in the config file. + *

+ * Recommendation / Default: not set, faster than bzip2 + */ + PATH_PROGRAM_7ZIP, + + + /* + * +UNCOMPRESSED+SERVER+SETTINGS++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + SQL_HOST, + + /** + * Type: String Used by: SQLConsumers - SQLDatabaseWriter + *

+ * Description: Name of the sql database + *

+ * Recommendation / Default: currently not used + */ + SQL_DATABASE, + + /** + * Type: String Used by: SQLConsumers - SQLDatabaseWriter + *

+ * Description: Username of your sql producer + *

+ * Recommendation / Default: currently not used + */ + SQL_USERNAME, + + /** + * Type: String Used by: SQLConsumers - SQLDatabaseWriter + *

+ * Description: Password for the corresponding username + *

+ * Recommendation / Default: currently not used + */ + SQL_PASSWORD, + + /* + * +LOGGING++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type: String Used by: All Loggers + *

+ * Description: Root-Directory for all loggers + *

+ * Recommendation / Default: "logs/" + */ + LOGGING_PATH_DIFFTOOL, + + /** + * Type: String Used by: DiffConsumer, SQLConsumer + *

+ * Description: Output directory for articles with failed verifications + *

+ * Recommendation / Default: "logs/" + "debug/" + */ + LOGGING_PATH_DEBUG, + + /** + * Type: {@link org.slf4j.event.Level} Used by: DiffTool Logger + *

+ * Description: Log level for the diff tool logger + *

+ * Recommendation / Default: Log.INFO + *

+ * Note that the corresponding output directory for the logger has to exist + * when the LogLevel is not Level.OFF + */ + LOGGING_LOGLEVEL_DIFFTOOL, + + /** + * Type: java.util.Set Used by: ArticleFilter + *

+ * Description: The Set of namespaces to keep in output + */ + NAMESPACES_TO_KEEP } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationManager.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationManager.java index 4f9e7379..0e3f384c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationManager.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationManager.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,117 +28,98 @@ /** * Singleton - Manages the configuration settings for the DiffTool. - * - * - * */ -public class ConfigurationManager -{ +public class ConfigurationManager { - /** Reference to the created instance */ - private static ConfigurationManager instance; + /** + * Reference to the created instance + */ + private static ConfigurationManager instance; - /** - * Returns the reference to the instance of the ConfigurationManager. - * - * @return ConfigurationManager - * - * @throws ConfigurationException - * if the ConfigurationManager has not been created during the - * startup of the application. 
- */ - public static ConfigurationManager getInstance() - throws ConfigurationException - { + /** + * Returns the reference to the instance of the ConfigurationManager. + * + * @return ConfigurationManager + * @throws ConfigurationException if the ConfigurationManager has not been created during the + * startup of the application. + */ + public static ConfigurationManager getInstance() + throws ConfigurationException { - if (instance == null) { - throw ErrorFactory - .createConfigurationException(ErrorKeys.CONFIGURATION_CONFIGURATIONMANAGER_NOT_INITIALIZED); - } - return instance; - } + if (instance == null) { + throw ErrorFactory + .createConfigurationException(ErrorKeys.CONFIGURATION_CONFIGURATIONMANAGER_NOT_INITIALIZED); + } + return instance; + } - /** Reference to the ConfigurationSettings */ - private final ConfigSettings config; + /** + * Reference to the ConfigurationSettings + */ + private final ConfigSettings config; - /** - * (Constructor) Creates the Configuration Manager - This constructor should - * only be called during the startup of the DiffTool Application. - * - * @param config - * Reference to the ConfigurationSettings - */ - public ConfigurationManager(final ConfigSettings config) - { - instance = this; - this.config = config; - } + /** + * (Constructor) Creates the Configuration Manager - This constructor should + * only be called during the startup of the DiffTool Application. + * + * @param config Reference to the ConfigurationSettings + */ + public ConfigurationManager(final ConfigSettings config) { + instance = this; + this.config = config; + } - /** - * Returns the list of input archives. - * - * @return list of input archives - */ - public List getArchiveList() - { - return this.config.getArchiveList(); - } + /** + * Returns the list of input archives. + * + * @return list of input archives + */ + public List getArchiveList() { + return this.config.getArchiveList(); + } - /** - * Returns the value of the configuration parameter. 
- * - * @param configParameter - * Key for the configuration parameter. - * @return Value of the configuration parameter - * - * @throws ConfigurationException - * if the configuration value was not defined or was not set. - */ - public Object getConfigParameter(final ConfigurationKeys configParameter) - throws ConfigurationException - { + /** + * Returns the value of the configuration parameter. + * + * @param configParameter Key for the configuration parameter. + * @return Value of the configuration parameter + * @throws ConfigurationException if the configuration value was not defined or was not set. + */ + public Object getConfigParameter(final ConfigurationKeys configParameter) + throws ConfigurationException { - Object o = this.config.getConfigParameter(configParameter); - if (o != null) { - return o; - } - //return standard values for some of the parameters if they - //are not set in the configuration - //this is only done for uncritical settings, e.g. debug or logging - //For other parameters, missing settings will produce an exception - else if(configParameter==ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE){ - return Long.MAX_VALUE; - } - else if (configParameter==ConfigurationKeys.LIMIT_SQL_FILE_SIZE){ - return Long.MAX_VALUE; - } - else if (configParameter==ConfigurationKeys.MODE_STATISTICAL_OUTPUT){ - return false; - } - else if (configParameter==ConfigurationKeys.MODE_DEBUG_OUTPUT){ - return false; - } - else if (configParameter==ConfigurationKeys.VERIFICATION_ENCODING){ - return false; - } - else if (configParameter==ConfigurationKeys.VERIFICATION_DIFF){ - return false; - } - else if (configParameter==ConfigurationKeys.LOGGING_PATH_DEBUG){ - return ""; - } - else if (configParameter==ConfigurationKeys.NAMESPACES_TO_KEEP){ - return new HashSet(); - } - else if (configParameter==ConfigurationKeys.MODE_DATAFILE_OUTPUT){ - return false; - } - else{ - throw ErrorFactory - .createConfigurationException( - 
ErrorKeys.CONFIGURATION_CONFIGURATIONMANAGER_UNKNOWN_CONFIG_PARAMETER, - configParameter.toString()); - } - } + Object o = this.config.getConfigParameter(configParameter); + if (o != null) { + return o; + } + //return standard values for some of the parameters if they + //are not set in the configuration + //this is only done for uncritical settings, e.g. debug or logging + //For other parameters, missing settings will produce an exception + else if (configParameter == ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE) { + return Long.MAX_VALUE; + } else if (configParameter == ConfigurationKeys.LIMIT_SQL_FILE_SIZE) { + return Long.MAX_VALUE; + } else if (configParameter == ConfigurationKeys.MODE_STATISTICAL_OUTPUT) { + return false; + } else if (configParameter == ConfigurationKeys.MODE_DEBUG_OUTPUT) { + return false; + } else if (configParameter == ConfigurationKeys.VERIFICATION_ENCODING) { + return false; + } else if (configParameter == ConfigurationKeys.VERIFICATION_DIFF) { + return false; + } else if (configParameter == ConfigurationKeys.LOGGING_PATH_DEBUG) { + return ""; + } else if (configParameter == ConfigurationKeys.NAMESPACES_TO_KEEP) { + return new HashSet(); + } else if (configParameter == ConfigurationKeys.MODE_DATAFILE_OUTPUT) { + return false; + } else { + throw ErrorFactory + .createConfigurationException( + ErrorKeys.CONFIGURATION_CONFIGURATIONMANAGER_UNKNOWN_CONFIG_PARAMETER, + configParameter.toString()); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java index 7505adff..a1f4ff55 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java @@ -2,13 +2,13 @@ * Licensed to the 
Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -41,810 +41,819 @@ /** * This Reader reads the xml-configuration files for the DiffTool. - * */ -public class ConfigurationReader -{ +public class ConfigurationReader { + + /** + * XML tree root node + */ + private final Element root; + + /** + * Section identifier - Mode + */ + private final String SECTION_MODE = "VALUES"; + + /** + * Key identifier - Mode >> Minimum longest common substring + */ + private final String KEY_VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING = "VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING"; + + /** + * Key identifier - Mode >> full revision counter + */ + private final String KEY_COUNTER_FULL_REVISION = "COUNTER_FULL_REVISION"; + + /** + * Section identifier - Externals + */ + private final String SECTION_EXTERNALS = "EXTERNALS"; + + /** + * Key identifier - Externals >> SevenZip + */ + private final String KEY_SEVENZIP = "SEVENZIP"; + + /** + * Section identifier - Input + */ + private final String SECTION_INPUT = "INPUT"; + + /** + * Key identifier - Input >> Surrogates Mode + */ + private final String KEY_MODE_SURROGATES = "MODE_SURROGATES"; + + /** + * Key identifier - Input >> Wikipedia Encoding + */ + private final String KEY_WIKIPEDIA_ENCODING = "WIKIPEDIA_ENCODING"; + + /** + * Subsection identifier - Input -> Archive + */ 
+ private final String SUBSECTION_ARCHIVE = "ARCHIVE"; + + /** + * Key identifier - Input -> Archive >> Type + */ + private final String KEY_TYPE = "TYPE"; + + /** + * Key identifier - Input -> Archive >> Path + */ + private final String KEY_PATH = "PATH"; + + /** + * Key identifier - Input -> Archive >> Start + */ + private final String KEY_START = "START"; + + /** + * Section identifier - Output + */ + private final String SECTION_OUTPUT = "OUTPUT"; + + /** + * Key identifier - Output >> MODE + */ + private final String KEY_OUTPUT_MODE = "OUTPUT_MODE"; + + /** + * Key identifier - Output >> MODE >> UNCOMPRESSED File Size + */ + private final String KEY_LIMIT_SQL_FILE_SIZE = "LIMIT_SQL_FILE_SIZE"; + + /** + * Key identifier - Output >> Enable Datafile + */ + private final String KEY_OUTPUT_DATAFILE = "MODE_DATAFILE_OUTPUT"; + + /** + * Key identifier - Output >> MODE >> UNCOMPRESSED Archive Size + */ + private final String KEY_LIMIT_SQL_ARCHIVE_SIZE = "LIMIT_SQL_ARCHIVE_SIZE"; + + /** + * Key identifier - Output >> MODE >> Zip-Compression enabled + */ + private final String KEY_MODE_ZIP_COMPRESSION_ENABLED = "MODE_ZIP_COMPRESSION_ENABLED"; + + /** + * Key identifier - Output >> MODE >> Binary output enabled + */ + private final String KEY_MODE_BINARY_OUTPUT_ENABLED = "MODE_BINARY_OUTPUT_ENABLED"; + + /** + * Subsection identifier - Output -> UNCOMPRESSED + */ + private final String SUBSECTION_SQL = "UNCOMPRESSED"; + + /** + * Key identifier - Output -> UNCOMPRESSED >> Host + */ + private final String KEY_HOST = "HOST"; + + /** + * Key identifier - Output -> UNCOMPRESSED >> Database + */ + private final String KEY_DATABASE = "DATABASE"; + + /** + * Key identifier - Output -> UNCOMPRESSED >> User + */ + private final String KEY_USER = "USER"; + + /** + * Key identifier - Output -> UNCOMPRESSED >> Password + */ + private final String KEY_PASSWORD = "PASSWORD"; + + /** + * Section identifier - Cache + */ + private final String SECTION_CACHE = "CACHE"; + + /** + * Key 
identifier - Cache >> Task Size Revisions + */ + private final String KEY_LIMIT_TASK_SIZE_REVISIONS = "LIMIT_TASK_SIZE_REVISIONS"; + + /** + * Key identifier - Cache >> Task Size Diff + */ + private final String KEY_LIMIT_TASK_SIZE_DIFFS = "LIMIT_TASK_SIZE_DIFFS"; + + /** + * Key identifier - Cache >> SQLProducer MAXALLOWEDPACKET + */ + private final String KEY_LIMIT_SQLSERVER_MAX_ALLOWED_PACKET = "LIMIT_SQLSERVER_MAX_ALLOWED_PACKET"; + + /** + * Section identifier - Logging + */ + private final String SECTION_LOGGING = "LOGGING"; + + /** + * Section identifier - Logging >> Root folder + */ + private final String KEY_ROOT_FOLDER = "ROOT_FOLDER"; + + /** + * Subsection identifier - Logging -> DiffTool + */ + private final String SUBSUBSECTION_DIFF_TOOL = "DIFF_TOOL"; + + /** + * Key identifier - Logging -> ... >> Level + */ + private final String KEY_LOG_LEVEL = "LEVEL"; + + /** + * Key identifier - Logging -> ... >> Path + */ + private final String KEY_LOG_PATH = "PATH"; + + /** + * Section identifier - Debug + */ + private final String SECTION_DEBUG = "DEBUG"; + + /** + * Key identifier - Debug -> Output >> Verification Diff + */ + private final String KEY_VERIFICATION_DIFF = "VERIFICATION_DIFF"; + + /** + * Key identifier - Debug -> Output >> Verification Encoding + */ + private final String KEY_VERIFICATION_ENCODING = "VERIFICATION_ENCODING"; + + /** + * Key identifier - Debug -> Output >> Statistical + */ + private final String KEY_STATISTICAL_OUTPUT = "STATISTICAL_OUTPUT"; + + /** + * Subsection identifier - Debug -> Output + */ + private final String SUBSECTION_DEBUG_OUTPUT = "DEBUG_OUTPUT"; + + /** + * Key identifier - Debug -> Output >> Enabled + */ + private final String KEY_DEBUG_ENABLED = "ENABLED"; + + /** + * Key identifier - Debug -> Output >> Path + */ + private final String KEY_DEBUG_PATH = "PATH"; + + /** + * Section identifier - filter + */ + private final String SECTION_FILTER = "FILTER"; + + /** + * Subsection identifier - filter -> namespaces + 
*/ + private final String SUBSECTION_FILTER_NAMESPACES = "NAMESPACES"; + + /** + * Key identifier - filter -> namespaces >> ns + */ + private final String NAMESPACE_TO_KEEP = "NS"; + + /** + * (Constructor) Creates a new ConfigurationReader object. + * + * @param path + * @throws IOException if an error occurs while reading the file + * @throws SAXException if an error occurs while building the document + * @throws ParserConfigurationException if an error occurs while parsing the document + */ + public ConfigurationReader(final String path) + throws IOException, SAXException, ParserConfigurationException { + + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + + DocumentBuilder loader = factory.newDocumentBuilder(); + + Document document = loader.parse(path); + root = document.getDocumentElement(); + } + + /** + * Reads the input of the configuration file and parses the into the + * ConfigSettings object. + * + * @return ConfigSettings + */ + public ConfigSettings read() { + + ConfigSettings config = new ConfigSettings(ConfigEnum.IMPORT); + + String name; + Node node; + NodeList list = root.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + node = list.item(i); + + name = node.getNodeName().toUpperCase(); + + if (name.equals(SECTION_MODE)) { + parseModeConfig(node, config); + } else if (name.equals(SECTION_EXTERNALS)) { + parseExternalsConfig(node, config); + } else if (name.equals(SECTION_INPUT)) { + parseInputConfig(node, config); + } else if (name.equals(SECTION_OUTPUT)) { + parseOutputConfig(node, config); + } else if (name.equals(SECTION_CACHE)) { + parseCacheConfig(node, config); + } else if (name.equals(SECTION_LOGGING)) { + parseLoggingConfig(node, config); + } else if (name.equals(SECTION_DEBUG)) { + parseDebugConfig(node, config); + } else if (name.equals(SECTION_FILTER)) { + parseFilterConfig(node, config); + } + } + + return config; + } + + + /** + * Parses the filter parameter section. 
+ * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseFilterConfig(final Node node, final ConfigSettings config) { + + String name; + Node nnode; + final NodeList list = node.getChildNodes(); + final int length = list.getLength(); + + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + + if (name.equals(SUBSECTION_FILTER_NAMESPACES)) { + parseNamespaceFilterConfig(nnode, config); + } + + } + } + + /** + * Parses the namespaces parameter section. This is the subsection of filter. + * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseNamespaceFilterConfig(final Node node, final ConfigSettings config) { + String name; + Integer value; + Node nnode; + final NodeList list = node.getChildNodes(); + final int length = list.getLength(); + final Set namespaces = new HashSet<>(); + + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(NAMESPACE_TO_KEEP)) { + + value = Integer.parseInt(nnode.getChildNodes().item(0) + .getNodeValue()); + namespaces.add(value); + + + } + + } + + config.setConfigParameter( + ConfigurationKeys.NAMESPACES_TO_KEEP, + namespaces); + + } + + + /** + * Parses the mode parameter section. 
+ * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseModeConfig(final Node node, final ConfigSettings config) { + + String name; + Integer value; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING)) { + + value = Integer.parseInt(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, + value); + + } else if (name.equals(KEY_COUNTER_FULL_REVISION)) { + + value = Integer.parseInt(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.COUNTER_FULL_REVISION, value); + + } + } + } + + /** + * Parses the externals parameter section. + * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseExternalsConfig(final Node node, + final ConfigSettings config) { + + String name, value; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_SEVENZIP)) { + + value = nnode.getChildNodes().item(0).getNodeValue(); + value = value.substring(1, value.length() - 1); + + config.setConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP, + value); + + } + } + } + + /** + * Parses the input parameter section. 
+ * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseInputConfig(final Node node, final ConfigSettings config) { + + String name, value; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_WIKIPEDIA_ENCODING)) { + + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING, + value); + + } else if (name.equals(KEY_MODE_SURROGATES)) { + + SurrogateModes oValue = SurrogateModes.parse(nnode + .getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.MODE_SURROGATES, + oValue); + + } else if (name.equals(SUBSECTION_ARCHIVE)) { - /** XML tree root node */ - private final Element root; + parseInputArchive(nnode, config); + + } + } + } - /** Section identifier - Mode */ - private final String SECTION_MODE = "VALUES"; + /** + * Parses the input archive subsection. 
+ * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseInputArchive(final Node node, final ConfigSettings config) { - /** Key identifier - Mode >> Minimum longest common substring */ - private final String KEY_VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING = "VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING"; + String name; - /** Key identifier - Mode >> full revision counter */ - private final String KEY_COUNTER_FULL_REVISION = "COUNTER_FULL_REVISION"; + InputType type = null; + String path = null; + long startPosition = 0; - /** Section identifier - Externals */ - private final String SECTION_EXTERNALS = "EXTERNALS"; + Node nnode; + NodeList list = node.getChildNodes(); - /** Key identifier - Externals >> SevenZip */ - private final String KEY_SEVENZIP = "SEVENZIP"; + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - /** Section identifier - Input */ - private final String SECTION_INPUT = "INPUT"; + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_TYPE)) { - /** Key identifier - Input >> Surrogates Mode */ - private final String KEY_MODE_SURROGATES = "MODE_SURROGATES"; + type = InputType.parse(nnode.getChildNodes().item(0) + .getNodeValue()); - /** Key identifier - Input >> Wikipedia Encoding */ - private final String KEY_WIKIPEDIA_ENCODING = "WIKIPEDIA_ENCODING"; + } else if (name.equals(KEY_PATH)) { - /** Subsection identifier - Input -> Archive */ - private final String SUBSECTION_ARCHIVE = "ARCHIVE"; + path = nnode.getChildNodes().item(0).getNodeValue(); + path = path.substring(1, path.length() - 1); - /** Key identifier - Input -> Archive >> Type */ - private final String KEY_TYPE = "TYPE"; + } else if (name.equals(KEY_START)) { - /** Key identifier - Input -> Archive >> Path */ - private final String KEY_PATH = "PATH"; + startPosition = Long.parseLong(nnode.getChildNodes().item(0) + .getNodeValue()); - /** Key identifier - Input -> Archive 
>> Start */ - private final String KEY_START = "START"; + } + } - /** Section identifier - Output */ - private final String SECTION_OUTPUT = "OUTPUT"; + if (type == null || path == null) { + throw new IllegalArgumentException("Illegal Archive Description"); + } - /** Key identifier - Output >> MODE */ - private final String KEY_OUTPUT_MODE = "OUTPUT_MODE"; + ArchiveDescription archive = new ArchiveDescription(type, path); + if (startPosition > 0) { + archive.setStartPosition(startPosition); + } - /** Key identifier - Output >> MODE >> UNCOMPRESSED File Size */ - private final String KEY_LIMIT_SQL_FILE_SIZE = "LIMIT_SQL_FILE_SIZE"; + config.add(archive); + } - /** Key identifier - Output >> Enable Datafile */ - private final String KEY_OUTPUT_DATAFILE = "MODE_DATAFILE_OUTPUT"; + /** + * Parses the output parameter section. + * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseOutputConfig(final Node node, final ConfigSettings config) { - /** Key identifier - Output >> MODE >> UNCOMPRESSED Archive Size */ - private final String KEY_LIMIT_SQL_ARCHIVE_SIZE = "LIMIT_SQL_ARCHIVE_SIZE"; + String name; + Long lValue; + Boolean bValue; + Node nnode; + NodeList list = node.getChildNodes(); - /** Key identifier - Output >> MODE >> Zip-Compression enabled */ - private final String KEY_MODE_ZIP_COMPRESSION_ENABLED = "MODE_ZIP_COMPRESSION_ENABLED"; + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - /** Key identifier - Output >> MODE >> Binary output enabled */ - private final String KEY_MODE_BINARY_OUTPUT_ENABLED = "MODE_BINARY_OUTPUT_ENABLED"; + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_OUTPUT_MODE)) { - /** Subsection identifier - Output -> UNCOMPRESSED */ - private final String SUBSECTION_SQL = "UNCOMPRESSED"; + OutputType oValue = OutputType.parse(nnode.getChildNodes() + .item(0).getNodeValue()); + 
config.setConfigParameter(ConfigurationKeys.MODE_OUTPUT, oValue); - /** Key identifier - Output -> UNCOMPRESSED >> Host */ - private final String KEY_HOST = "HOST"; + } else if (name.equals(KEY_PATH)) { - /** Key identifier - Output -> UNCOMPRESSED >> Database */ - private final String KEY_DATABASE = "DATABASE"; + String path = nnode.getChildNodes().item(0).getNodeValue(); + path = path.substring(1, path.length() - 1); - /** Key identifier - Output -> UNCOMPRESSED >> User */ - private final String KEY_USER = "USER"; + config.setConfigParameter( + ConfigurationKeys.PATH_OUTPUT_SQL_FILES, path); - /** Key identifier - Output -> UNCOMPRESSED >> Password */ - private final String KEY_PASSWORD = "PASSWORD"; + } else if (name.equals(KEY_OUTPUT_DATAFILE)) { + bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.MODE_DATAFILE_OUTPUT, bValue); + } else if (name.equals(KEY_LIMIT_SQL_FILE_SIZE)) { - /** Section identifier - Cache */ - private final String SECTION_CACHE = "CACHE"; + lValue = Long.parseLong(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.LIMIT_SQL_FILE_SIZE, lValue); - /** Key identifier - Cache >> Task Size Revisions */ - private final String KEY_LIMIT_TASK_SIZE_REVISIONS = "LIMIT_TASK_SIZE_REVISIONS"; + } else if (name.equals(KEY_LIMIT_SQL_ARCHIVE_SIZE)) { - /** Key identifier - Cache >> Task Size Diff */ - private final String KEY_LIMIT_TASK_SIZE_DIFFS = "LIMIT_TASK_SIZE_DIFFS"; + lValue = Long.parseLong(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE, lValue); - /** Key identifier - Cache >> SQLProducer MAXALLOWEDPACKET */ - private final String KEY_LIMIT_SQLSERVER_MAX_ALLOWED_PACKET = "LIMIT_SQLSERVER_MAX_ALLOWED_PACKET"; + } else if (name.equals(KEY_MODE_ZIP_COMPRESSION_ENABLED)) { - /** Section identifier - Logging */ - private final String SECTION_LOGGING 
= "LOGGING"; + bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED, bValue); - /** Section identifier - Logging >> Root folder */ - private final String KEY_ROOT_FOLDER = "ROOT_FOLDER"; + } else if (name.equals(KEY_MODE_BINARY_OUTPUT_ENABLED)) { - /** Subsection identifier - Logging -> DiffTool */ - private final String SUBSUBSECTION_DIFF_TOOL = "DIFF_TOOL"; + bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.MODE_BINARY_OUTPUT_ENABLED, bValue); - /** Key identifier - Logging -> ... >> Level */ - private final String KEY_LOG_LEVEL = "LEVEL"; + } else if (name.equals(SUBSECTION_SQL)) { - /** Key identifier - Logging -> ... >> Path */ - private final String KEY_LOG_PATH = "PATH"; + parseSQLConfig(nnode, config); - /** Section identifier - Debug */ - private final String SECTION_DEBUG = "DEBUG"; + } + } + } - /** Key identifier - Debug -> Output >> Verification Diff */ - private final String KEY_VERIFICATION_DIFF = "VERIFICATION_DIFF"; + /** + * Parses the sql parameter section. 
+ * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseSQLConfig(final Node node, final ConfigSettings config) { - /** Key identifier - Debug -> Output >> Verification Encoding */ - private final String KEY_VERIFICATION_ENCODING = "VERIFICATION_ENCODING"; + String name, value; + Node nnode; + NodeList list = node.getChildNodes(); - /** Key identifier - Debug -> Output >> Statistical */ - private final String KEY_STATISTICAL_OUTPUT = "STATISTICAL_OUTPUT"; + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - /** Subsection identifier - Debug -> Output */ - private final String SUBSECTION_DEBUG_OUTPUT = "DEBUG_OUTPUT"; + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_HOST)) { - /** Key identifier - Debug -> Output >> Enabled */ - private final String KEY_DEBUG_ENABLED = "ENABLED"; + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.SQL_HOST, value); - /** Key identifier - Debug -> Output >> Path */ - private final String KEY_DEBUG_PATH = "PATH"; + } else if (name.equals(KEY_DATABASE)) { - /** Section identifier - filter */ - private final String SECTION_FILTER = "FILTER"; + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.SQL_DATABASE, value); + + } else if (name.equals(KEY_USER)) { + + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.SQL_USERNAME, value); - /** Subsection identifier - filter -> namespaces */ - private final String SUBSECTION_FILTER_NAMESPACES = "NAMESPACES"; + } else if (name.equals(KEY_PASSWORD)) { - /** Key identifier - filter -> namespaces >> ns */ - private final String NAMESPACE_TO_KEEP = "NS"; + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.SQL_PASSWORD, value); + } + } + } - /** - * (Constructor) Creates 
a new ConfigurationReader object. - * - * @param path - * - * @throws IOException - * if an error occurs while reading the file - * @throws SAXException - * if an error occurs while building the document - * - * @throws ParserConfigurationException - * if an error occurs while parsing the document - */ - public ConfigurationReader(final String path) - throws IOException, SAXException, ParserConfigurationException - { - - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - - DocumentBuilder loader = factory.newDocumentBuilder(); - - Document document = loader.parse(path); - root = document.getDocumentElement(); - } - - /** - * Reads the input of the configuration file and parses the into the - * ConfigSettings object. - * - * @return ConfigSettings - */ - public ConfigSettings read() - { - - ConfigSettings config = new ConfigSettings(ConfigEnum.IMPORT); - - String name; - Node node; - NodeList list = root.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - node = list.item(i); - - name = node.getNodeName().toUpperCase(); - - if (name.equals(SECTION_MODE)) { - parseModeConfig(node, config); - } - else if (name.equals(SECTION_EXTERNALS)) { - parseExternalsConfig(node, config); - } - else if (name.equals(SECTION_INPUT)) { - parseInputConfig(node, config); - } - else if (name.equals(SECTION_OUTPUT)) { - parseOutputConfig(node, config); - } - else if (name.equals(SECTION_CACHE)) { - parseCacheConfig(node, config); - } - else if (name.equals(SECTION_LOGGING)) { - parseLoggingConfig(node, config); - } - else if (name.equals(SECTION_DEBUG)) { - parseDebugConfig(node, config); - } - else if (name.equals(SECTION_FILTER)) { - parseFilterConfig(node, config); - } - } - - return config; - } - - - /** - * Parses the filter parameter section. 
- * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseFilterConfig(final Node node, final ConfigSettings config) - { - - String name; - Node nnode; - final NodeList list = node.getChildNodes(); - final int length = list.getLength(); - - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - - if (name.equals(SUBSECTION_FILTER_NAMESPACES)) { - parseNamespaceFilterConfig(nnode, config); - } - - } - } - - /** - * Parses the namespaces parameter section. This is the subsection of filter. - * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseNamespaceFilterConfig(final Node node, final ConfigSettings config) { - String name; - Integer value; - Node nnode; - final NodeList list = node.getChildNodes(); - final int length = list.getLength(); - final Set namespaces = new HashSet<>(); - - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(NAMESPACE_TO_KEEP)) { - - value = Integer.parseInt(nnode.getChildNodes().item(0) - .getNodeValue()); - namespaces.add(value); - - - } - - } - - config.setConfigParameter( - ConfigurationKeys.NAMESPACES_TO_KEEP, - namespaces); - - } - - - /** - * Parses the mode parameter section. 
- * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseModeConfig(final Node node, final ConfigSettings config) - { - - String name; - Integer value; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING)) { - - value = Integer.parseInt(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, - value); - - } - else if (name.equals(KEY_COUNTER_FULL_REVISION)) { - - value = Integer.parseInt(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.COUNTER_FULL_REVISION, value); - - } - } - } - - /** - * Parses the externals parameter section. - * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseExternalsConfig(final Node node, - final ConfigSettings config) - { - - String name, value; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_SEVENZIP)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - value = value.substring(1, value.length() - 1); - - config.setConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP, - value); - - } - } - } + /** + * Parses the cache parameter section. + * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseCacheConfig(final Node node, final ConfigSettings config) { - /** - * Parses the input parameter section. 
- * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseInputConfig(final Node node, final ConfigSettings config) - { - - String name, value; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_WIKIPEDIA_ENCODING)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING, - value); - - } - else if (name.equals(KEY_MODE_SURROGATES)) { - - SurrogateModes oValue = SurrogateModes.parse(nnode - .getChildNodes().item(0).getNodeValue()); - config.setConfigParameter(ConfigurationKeys.MODE_SURROGATES, - oValue); - - } - else if (name.equals(SUBSECTION_ARCHIVE)) { - - parseInputArchive(nnode, config); - - } - } - } - - /** - * Parses the input archive subsection. - * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseInputArchive(final Node node, final ConfigSettings config) - { - - String name; - - InputType type = null; - String path = null; - long startPosition = 0; - - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_TYPE)) { - - type = InputType.parse(nnode.getChildNodes().item(0) - .getNodeValue()); - - } - else if (name.equals(KEY_PATH)) { - - path = nnode.getChildNodes().item(0).getNodeValue(); - path = path.substring(1, path.length() - 1); - - } - else if (name.equals(KEY_START)) { - - startPosition = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - - } - } - - if (type == null || path == null) { - throw new IllegalArgumentException("Illegal Archive Description"); - } - - 
ArchiveDescription archive = new ArchiveDescription(type, path); - if (startPosition > 0) { - archive.setStartPosition(startPosition); - } - - config.add(archive); - } - - /** - * Parses the output parameter section. - * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseOutputConfig(final Node node, final ConfigSettings config) - { - - String name; - Long lValue; - Boolean bValue; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + String name; + Long lValue; + Node nnode; + NodeList list = node.getChildNodes(); - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_OUTPUT_MODE)) { + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_LIMIT_TASK_SIZE_REVISIONS)) { + + lValue = Long.parseLong(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS, lValue); + + } else if (name.equals(KEY_LIMIT_TASK_SIZE_DIFFS)) { + + lValue = Long.parseLong(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS, lValue); + + } else if (name.equals(KEY_LIMIT_SQLSERVER_MAX_ALLOWED_PACKET)) { + + lValue = Long.parseLong(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, + lValue); + + } + } + } + + /** + * Parses the logging parameter section. 
+ * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseLoggingConfig(final Node node, final ConfigSettings config) { - OutputType oValue = OutputType.parse(nnode.getChildNodes() - .item(0).getNodeValue()); - config.setConfigParameter(ConfigurationKeys.MODE_OUTPUT, oValue); - - } - else if (name.equals(KEY_PATH)) { - - String path = nnode.getChildNodes().item(0).getNodeValue(); - path = path.substring(1, path.length() - 1); - - config.setConfigParameter( - ConfigurationKeys.PATH_OUTPUT_SQL_FILES, path); - - } - else if (name.equals(KEY_OUTPUT_DATAFILE)) { - bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.MODE_DATAFILE_OUTPUT, bValue); - } - else if (name.equals(KEY_LIMIT_SQL_FILE_SIZE)) { - - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_SQL_FILE_SIZE, lValue); - - } - else if (name.equals(KEY_LIMIT_SQL_ARCHIVE_SIZE)) { - - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE, lValue); - - } - else if (name.equals(KEY_MODE_ZIP_COMPRESSION_ENABLED)) { - - bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED, bValue); - - } - else if (name.equals(KEY_MODE_BINARY_OUTPUT_ENABLED)) { - - bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.MODE_BINARY_OUTPUT_ENABLED, bValue); - - } - else if (name.equals(SUBSECTION_SQL)) { - - parseSQLConfig(nnode, config); - - } - } - } - - /** - * Parses the sql parameter section. 
- * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseSQLConfig(final Node node, final ConfigSettings config) - { - - String name, value; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_HOST)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.SQL_HOST, value); - - } - else if (name.equals(KEY_DATABASE)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.SQL_DATABASE, value); - - } - else if (name.equals(KEY_USER)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.SQL_USERNAME, value); - - } - else if (name.equals(KEY_PASSWORD)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.SQL_PASSWORD, value); - } - } - } - - /** - * Parses the cache parameter section. 
- * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseCacheConfig(final Node node, final ConfigSettings config) - { - - String name; - Long lValue; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_LIMIT_TASK_SIZE_REVISIONS)) { - - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS, lValue); - - } - else if (name.equals(KEY_LIMIT_TASK_SIZE_DIFFS)) { - - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS, lValue); - - } - else if (name.equals(KEY_LIMIT_SQLSERVER_MAX_ALLOWED_PACKET)) { - - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, - lValue); - - } - } - } - - /** - * Parses the logging parameter section. 
- * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseLoggingConfig(final Node node, final ConfigSettings config) - { - - String name; - String value; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_ROOT_FOLDER)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - value = value.substring(1, value.length() - 1); - - config.setConfigParameter( - ConfigurationKeys.LOGGING_PATH_DIFFTOOL, value); - - } - else if (name.equals(SUBSUBSECTION_DIFF_TOOL)) { - - parseLoggerConfig(nnode, config, null, - ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); - - } - } - } - - /** - * Parses the information for a logger. - * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - * @param logPath - * Key for the path of this logger. - * @param logLevel - * Key for the level of this logger. - */ - private void parseLoggerConfig(final Node node, - final ConfigSettings config, final ConfigurationKeys logPath, - final ConfigurationKeys logLevel) - { - - String name, value; - Level level; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_LOG_PATH)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - value = value.substring(1, value.length() - 1); - config.setConfigParameter(logPath, value); - - } - else if (name.equals(KEY_LOG_LEVEL)) { - - level = Level.valueOf(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter(logLevel, level); - } - } - } - - /** - * Parses the debug parameter section. 
- * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseDebugConfig(final Node node, final ConfigSettings config) - { - - String name; - Boolean value; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_VERIFICATION_DIFF)) { - - value = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter(ConfigurationKeys.VERIFICATION_DIFF, - value); - - } - else if (name.equals(KEY_VERIFICATION_ENCODING)) { - - value = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - - config.setConfigParameter( - ConfigurationKeys.VERIFICATION_ENCODING, value); - - } - else if (name.equals(KEY_STATISTICAL_OUTPUT)) { - - value = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.MODE_STATISTICAL_OUTPUT, value); - - } - else if (name.equals(SUBSECTION_DEBUG_OUTPUT)) { - - parseDebugOutputConfig(nnode, config); - } - } - } - - /** - * Parses the debug output parameter subsection. 
- * - * @param node - * Reference to the current used xml node - * @param config - * Reference to the ConfigSettings - */ - private void parseDebugOutputConfig(final Node node, - final ConfigSettings config) - { - - String name, value; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_DEBUG_PATH)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - value = value.substring(1, value.length() - 1); - - config.setConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG, - value); - - } - else if (name.equals(KEY_DEBUG_ENABLED)) { - - Boolean enabled = Boolean.parseBoolean(nnode.getChildNodes() - .item(0).getNodeValue()); - config.setConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT, - enabled); - } - } - } + String name; + String value; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_ROOT_FOLDER)) { + + value = nnode.getChildNodes().item(0).getNodeValue(); + value = value.substring(1, value.length() - 1); + + config.setConfigParameter( + ConfigurationKeys.LOGGING_PATH_DIFFTOOL, value); + + } else if (name.equals(SUBSUBSECTION_DIFF_TOOL)) { + + parseLoggerConfig(nnode, config, null, + ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); + + } + } + } + + /** + * Parses the information for a logger. + * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + * @param logPath Key for the path of this logger. + * @param logLevel Key for the level of this logger. 
+ */ + private void parseLoggerConfig(final Node node, + final ConfigSettings config, final ConfigurationKeys logPath, + final ConfigurationKeys logLevel) { + + String name, value; + Level level; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_LOG_PATH)) { + + value = nnode.getChildNodes().item(0).getNodeValue(); + value = value.substring(1, value.length() - 1); + config.setConfigParameter(logPath, value); + + } else if (name.equals(KEY_LOG_LEVEL)) { + + level = Level.valueOf(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter(logLevel, level); + } + } + } + + /** + * Parses the debug parameter section. + * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseDebugConfig(final Node node, final ConfigSettings config) { + + String name; + Boolean value; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_VERIFICATION_DIFF)) { + + value = Boolean.parseBoolean(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter(ConfigurationKeys.VERIFICATION_DIFF, + value); + + } else if (name.equals(KEY_VERIFICATION_ENCODING)) { + + value = Boolean.parseBoolean(nnode.getChildNodes().item(0) + .getNodeValue()); + + config.setConfigParameter( + ConfigurationKeys.VERIFICATION_ENCODING, value); + + } else if (name.equals(KEY_STATISTICAL_OUTPUT)) { + + value = Boolean.parseBoolean(nnode.getChildNodes().item(0) + .getNodeValue()); + config.setConfigParameter( + ConfigurationKeys.MODE_STATISTICAL_OUTPUT, value); + + } else if (name.equals(SUBSECTION_DEBUG_OUTPUT)) { + + parseDebugOutputConfig(nnode, config); + } + } + } + + /** + * 
Parses the debug output parameter subsection. + * + * @param node Reference to the current used xml node + * @param config Reference to the ConfigSettings + */ + private void parseDebugOutputConfig(final Node node, + final ConfigSettings config) { + + String name, value; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_DEBUG_PATH)) { + + value = nnode.getChildNodes().item(0).getNodeValue(); + value = value.substring(1, value.length() - 1); + + config.setConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG, + value); + + } else if (name.equals(KEY_DEBUG_ENABLED)) { + + Boolean enabled = Boolean.parseBoolean(nnode.getChildNodes() + .item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT, + enabled); + } + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/OutputTypes.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/OutputTypes.java index efa45755..516e90d4 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/OutputTypes.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/OutputTypes.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,20 +19,22 @@ /** * This class represents the enumeration of OutputTypes of the IndexGenerator. - * - * - * */ -public enum OutputTypes -{ +public enum OutputTypes { - /** Output to the Database */ - DATABASE, + /** + * Output to the Database + */ + DATABASE, - /** Output as single sql file. */ - SQL, + /** + * Output as single sql file. + */ + SQL, - /** Output as datafile. */ - DATAFILE + /** + * Output as datafile. + */ + DATAFILE } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigGUI.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigGUI.java index 9fccfd31..0a4174d6 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigGUI.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigGUI.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -41,57 +41,50 @@ * If the output mode is set to bzip2, it is currently not possible * to split the output into several files. However, the ConfigGUI allows for * this setting. - * - * - * - * - * */ public class ConfigGUI - extends JFrame -{ + extends JFrame { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; - /** Reference to the ConfigController */ - private final ConfigController controller; + /** + * Reference to the ConfigController + */ + private final ConfigController controller; - /** - * (Constructor) Creates a new ConfigGUI object. - */ - public ConfigGUI() - { + /** + * (Constructor) Creates a new ConfigGUI object. + */ + public ConfigGUI() { - this.controller = new ConfigController(); + this.controller = new ConfigController(); - this.setTitle("RevisionMachine DiffTool - Configuration"); + this.setTitle("RevisionMachine DiffTool - Configuration"); - setSize(600, 400); - setResizable(false); - setDefaultCloseOperation(EXIT_ON_CLOSE); + setSize(600, 400); + setResizable(false); + setDefaultCloseOperation(EXIT_ON_CLOSE); - Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); - setLocation((d.width - getSize().width) / 2, - (d.height - getSize().height) / 2); + Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); + setLocation((d.width - getSize().width) / 2, + (d.height - getSize().height) / 2); - this.setJMenuBar(new ConfigMenuBar(controller)); - this.setContentPane(new ConfigPanel(controller)); + this.setJMenuBar(new ConfigMenuBar(controller)); + this.setContentPane(new ConfigPanel(controller)); - //load default parameters - this.controller.defaultConfiguration(); - } + //load default parameters + this.controller.defaultConfiguration(); + } - /** - * ConfigurationTool - Main Method - *

- * Starts the ConfigurationTool GUI - * - * @param args - * program arguments (not used) - */ - public static void main(final String[] args) - { - new ConfigGUI().setVisible(true); - } + /** + * ConfigurationTool - Main Method + *

+ * Starts the ConfigurationTool GUI + * + * @param args program arguments (not used) + */ + public static void main(final String[] args) { + new ConfigGUI().setVisible(true); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigMenuBar.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigMenuBar.java index 08260ed8..6ca80e62 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigMenuBar.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigMenuBar.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,65 +25,60 @@ /** * MenuBar of the ConfigurationTool - * - * - * */ @SuppressWarnings("serial") public class ConfigMenuBar - extends JMenuBar -{ + extends JMenuBar { - /** Reference to the controller */ - private final ConfigController controller; + /** + * Reference to the controller + */ + private final ConfigController controller; - /** - * (Constructor) Create the ConfigMenuBar object. 
- * - * @param controller - * reference to the controller - */ - public ConfigMenuBar(final ConfigController controller) - { + /** + * (Constructor) Create the ConfigMenuBar object. + * + * @param controller reference to the controller + */ + public ConfigMenuBar(final ConfigController controller) { - this.controller = controller; + this.controller = controller; - createSystemMenu(); - } + createSystemMenu(); + } - /** - * Creates the System menu and its menu items. - */ - private void createSystemMenu() - { + /** + * Creates the System menu and its menu items. + */ + private void createSystemMenu() { - JMenu system = new JMenu("System"); + JMenu system = new JMenu("System"); - JMenuItem importConfig = new JMenuItem("Import Configuration"); - importConfig.addActionListener(e -> controller.loadConfiguration()); + JMenuItem importConfig = new JMenuItem("Import Configuration"); + importConfig.addActionListener(e -> controller.loadConfiguration()); - system.add(importConfig); + system.add(importConfig); - JMenuItem exportConfig = new JMenuItem("Export Configuration"); - exportConfig.addActionListener(e -> controller.saveConfiguration()); + JMenuItem exportConfig = new JMenuItem("Export Configuration"); + exportConfig.addActionListener(e -> controller.saveConfiguration()); - system.add(exportConfig); + system.add(exportConfig); - system.addSeparator(); + system.addSeparator(); - JMenuItem defaultConfig = new JMenuItem( - "Reset to default parameters"); - defaultConfig.addActionListener(e -> controller.defaultConfiguration()); + JMenuItem defaultConfig = new JMenuItem( + "Reset to default parameters"); + defaultConfig.addActionListener(e -> controller.defaultConfiguration()); - system.add(defaultConfig); + system.add(defaultConfig); - system.addSeparator(); + system.addSeparator(); - JMenuItem systemClose = new JMenuItem("Close"); - systemClose.addActionListener(e -> System.exit(-1)); + JMenuItem systemClose = new JMenuItem("Close"); + systemClose.addActionListener(e -> 
System.exit(-1)); - system.add(systemClose); + system.add(systemClose); - this.add(system); - } + this.add(system); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ArchiveRegistry.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ArchiveRegistry.java index ae6c8d21..28e72549 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ArchiveRegistry.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ArchiveRegistry.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,145 +30,126 @@ *

* Contains all input archives and represents the table model to display the * archives in the InputPanel. - * - * - * */ @SuppressWarnings("serial") public class ArchiveRegistry - extends AbstractTableModel -{ - - /** List of input archives */ - private final List archives; - - /** Name of columns */ - private final String[] columnNames; - - /** - * (Constructor) Creates a new ArchiveRegistry. - */ - public ArchiveRegistry() - { - this.columnNames = new String[] { "Input Type", "Start Position", - "Path" }; - this.archives = new ArrayList<>(); - } - - /** - * Returns the name of the specified column. - * - * @param col - * index of the column - * - * @return name of the column - */ - @Override - public String getColumnName(final int col) - { - return this.columnNames[col]; - } - - /** - * Returns the number of columns. - */ - @Override - public int getColumnCount() - { - return this.columnNames.length; - } - - /** - * Returns the number of rows. - */ - @Override - public int getRowCount() - { - return this.archives.size(); - } - - /** - * Returns the value at the specified position. - * - * @param row - * index of the row - * @param col - * index of the column - * - * @return string representation of the specified field - */ - @Override - public Object getValueAt(final int row, final int col) - { - - switch (col) { - case 0: - return archives.get(row).getType(); - case 1: - return archives.get(row).getStartPosition(); - case 2: - return archives.get(row).getPath(); - } - - return "---"; - } - - /** - * Adds an archive description. - * - * @param description - * archive description - */ - public void addArchive(final ArchiveDescription description) - { - this.archives.add(description); - } - - /** - * Removes an archive description. - * - * @param index - * index of the archive. - */ - public void removeArchive(final int index) - { - this.archives.remove(index); - } - - /** - * Returns the archive at the specified position. 
- * - * @param index - * position - * @return ArchiveDescription - */ - public ArchiveDescription get(final int index) - { - return this.archives.get(index); - } - - /** - * Deletes all contained archive descriptions. - */ - public void clear() - { - this.archives.clear(); - } - - /** - * Adds the ArchiveDescriptions contained in the configuration. - * - * @param config - * Reference to the configuration - */ - public void applyConfiguration(final ConfigSettings config) - { - - clear(); - - Iterator aIt = config.archiveIterator(); - while (aIt.hasNext()) { - addArchive(aIt.next()); - } - } + extends AbstractTableModel { + + /** + * List of input archives + */ + private final List archives; + + /** + * Name of columns + */ + private final String[] columnNames; + + /** + * (Constructor) Creates a new ArchiveRegistry. + */ + public ArchiveRegistry() { + this.columnNames = new String[]{"Input Type", "Start Position", + "Path"}; + this.archives = new ArrayList<>(); + } + + /** + * Returns the name of the specified column. + * + * @param col index of the column + * @return name of the column + */ + @Override + public String getColumnName(final int col) { + return this.columnNames[col]; + } + + /** + * Returns the number of columns. + */ + @Override + public int getColumnCount() { + return this.columnNames.length; + } + + /** + * Returns the number of rows. + */ + @Override + public int getRowCount() { + return this.archives.size(); + } + + /** + * Returns the value at the specified position. + * + * @param row index of the row + * @param col index of the column + * @return string representation of the specified field + */ + @Override + public Object getValueAt(final int row, final int col) { + + switch (col) { + case 0: + return archives.get(row).getType(); + case 1: + return archives.get(row).getStartPosition(); + case 2: + return archives.get(row).getPath(); + } + + return "---"; + } + + /** + * Adds an archive description. 
+ * + * @param description archive description + */ + public void addArchive(final ArchiveDescription description) { + this.archives.add(description); + } + + /** + * Removes an archive description. + * + * @param index index of the archive. + */ + public void removeArchive(final int index) { + this.archives.remove(index); + } + + /** + * Returns the archive at the specified position. + * + * @param index position + * @return ArchiveDescription + */ + public ArchiveDescription get(final int index) { + return this.archives.get(index); + } + + /** + * Deletes all contained archive descriptions. + */ + public void clear() { + this.archives.clear(); + } + + /** + * Adds the ArchiveDescriptions contained in the configuration. + * + * @param config Reference to the configuration + */ + public void applyConfiguration(final ConfigSettings config) { + + clear(); + + Iterator aIt = config.archiveIterator(); + while (aIt.hasNext()) { + addArchive(aIt.next()); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ComponentRegistry.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ComponentRegistry.java index 804ef6a2..ad8441d6 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ComponentRegistry.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ComponentRegistry.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,114 +26,101 @@ /** * ComponentsRegistry of the ConfigurationTool - * - * - * */ -public class ComponentRegistry -{ +public class ComponentRegistry { - /** Reference to the GUI */ - private ConfigGUI gui; + /** + * Reference to the GUI + */ + private ConfigGUI gui; - /** Map that contains references to the important panels */ - private final Map map; + /** + * Map that contains references to the important panels + */ + private final Map map; - /** - * (Constructor) Creates a ComponentRegistry. - */ - public ComponentRegistry() - { - this.map = new HashMap<>(); - } + /** + * (Constructor) Creates a ComponentRegistry. + */ + public ComponentRegistry() { + this.map = new HashMap<>(); + } - /** - * Registers the panel with the given key. - * - * @param key - * key - * @param panel - * panel - */ - public void register(final PanelKeys key, final AbstractPanel panel) - { - this.map.put(key, panel); - } + /** + * Registers the panel with the given key. + * + * @param key key + * @param panel panel + */ + public void register(final PanelKeys key, final AbstractPanel panel) { + this.map.put(key, panel); + } - /** - * Sets the reference of the GUI. - * - * @param gui - * GUI - */ - public void registerGUI(final ConfigGUI gui) - { - this.gui = gui; - } + /** + * Sets the reference of the GUI. + * + * @param gui GUI + */ + public void registerGUI(final ConfigGUI gui) { + this.gui = gui; + } - /** - * Adds the xml description of the panels content to the StringBuilder. 
- * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { + /** + * Adds the xml description of the panels content to the StringBuilder. + * Errors which occur during the xml transformation will be added to the + * ConfigVerification. + * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { - map.get(PanelKeys.PANEL_VALUES).toXML(builder, errors); - map.get(PanelKeys.PANEL_EXTERNALS).toXML(builder, errors); - map.get(PanelKeys.PANEL_INPUT).toXML(builder, errors); - map.get(PanelKeys.PANEL_OUTPUT).toXML(builder, errors); - map.get(PanelKeys.PANEL_SQL).toXML(builder, errors); - map.get(PanelKeys.PANEL_CACHE).toXML(builder, errors); - map.get(PanelKeys.PANEL_LOGGING).toXML(builder, errors); - map.get(PanelKeys.PANEL_DEBUG).toXML(builder, errors); - map.get(PanelKeys.PANEL_FILTER).toXML(builder, errors); - } + map.get(PanelKeys.PANEL_VALUES).toXML(builder, errors); + map.get(PanelKeys.PANEL_EXTERNALS).toXML(builder, errors); + map.get(PanelKeys.PANEL_INPUT).toXML(builder, errors); + map.get(PanelKeys.PANEL_OUTPUT).toXML(builder, errors); + map.get(PanelKeys.PANEL_SQL).toXML(builder, errors); + map.get(PanelKeys.PANEL_CACHE).toXML(builder, errors); + map.get(PanelKeys.PANEL_LOGGING).toXML(builder, errors); + map.get(PanelKeys.PANEL_DEBUG).toXML(builder, errors); + map.get(PanelKeys.PANEL_FILTER).toXML(builder, errors); + } - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. 
- * - * @param config - * Reference to the ConfigSettings object - */ - public void applyConfig(final ConfigSettings config) - { + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. + * + * @param config Reference to the ConfigSettings object + */ + public void applyConfig(final ConfigSettings config) { - map.get(PanelKeys.PANEL_VALUES).applyConfig(config); - map.get(PanelKeys.PANEL_EXTERNALS).applyConfig(config); - map.get(PanelKeys.PANEL_INPUT).applyConfig(config); - map.get(PanelKeys.PANEL_OUTPUT).applyConfig(config); - map.get(PanelKeys.PANEL_SQL).applyConfig(config); - map.get(PanelKeys.PANEL_CACHE).applyConfig(config); - map.get(PanelKeys.PANEL_LOGGING).applyConfig(config); - map.get(PanelKeys.PANEL_DEBUG).applyConfig(config); - map.get(PanelKeys.PANEL_FILTER).applyConfig(config); - } + map.get(PanelKeys.PANEL_VALUES).applyConfig(config); + map.get(PanelKeys.PANEL_EXTERNALS).applyConfig(config); + map.get(PanelKeys.PANEL_INPUT).applyConfig(config); + map.get(PanelKeys.PANEL_OUTPUT).applyConfig(config); + map.get(PanelKeys.PANEL_SQL).applyConfig(config); + map.get(PanelKeys.PANEL_CACHE).applyConfig(config); + map.get(PanelKeys.PANEL_LOGGING).applyConfig(config); + map.get(PanelKeys.PANEL_DEBUG).applyConfig(config); + map.get(PanelKeys.PANEL_FILTER).applyConfig(config); + } - /** - * Returns the reference of the GUI. - * - * @return reference to the GUI - */ - public ConfigGUI getGUI() - { - return this.gui; - } + /** + * Returns the reference of the GUI. + * + * @return reference to the GUI + */ + public ConfigGUI getGUI() { + return this.gui; + } - /** - * Repaints the GUI. - */ - public void repaint() - { - if (gui != null) { - gui.repaint(); - } - } + /** + * Repaints the GUI. 
+ */ + public void repaint() { + if (gui != null) { + gui.repaint(); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigController.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigController.java index 80c21265..3a5a2747 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigController.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigController.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -32,601 +32,551 @@ /** * Controller of the ConfigurationTool - * - * - * */ -public class ConfigController -{ - - /** Reference to the ArchiveRegistry */ - private final ArchiveRegistry archives; - - /** Reference to the ComponentRegistry */ - private ComponentRegistry components; - - /** Reference to the configuration */ - private final ConfigSettings config; - - /** - * Configuration settings - Flag that indicates whether the 7Zip support is - * enabled or not - */ - private boolean enable7Zip; - - /** - * Configuration settings - Flag that indicates whether debug output is - * enabled - */ - private boolean enableDebugOutput; - - /** - * Configuration settings - Flag that indicates whether diff verification is - * enabled - */ - private boolean enableDiffVerification; - - /** - * Configuration settings - Flag that indicates whether encoding - * verification is enabled - */ - private boolean enableEncodingVerification; - - /** - * Configuration settings - Flag that indicates whether the database output - * mode is enabled - */ - private boolean enableSQLDatabaseOutput; - - /** - * Configuration settings - Flag that indicates whether output should - * be a datafile instead of an sql dump - */ - private boolean enableDataFileOutput; - - /** - * Configuration settings - Flag that indicates whether statistical output - * is enabled - */ - private boolean enableStatsOutput; - - /** - * Configuration settings - Flag that indicates whether statistical output - * is enabled - */ - private boolean enableZipCompression; - - /** Reference to the ConfigVerification */ - private ConfigVerification errors; - - /** - * Configuration settings - Flag that indicates whether multiple output - * files are allowed - */ - private boolean multipleOutputFiles; - - /** Configuration settings - Output compression mode */ - private OutputCompressionEnum outputCompression; - - /** Configuration settings - Output file limit */ - private long outputFileLimit; - - /** Configuration settings 
- Surrogate Mode */ - private SurrogateModes surrogates; - - /** XML Representation of the content */ - private StringBuilder xmlConfig; - - /** - * (Constructor) Creates a new ConfigController. - */ - public ConfigController() - { - - this.components = new ComponentRegistry(); - this.archives = new ArchiveRegistry(); - - this.config = new ConfigSettings(); - - this.enable7Zip = false; - - this.outputFileLimit = -1; - this.multipleOutputFiles = false; - this.outputCompression = OutputCompressionEnum.None; - - this.enableZipCompression = true; - this.enableDebugOutput = false; - this.enableSQLDatabaseOutput = false; - - this.surrogates = SurrogateModes.DISCARD_REVISION; - - } - - /** - * Adds an archive to the archive registry. - * - * @param archive - * reference to the archive - */ - public void addArchive(final ArchiveDescription archive) - { - this.archives.addArchive(archive); - } - - /** - * Applies the configuration file. - *

- * The input settings will be ignored if a default configuration was used. - */ - private void applyConfig() - { - this.components.applyConfig(config); - - switch (config.getConfigType()) { - case DEFAULT: - break; - case IMPORT: - this.archives.applyConfiguration(config); - } - - repaint(); - } - - /** - * Creates the xml content representation of the currently used settings. - * - * @return TRUE if the ConfigVerfication contains no items, FALSE otherwise - */ - public boolean createConfigurationXML() - { - - errors = new ConfigVerification(); - xmlConfig = new StringBuilder(); - - xmlConfig.append("\r\n"); - components.toXML(xmlConfig, errors); - xmlConfig.append("\r\n"); - - if (errors.getRowCount() != 0) { - - // TODO: invoke the dialog at another place - new ConfigDialog(this).setVisible(true); - - return false; - } - - return true; - } - - /** - * Applies the default parameter to the currently loaded config - */ - public void defaultConfiguration() - { - config.defaultConfiguration(); - applyConfig(); - } - - /** - * Returns the reference to the ArchiveRegistry. - * - * @return archive registry - */ - public ArchiveRegistry getArchives() - { - return archives; - } - - /** - * Return the reference to the ConfigVerifactions. - * - * @return ConfigVerification - */ - public ConfigVerification getConfigErrors() - { - return errors; - } - - /** - * Returns the output compression mode. - * - * @return output compression mode - */ - public OutputCompressionEnum getOutputCompression() - { - return outputCompression; - } - - /** - * Returns the maximum size of an output file. - * - * @return maximum size of an output file. - */ - public long getOutputFileLimit() - { - return outputFileLimit; - } - - /** - * Returns the reference to the component registry. - * - * @return component registry - */ - public ComponentRegistry getRegistry() - { - return components; - } - - /** - * Returns the surrogate mode. 
- * - * @return surrogate mode - */ - public SurrogateModes getSurrogates() - { - return surrogates; - } - - /** - * Returns whether the 7Zip support is enabled or not. - * - * @return TRUE | FALSE - */ - public boolean is7ZipEnabled() - { - return enable7Zip; - } - - /** - * Returns whether the debug output is enabled. - * - * @return debug output flag - */ - public boolean isDebugOutputEnabled() - { - return enableDebugOutput; - } - - /** - * Returns whether the diff verification mode is enabled. - * - * @return diff verification flag - */ - public boolean isDiffVerificationEnabled() - { - return enableDiffVerification; - } - - /** - * Returns whether the database output mode is enabled. - * - * @return database output flag - */ - public boolean isEnableSQLDatabaseOutput() - { - return enableSQLDatabaseOutput; - } - - /** - * Returns whether the encoding verification mode is enabled. - * - * @return encoding verification flag - */ - public boolean isEncodingVerificationEnabled() - { - return enableEncodingVerification; - } - - /** - * Returns whether multiple output files should be used. - * - * @return multiple output files flag - */ - public boolean isMultipleOutputFiles() - { - return multipleOutputFiles; - } - - /** - * Returns whether the statistical output mode is enabled. - * - * @return statistical output flag - */ - public boolean isStatsOutputEnabled() - { - return enableStatsOutput; - } - - /** - * Returns whether the Zip-Compression is enabled or not. - * - * @return Zip-Compression flag - */ - public boolean isZipCompressionEnabled() - { - return enableZipCompression; - } - - /** - * Loads the configuration from the specified file - * - * @param path - * input file - */ - public void loadConfig(final String path) - { - config.loadConfig(path); - applyConfig(); - } - - /** - * Loads the configuration file. The path of the file will be chosen by - * displaying a FileChooser Dialog. 
- */ - public void loadConfiguration() - { - - XMLFileChooser fc = new XMLFileChooser(); - if (fc.showOpenDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { - this.loadConfig(fc.getSelectedFile().getPath()); - } - } - - /** - * Registers the panel with the given key. - * - * @param key - * key - * @param panel - * panel - */ - public void register(final PanelKeys key, final AbstractPanel panel) - { - this.components.register(key, panel); - } - - /** - * Removes the specified archive from the archive registry. - * - * @param index - * index of the archive - */ - public void removeArchive(final int index) - { - this.archives.removeArchive(index); - } - - /** - * Repaints the GUI. - */ - public void repaint() - { - this.components.repaint(); - } - - /** - * Saves the configuration file. The path of the file will be chosen by - * displaying a FileChooser Dialog. - */ - public void saveConfiguration() - { - - if (this.createConfigurationXML()) { - - XMLFileChooser fc = new XMLFileChooser(); - if (fc.showSaveDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { - - String path = fc.getSelectedFile().getPath(); - if (path.indexOf('.') == -1) { - path += ".xml"; - } - - if (this.saveConfiguration(path)) { - System.out.println("SAVE CONFIG SUCCESSFULL"); - } - else { - - System.out.println("SAVE CONFIG FAILED"); - } - } - - } - } - - /** - * Save the configuration to a file. 
- * - * @param path - * output path - * @return TRUE if the configuration was succesfully exported FALSE - * otherwise - */ - public boolean saveConfiguration(final String path) - { - - if (xmlConfig != null && !errors.hasFailed()) { - - boolean success = true; - - FileWriter writer = null; - try { - writer = new FileWriter(path); - writer.write(xmlConfig.toString()); - writer.flush(); - - } - catch (IOException ioe) { - ioe.printStackTrace(); - success = false; - } - finally { - if (writer != null) { - try { - writer.close(); - } - catch (IOException ioe) { - success = false; - } - } - } - - return success; - } - - return false; - } - - - /** - * Enables or disables the 7Zip support. - *

- * If the support is disabled the and the OutputCompression Mode was 7Zip - * the Mode will be reseted to None. - * - * @param enable7Zip - * 7Zip support flag - */ - public void setEnable7Zip(final boolean enable7Zip) - { - this.enable7Zip = enable7Zip; - if (!this.enable7Zip) { - if (outputCompression == OutputCompressionEnum.SevenZip) { - outputCompression = OutputCompressionEnum.None; - } - } - } - - /** - * Sets the debug output mode. - * - * @param enableDebugOutput - * debug output flag - */ - public void setEnableDebugOutput(final boolean enableDebugOutput) - { - this.enableDebugOutput = enableDebugOutput; - } - - /** - * Sets the diff verification mode. - * - * @param enableDiffVerification - * diff verification mode - */ - public void setEnableDiffVerification(final boolean enableDiffVerification) - { - this.enableDiffVerification = enableDiffVerification; - } - - /** - * Sets the encoding verification mode. - * - * @param enableEncodingVerification - * diff verification mode - */ - public void setEnableEncodingVerification( - final boolean enableEncodingVerification) - { - this.enableEncodingVerification = enableEncodingVerification; - } - - /** - * Sets the database output flag. - * - * @param enableSQLDatabaseOutput - * database output flag - */ - public void setEnableSQLDatabaseOutput(final boolean enableSQLDatabaseOutput) - { - this.enableSQLDatabaseOutput = enableSQLDatabaseOutput; - } - - /** - * Sets the statistical output mode. - * - * @param enableStatsOutput - * statistical output flag - */ - public void setEnableStatsOutput(final boolean enableStatsOutput) - { - this.enableStatsOutput = enableStatsOutput; - } - - /** - * Sets the Zip-Compression mode. - * - * @param enableZipCompression - * Zip-Compression flag - */ - public void setEnableZipCompression(final boolean enableZipCompression) - { - this.enableZipCompression = enableZipCompression; - } - - /** - * Sets whether multiple output files should be used. 
- * - * @param multipleOutputFiles - * multiple output files flag - */ - public void setMultipleOutputFiles(final boolean multipleOutputFiles) - { - this.multipleOutputFiles = multipleOutputFiles; - } - - /** - * Sets the output compression mode. - * - * @param outputCompression - * output compression mode - */ - public void setOutputCompression( - final OutputCompressionEnum outputCompression) - { - this.outputCompression = outputCompression; - } - - /** - * Sets the maximum size of an output file. - * - * @param outputFileLimit - * maximum size of an output file - */ - public void setOutputFileLimit(final long outputFileLimit) - { - this.outputFileLimit = outputFileLimit; - } - - /** - * Sets the reference to the component registry. - * - * @param registry - * component registry - */ - public void setRegistry(final ComponentRegistry registry) - { - this.components = registry; - } - - /** - * Sets the surrogate mode. - * - * @param surrogates - * surrogate mode - */ - public void setSurrogates(final SurrogateModes surrogates) - { - this.surrogates = surrogates; - } - - public boolean isEnableDataFileOutput() - { - return enableDataFileOutput; - } - - public void setEnableDataFileOutput(boolean enableDataFileOutput) - { - this.enableDataFileOutput = enableDataFileOutput; - } +public class ConfigController { + + /** + * Reference to the ArchiveRegistry + */ + private final ArchiveRegistry archives; + + /** + * Reference to the ComponentRegistry + */ + private ComponentRegistry components; + + /** + * Reference to the configuration + */ + private final ConfigSettings config; + + /** + * Configuration settings - Flag that indicates whether the 7Zip support is + * enabled or not + */ + private boolean enable7Zip; + + /** + * Configuration settings - Flag that indicates whether debug output is + * enabled + */ + private boolean enableDebugOutput; + + /** + * Configuration settings - Flag that indicates whether diff verification is + * enabled + */ + private boolean 
enableDiffVerification; + + /** + * Configuration settings - Flag that indicates whether encoding + * verification is enabled + */ + private boolean enableEncodingVerification; + + /** + * Configuration settings - Flag that indicates whether the database output + * mode is enabled + */ + private boolean enableSQLDatabaseOutput; + + /** + * Configuration settings - Flag that indicates whether output should + * be a datafile instead of an sql dump + */ + private boolean enableDataFileOutput; + + /** + * Configuration settings - Flag that indicates whether statistical output + * is enabled + */ + private boolean enableStatsOutput; + + /** + * Configuration settings - Flag that indicates whether statistical output + * is enabled + */ + private boolean enableZipCompression; + + /** + * Reference to the ConfigVerification + */ + private ConfigVerification errors; + + /** + * Configuration settings - Flag that indicates whether multiple output + * files are allowed + */ + private boolean multipleOutputFiles; + + /** + * Configuration settings - Output compression mode + */ + private OutputCompressionEnum outputCompression; + + /** + * Configuration settings - Output file limit + */ + private long outputFileLimit; + + /** + * Configuration settings - Surrogate Mode + */ + private SurrogateModes surrogates; + + /** + * XML Representation of the content + */ + private StringBuilder xmlConfig; + + /** + * (Constructor) Creates a new ConfigController. 
+ */ + public ConfigController() { + + this.components = new ComponentRegistry(); + this.archives = new ArchiveRegistry(); + + this.config = new ConfigSettings(); + + this.enable7Zip = false; + + this.outputFileLimit = -1; + this.multipleOutputFiles = false; + this.outputCompression = OutputCompressionEnum.None; + + this.enableZipCompression = true; + this.enableDebugOutput = false; + this.enableSQLDatabaseOutput = false; + + this.surrogates = SurrogateModes.DISCARD_REVISION; + + } + + /** + * Adds an archive to the archive registry. + * + * @param archive reference to the archive + */ + public void addArchive(final ArchiveDescription archive) { + this.archives.addArchive(archive); + } + + /** + * Applies the configuration file. + *

+ * The input settings will be ignored if a default configuration was used. + */ + private void applyConfig() { + this.components.applyConfig(config); + + switch (config.getConfigType()) { + case DEFAULT: + break; + case IMPORT: + this.archives.applyConfiguration(config); + } + + repaint(); + } + + /** + * Creates the xml content representation of the currently used settings. + * + * @return TRUE if the ConfigVerfication contains no items, FALSE otherwise + */ + public boolean createConfigurationXML() { + + errors = new ConfigVerification(); + xmlConfig = new StringBuilder(); + + xmlConfig.append("\r\n"); + components.toXML(xmlConfig, errors); + xmlConfig.append("\r\n"); + + if (errors.getRowCount() != 0) { + + // TODO: invoke the dialog at another place + new ConfigDialog(this).setVisible(true); + + return false; + } + + return true; + } + + /** + * Applies the default parameter to the currently loaded config + */ + public void defaultConfiguration() { + config.defaultConfiguration(); + applyConfig(); + } + + /** + * Returns the reference to the ArchiveRegistry. + * + * @return archive registry + */ + public ArchiveRegistry getArchives() { + return archives; + } + + /** + * Return the reference to the ConfigVerifactions. + * + * @return ConfigVerification + */ + public ConfigVerification getConfigErrors() { + return errors; + } + + /** + * Returns the output compression mode. + * + * @return output compression mode + */ + public OutputCompressionEnum getOutputCompression() { + return outputCompression; + } + + /** + * Returns the maximum size of an output file. + * + * @return maximum size of an output file. + */ + public long getOutputFileLimit() { + return outputFileLimit; + } + + /** + * Returns the reference to the component registry. + * + * @return component registry + */ + public ComponentRegistry getRegistry() { + return components; + } + + /** + * Returns the surrogate mode. 
+ * + * @return surrogate mode + */ + public SurrogateModes getSurrogates() { + return surrogates; + } + + /** + * Returns whether the 7Zip support is enabled or not. + * + * @return TRUE | FALSE + */ + public boolean is7ZipEnabled() { + return enable7Zip; + } + + /** + * Returns whether the debug output is enabled. + * + * @return debug output flag + */ + public boolean isDebugOutputEnabled() { + return enableDebugOutput; + } + + /** + * Returns whether the diff verification mode is enabled. + * + * @return diff verification flag + */ + public boolean isDiffVerificationEnabled() { + return enableDiffVerification; + } + + /** + * Returns whether the database output mode is enabled. + * + * @return database output flag + */ + public boolean isEnableSQLDatabaseOutput() { + return enableSQLDatabaseOutput; + } + + /** + * Returns whether the encoding verification mode is enabled. + * + * @return encoding verification flag + */ + public boolean isEncodingVerificationEnabled() { + return enableEncodingVerification; + } + + /** + * Returns whether multiple output files should be used. + * + * @return multiple output files flag + */ + public boolean isMultipleOutputFiles() { + return multipleOutputFiles; + } + + /** + * Returns whether the statistical output mode is enabled. + * + * @return statistical output flag + */ + public boolean isStatsOutputEnabled() { + return enableStatsOutput; + } + + /** + * Returns whether the Zip-Compression is enabled or not. + * + * @return Zip-Compression flag + */ + public boolean isZipCompressionEnabled() { + return enableZipCompression; + } + + /** + * Loads the configuration from the specified file + * + * @param path input file + */ + public void loadConfig(final String path) { + config.loadConfig(path); + applyConfig(); + } + + /** + * Loads the configuration file. The path of the file will be chosen by + * displaying a FileChooser Dialog. 
+ */ + public void loadConfiguration() { + + XMLFileChooser fc = new XMLFileChooser(); + if (fc.showOpenDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { + this.loadConfig(fc.getSelectedFile().getPath()); + } + } + + /** + * Registers the panel with the given key. + * + * @param key key + * @param panel panel + */ + public void register(final PanelKeys key, final AbstractPanel panel) { + this.components.register(key, panel); + } + + /** + * Removes the specified archive from the archive registry. + * + * @param index index of the archive + */ + public void removeArchive(final int index) { + this.archives.removeArchive(index); + } + + /** + * Repaints the GUI. + */ + public void repaint() { + this.components.repaint(); + } + + /** + * Saves the configuration file. The path of the file will be chosen by + * displaying a FileChooser Dialog. + */ + public void saveConfiguration() { + + if (this.createConfigurationXML()) { + + XMLFileChooser fc = new XMLFileChooser(); + if (fc.showSaveDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { + + String path = fc.getSelectedFile().getPath(); + if (path.indexOf('.') == -1) { + path += ".xml"; + } + + if (this.saveConfiguration(path)) { + System.out.println("SAVE CONFIG SUCCESSFULL"); + } else { + + System.out.println("SAVE CONFIG FAILED"); + } + } + + } + } + + /** + * Save the configuration to a file. 
+ * + * @param path output path + * @return TRUE if the configuration was succesfully exported FALSE + * otherwise + */ + public boolean saveConfiguration(final String path) { + + if (xmlConfig != null && !errors.hasFailed()) { + + boolean success = true; + + FileWriter writer = null; + try { + writer = new FileWriter(path); + writer.write(xmlConfig.toString()); + writer.flush(); + + } catch (IOException ioe) { + ioe.printStackTrace(); + success = false; + } finally { + if (writer != null) { + try { + writer.close(); + } catch (IOException ioe) { + success = false; + } + } + } + + return success; + } + + return false; + } + + + /** + * Enables or disables the 7Zip support. + *

+ * If the support is disabled the and the OutputCompression Mode was 7Zip + * the Mode will be reseted to None. + * + * @param enable7Zip 7Zip support flag + */ + public void setEnable7Zip(final boolean enable7Zip) { + this.enable7Zip = enable7Zip; + if (!this.enable7Zip) { + if (outputCompression == OutputCompressionEnum.SevenZip) { + outputCompression = OutputCompressionEnum.None; + } + } + } + + /** + * Sets the debug output mode. + * + * @param enableDebugOutput debug output flag + */ + public void setEnableDebugOutput(final boolean enableDebugOutput) { + this.enableDebugOutput = enableDebugOutput; + } + + /** + * Sets the diff verification mode. + * + * @param enableDiffVerification diff verification mode + */ + public void setEnableDiffVerification(final boolean enableDiffVerification) { + this.enableDiffVerification = enableDiffVerification; + } + + /** + * Sets the encoding verification mode. + * + * @param enableEncodingVerification diff verification mode + */ + public void setEnableEncodingVerification( + final boolean enableEncodingVerification) { + this.enableEncodingVerification = enableEncodingVerification; + } + + /** + * Sets the database output flag. + * + * @param enableSQLDatabaseOutput database output flag + */ + public void setEnableSQLDatabaseOutput(final boolean enableSQLDatabaseOutput) { + this.enableSQLDatabaseOutput = enableSQLDatabaseOutput; + } + + /** + * Sets the statistical output mode. + * + * @param enableStatsOutput statistical output flag + */ + public void setEnableStatsOutput(final boolean enableStatsOutput) { + this.enableStatsOutput = enableStatsOutput; + } + + /** + * Sets the Zip-Compression mode. + * + * @param enableZipCompression Zip-Compression flag + */ + public void setEnableZipCompression(final boolean enableZipCompression) { + this.enableZipCompression = enableZipCompression; + } + + /** + * Sets whether multiple output files should be used. 
+ * + * @param multipleOutputFiles multiple output files flag + */ + public void setMultipleOutputFiles(final boolean multipleOutputFiles) { + this.multipleOutputFiles = multipleOutputFiles; + } + + /** + * Sets the output compression mode. + * + * @param outputCompression output compression mode + */ + public void setOutputCompression( + final OutputCompressionEnum outputCompression) { + this.outputCompression = outputCompression; + } + + /** + * Sets the maximum size of an output file. + * + * @param outputFileLimit maximum size of an output file + */ + public void setOutputFileLimit(final long outputFileLimit) { + this.outputFileLimit = outputFileLimit; + } + + /** + * Sets the reference to the component registry. + * + * @param registry component registry + */ + public void setRegistry(final ComponentRegistry registry) { + this.components = registry; + } + + /** + * Sets the surrogate mode. + * + * @param surrogates surrogate mode + */ + public void setSurrogates(final SurrogateModes surrogates) { + this.surrogates = surrogates; + } + + public boolean isEnableDataFileOutput() { + return enableDataFileOutput; + } + + public void setEnableDataFileOutput(boolean enableDataFileOutput) { + this.enableDataFileOutput = enableDataFileOutput; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigSettings.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigSettings.java index b55b3cf6..036ae061 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigSettings.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigSettings.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -37,232 +37,215 @@ /** * This class contain all configuration parameters. - * */ -public class ConfigSettings -{ - - /** Returns the type of the configuration */ - private ConfigEnum type; - - /** List of input archives */ - private List archives; - - /** Map that contains the configuration parameters and values */ - private Map parameterMap; - - /** - * Creates an empty {@link ConfigSettings} object of unspecified - * type. - */ - public ConfigSettings() - { - this.parameterMap = new HashMap<>(); - this.archives = new ArrayList<>(); - } - - /** - * Creates an empty {@link ConfigSettings} object of given type. - * - * @param type - * Configuration Type - */ - public ConfigSettings(final ConfigEnum type) - { - this.type = type; - this.parameterMap = new HashMap<>(); - this.archives = new ArrayList<>(); - } - - /** - * Adds an input archive description object to the input archive list. - * - * @param archive - * ArchiveDescription - */ - public void add(final ArchiveDescription archive) - { - this.archives.add(archive); - } - - /** - * Returns the input archive at the specified position. - * - * @param index - * position - * @return input archive description - */ - public ArchiveDescription getArchiveDescription(int index) - { - return this.archives.get(index); - } - - /** - * Returns the list of input archives. 
- * - * @return list of the input archive descriptions - */ - public List getArchiveList() - { - return this.archives; - } - - /** - * Returns the number of input archives. - * - * @return size of the input archive list - */ - public int archiveSize() - { - return this.archives.size(); - } - - /** - * Returns an iterator over the input archive list. - * - * @return Iterator - */ - public Iterator archiveIterator() - { - return this.archives.iterator(); - } - - /** - * Assigns the given value to the the given key. - * - * @param key - * configuration key - * @param value - * value - */ - public void setConfigParameter(final ConfigurationKeys key, Object value) - { - // before setting parameter, check if paths have trailing File.separator - if (key == ConfigurationKeys.LOGGING_PATH_DEBUG - || key == ConfigurationKeys.LOGGING_PATH_DIFFTOOL - || key == ConfigurationKeys.PATH_OUTPUT_SQL_FILES) { - - String v = (String) value; - // if we do not have a trailing file separator and the current - // path is compatible to the system that is running the config tool, - // then add a trailing separator - if (!v.endsWith(File.separator) && v.contains(File.separator)) { - value = v + File.separator; - } - } - - this.parameterMap.put(key, value); - } - - /** - * Returns the value related to the configuration key or null if the key is - * not contained. - * - * @param configParameter - * configuration key - * @return value or null - */ - public Object getConfigParameter(final ConfigurationKeys configParameter) - { - if (this.parameterMap.containsKey(configParameter)) { - return this.parameterMap.get(configParameter); - } - - return null; - } - - /** - * Applies the default single thread configuration of the DiffTool to this - * settings. 
- */ - public void defaultConfiguration() - { - clear(); - - setConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, 12); - - setConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION, 1000); - - setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS, 5000000L); - - setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS, 1000000L); - - setConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, 1000000L); - - setConfigParameter(ConfigurationKeys.MODE_SURROGATES, SurrogateModes.DISCARD_REVISION); - - setConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING, StandardCharsets.UTF_8.toString()); - - setConfigParameter(ConfigurationKeys.MODE_OUTPUT, OutputType.BZIP2); - - setConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT, false); - - setConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED, true); - - setConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE, 1000000000L); - - setConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL, "logs"); - - setConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL, Level.INFO); - - setConfigParameter(ConfigurationKeys.VERIFICATION_DIFF, false); - - setConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING, false); - - setConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT, false); - - setConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT, false); - - Set defaultNamespaces = new HashSet<>(); - defaultNamespaces.add(0); - defaultNamespaces.add(1); - setConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP, defaultNamespaces); - - this.type = ConfigEnum.DEFAULT; - } - - - /** - * Deletes all contained input archives and configuration parameter. - */ - public void clear() - { - this.parameterMap.clear(); - this.archives.clear(); - } - - /** - * Returns the configuration type. 
- * - * @return configuration type - */ - public ConfigEnum getConfigType() - { - return this.type; - } +public class ConfigSettings { + + /** + * Returns the type of the configuration + */ + private ConfigEnum type; + + /** + * List of input archives + */ + private List archives; + + /** + * Map that contains the configuration parameters and values + */ + private Map parameterMap; + + /** + * Creates an empty {@link ConfigSettings} object of unspecified + * type. + */ + public ConfigSettings() { + this.parameterMap = new HashMap<>(); + this.archives = new ArrayList<>(); + } + + /** + * Creates an empty {@link ConfigSettings} object of given type. + * + * @param type Configuration Type + */ + public ConfigSettings(final ConfigEnum type) { + this.type = type; + this.parameterMap = new HashMap<>(); + this.archives = new ArrayList<>(); + } + + /** + * Adds an input archive description object to the input archive list. + * + * @param archive ArchiveDescription + */ + public void add(final ArchiveDescription archive) { + this.archives.add(archive); + } + + /** + * Returns the input archive at the specified position. + * + * @param index position + * @return input archive description + */ + public ArchiveDescription getArchiveDescription(int index) { + return this.archives.get(index); + } + + /** + * Returns the list of input archives. + * + * @return list of the input archive descriptions + */ + public List getArchiveList() { + return this.archives; + } + + /** + * Returns the number of input archives. + * + * @return size of the input archive list + */ + public int archiveSize() { + return this.archives.size(); + } + + /** + * Returns an iterator over the input archive list. + * + * @return Iterator + */ + public Iterator archiveIterator() { + return this.archives.iterator(); + } + + /** + * Assigns the given value to the the given key. 
+ * + * @param key configuration key + * @param value value + */ + public void setConfigParameter(final ConfigurationKeys key, Object value) { + // before setting parameter, check if paths have trailing File.separator + if (key == ConfigurationKeys.LOGGING_PATH_DEBUG + || key == ConfigurationKeys.LOGGING_PATH_DIFFTOOL + || key == ConfigurationKeys.PATH_OUTPUT_SQL_FILES) { + + String v = (String) value; + // if we do not have a trailing file separator and the current + // path is compatible to the system that is running the config tool, + // then add a trailing separator + if (!v.endsWith(File.separator) && v.contains(File.separator)) { + value = v + File.separator; + } + } + + this.parameterMap.put(key, value); + } + + /** + * Returns the value related to the configuration key or null if the key is + * not contained. + * + * @param configParameter configuration key + * @return value or null + */ + public Object getConfigParameter(final ConfigurationKeys configParameter) { + if (this.parameterMap.containsKey(configParameter)) { + return this.parameterMap.get(configParameter); + } + + return null; + } + + /** + * Applies the default single thread configuration of the DiffTool to this + * settings. 
+ */ + public void defaultConfiguration() { + clear(); + + setConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, 12); + + setConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION, 1000); + + setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS, 5000000L); + + setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS, 1000000L); + + setConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, 1000000L); + + setConfigParameter(ConfigurationKeys.MODE_SURROGATES, SurrogateModes.DISCARD_REVISION); + + setConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING, StandardCharsets.UTF_8.toString()); + + setConfigParameter(ConfigurationKeys.MODE_OUTPUT, OutputType.BZIP2); + + setConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT, false); + + setConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED, true); + + setConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE, 1000000000L); + + setConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL, "logs"); + + setConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL, Level.INFO); + + setConfigParameter(ConfigurationKeys.VERIFICATION_DIFF, false); + + setConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING, false); + + setConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT, false); + + setConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT, false); + + Set defaultNamespaces = new HashSet<>(); + defaultNamespaces.add(0); + defaultNamespaces.add(1); + setConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP, defaultNamespaces); + + this.type = ConfigEnum.DEFAULT; + } + + + /** + * Deletes all contained input archives and configuration parameter. + */ + public void clear() { + this.parameterMap.clear(); + this.archives.clear(); + } + + /** + * Returns the configuration type. + * + * @return configuration type + */ + public ConfigEnum getConfigType() { + return this.type; + } - /** - * Loads the configuration settings from a file. 
- * - * @param path - * path to the configuration file - */ - public void loadConfig(final String path) - { - try { + /** + * Loads the configuration settings from a file. + * + * @param path path to the configuration file + */ + public void loadConfig(final String path) { + try { - ConfigurationReader reader = new ConfigurationReader(path); - ConfigSettings settings = reader.read(); + ConfigurationReader reader = new ConfigurationReader(path); + ConfigSettings settings = reader.read(); - clear(); + clear(); - this.type = settings.type; - this.parameterMap = settings.parameterMap; - this.archives = settings.archives; + this.type = settings.type; + this.parameterMap = settings.parameterMap; + this.archives = settings.archives; - } - catch (Exception e) { - e.printStackTrace(); - } - } + } catch (Exception e) { + e.printStackTrace(); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigVerification.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigVerification.java index 1d92c9f6..74ef502e 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigVerification.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigVerification.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,110 +28,104 @@ /** * This class contains the list of error or warning messages that have been * generated during the verification of the configuration settings. - * - * - * */ @SuppressWarnings("serial") public class ConfigVerification - extends AbstractTableModel -{ - - /** If an error message was added to the list. */ - private boolean failed; - - /** List of configuration items */ - private final List list; - - /** Column names of the table representation */ - private final String[] columnNames; - - /** - * (Constructor) Creates an empty ConfigVerification object. - */ - public ConfigVerification() - { - this.list = new ArrayList<>(); - this.failed = false; - - this.columnNames = new String[] { "Type", "Error", "Message" }; - } - - /** - * Adds a configuration item to the list. - * - * @param item - * configuration item - */ - public void add(final ConfigItem item) - { - failed = failed || item.getType() == ConfigItemTypes.ERROR; - this.list.add(item); - } - - /** - * Returns the name of the column with the index col. - * - * @return column name of the specified column. - */ - @Override - public String getColumnName(final int col) - { - return this.columnNames[col]; - } - - /** - * Returns the number of columns. - * - * @return number of columns - */ - @Override - public int getColumnCount() - { - return 3; - } - - /** - * Returns the number of rows. - * - * @return number of rows - */ - @Override - public int getRowCount() - { - return list.size(); - } - - /** - * Returns the value at the specified column of the specified row. 
- * - * @return value - */ - @Override - public Object getValueAt(final int row, final int column) - { - - ConfigItem item = this.list.get(row); - - switch (column) { - case 0: - return item.getType(); - case 1: - return item.getKey(); - case 2: - return item.getMessage(); - } - return null; - } - - /** - * Returns whether the configuration item list contains an error message or - * not. - * - * @return TRUE | FALSE - */ - public boolean hasFailed() - { - return this.failed; - } + extends AbstractTableModel { + + /** + * If an error message was added to the list. + */ + private boolean failed; + + /** + * List of configuration items + */ + private final List list; + + /** + * Column names of the table representation + */ + private final String[] columnNames; + + /** + * (Constructor) Creates an empty ConfigVerification object. + */ + public ConfigVerification() { + this.list = new ArrayList<>(); + this.failed = false; + + this.columnNames = new String[]{"Type", "Error", "Message"}; + } + + /** + * Adds a configuration item to the list. + * + * @param item configuration item + */ + public void add(final ConfigItem item) { + failed = failed || item.getType() == ConfigItemTypes.ERROR; + this.list.add(item); + } + + /** + * Returns the name of the column with the index col. + * + * @return column name of the specified column. + */ + @Override + public String getColumnName(final int col) { + return this.columnNames[col]; + } + + /** + * Returns the number of columns. + * + * @return number of columns + */ + @Override + public int getColumnCount() { + return 3; + } + + /** + * Returns the number of rows. + * + * @return number of rows + */ + @Override + public int getRowCount() { + return list.size(); + } + + /** + * Returns the value at the specified column of the specified row. 
+ * + * @return value + */ + @Override + public Object getValueAt(final int row, final int column) { + + ConfigItem item = this.list.get(row); + + switch (column) { + case 0: + return item.getType(); + case 1: + return item.getKey(); + case 2: + return item.getMessage(); + } + return null; + } + + /** + * Returns whether the configuration item list contains an error message or + * not. + * + * @return TRUE | FALSE + */ + public boolean hasFailed() { + return this.failed; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigEnum.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigEnum.java index 326d77f4..2277ca0f 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigEnum.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigEnum.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,16 +19,16 @@ /** * Contains the keys for the configuration types. 
- * - * - * */ -public enum ConfigEnum -{ +public enum ConfigEnum { - /** Default Configuration */ - DEFAULT, + /** + * Default Configuration + */ + DEFAULT, - /** Imported Configuration */ - IMPORT + /** + * Imported Configuration + */ + IMPORT } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigErrorKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigErrorKeys.java index d3dab38c..f171b332 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigErrorKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigErrorKeys.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,28 +19,36 @@ /** * Contains the keys for the configuration verification error types. 
- * - * - * */ -public enum ConfigErrorKeys -{ +public enum ConfigErrorKeys { - /** Mode was enabled, but no value was set */ - COMMAND_NOT_SET, + /** + * Mode was enabled, but no value was set + */ + COMMAND_NOT_SET, - /** Configuration value out of range */ - VALUE_OUT_OF_RANGE, + /** + * Configuration value out of range + */ + VALUE_OUT_OF_RANGE, - /** Path was not set */ - PATH_NOT_SET, + /** + * Path was not set + */ + PATH_NOT_SET, - /** Illegal configuration value */ - ILLEGAL_INPUT, + /** + * Illegal configuration value + */ + ILLEGAL_INPUT, - /** Illegal input file type */ - ILLEGAL_INPUT_FILE, + /** + * Illegal input file type + */ + ILLEGAL_INPUT_FILE, - /** Required value is missing */ - MISSING_VALUE + /** + * Required value is missing + */ + MISSING_VALUE } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItem.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItem.java index f375442a..8c096242 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItem.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItem.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,68 +19,63 @@ /** * This class represents configuration verfication messages. - * - * - * */ -public class ConfigItem -{ +public class ConfigItem { - /** Type of message */ - private final ConfigItemTypes type; + /** + * Type of message + */ + private final ConfigItemTypes type; - /** Type of error */ - private final ConfigErrorKeys key; + /** + * Type of error + */ + private final ConfigErrorKeys key; - /** Message */ - private final String message; + /** + * Message + */ + private final String message; - /** - * (Constructor) Creates a new ConfigItem - * - * @param type - * Type of message - * @param key - * Type of error - * @param message - * Message - */ - public ConfigItem(final ConfigItemTypes type, final ConfigErrorKeys key, - final String message) - { + /** + * (Constructor) Creates a new ConfigItem + * + * @param type Type of message + * @param key Type of error + * @param message Message + */ + public ConfigItem(final ConfigItemTypes type, final ConfigErrorKeys key, + final String message) { - this.type = type; - this.key = key; - this.message = message; - } + this.type = type; + this.key = key; + this.message = message; + } - /** - * Returns the type of error. - * - * @return type of error - */ - public ConfigErrorKeys getKey() - { - return key; - } + /** + * Returns the type of error. + * + * @return type of error + */ + public ConfigErrorKeys getKey() { + return key; + } - /** - * Returns the message. - * - * @return message - */ - public String getMessage() - { - return message; - } + /** + * Returns the message. + * + * @return message + */ + public String getMessage() { + return message; + } - /** - * Returns the item type. 
- * - * @return item type - */ - public ConfigItemTypes getType() - { - return type; - } + /** + * Returns the item type. + * + * @return item type + */ + public ConfigItemTypes getType() { + return type; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItemTypes.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItemTypes.java index 817cf301..7c89f04d 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItemTypes.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItemTypes.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,16 +19,16 @@ /** * Contains the keys for the different types of configuration items. 
- * - * - * */ -public enum ConfigItemTypes -{ +public enum ConfigItemTypes { - /** Warning message */ - WARNING, + /** + * Warning message + */ + WARNING, - /** Error message */ - ERROR + /** + * Error message + */ + ERROR } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/OutputCompressionEnum.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/OutputCompressionEnum.java index 87b55a4d..d729ae46 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/OutputCompressionEnum.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/OutputCompressionEnum.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,19 +19,21 @@ /** * Contains the keys for the different types of the DiffTool outputs. 
- * - * - * */ -public enum OutputCompressionEnum -{ +public enum OutputCompressionEnum { - /** Uncompressed output type */ - None, + /** + * Uncompressed output type + */ + None, - /** SevenZip output type */ - SevenZip, + /** + * SevenZip output type + */ + SevenZip, - /** BZip2 output type */ - BZip2, + /** + * BZip2 output type + */ + BZip2, } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/PanelKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/PanelKeys.java index 61059452..84339a96 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/PanelKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/PanelKeys.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,38 +19,52 @@ /** * Contains the keys for the different panels of the ConfigurationTool. 
- * - * - * */ -public enum PanelKeys -{ +public enum PanelKeys { + + /** + * Key for the mode panel + */ + PANEL_VALUES, + + /** + * Key for the externals panel + */ + PANEL_EXTERNALS, - /** Key for the mode panel */ - PANEL_VALUES, + /** + * Key for the input panel + */ + PANEL_INPUT, - /** Key for the externals panel */ - PANEL_EXTERNALS, + /** + * Key for the output panel + */ + PANEL_OUTPUT, - /** Key for the input panel */ - PANEL_INPUT, + /** + * Key for the sql panel + */ + PANEL_SQL, - /** Key for the output panel */ - PANEL_OUTPUT, + /** + * Key for the cache panel + */ + PANEL_CACHE, - /** Key for the sql panel */ - PANEL_SQL, + /** + * Key for the logging panel + */ + PANEL_LOGGING, - /** Key for the cache panel */ - PANEL_CACHE, + /** + * Key for the debug panel + */ + PANEL_DEBUG, - /** Key for the logging panel */ - PANEL_LOGGING, + /** + * Key for the filter panel + */ + PANEL_FILTER - /** Key for the debug panel */ - PANEL_DEBUG, - - /** Key for the filter panel */ - PANEL_FILTER - } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/ConfigDialog.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/ConfigDialog.java index e3677de1..e0e1cc7c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/ConfigDialog.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/ConfigDialog.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,59 +34,48 @@ /** * ConfigDialog - Displays the ConfigVerification elements. - * - * - * */ @SuppressWarnings("serial") public class ConfigDialog - extends JDialog -{ - - /** - * Panel of the ConfigDialog - * - * - * - */ - private class ConfigDialogPanel - extends AbstractPanel - { - - private JTable itemTable; - private JScrollPane itemScrollPane; - - private JButton returnButton; - private JButton saveButton; - - /** - * (Constructor) Creates the ConfigDialogPanel. - * - * @param controller - * Reference to the controller - */ - public ConfigDialogPanel(final ConfigController controller) - { - super(controller); - createItemTable(); - createButtons(); - } - - /** - * Creates the buttons of the dialog panel. - */ - private void createButtons() - { - - returnButton = new JButton("Return"); - returnButton.setBounds(105, 195, 120, 25); - returnButton.addActionListener(e -> close()); - - this.add(returnButton); - - saveButton = new JButton("Save"); - saveButton.setBounds(235, 195, 120, 25); - saveButton.addActionListener(e -> { + extends JDialog { + + /** + * Panel of the ConfigDialog + */ + private class ConfigDialogPanel + extends AbstractPanel { + + private JTable itemTable; + private JScrollPane itemScrollPane; + + private JButton returnButton; + private JButton saveButton; + + /** + * (Constructor) Creates the ConfigDialogPanel. 
+ * + * @param controller Reference to the controller + */ + public ConfigDialogPanel(final ConfigController controller) { + super(controller); + createItemTable(); + createButtons(); + } + + /** + * Creates the buttons of the dialog panel. + */ + private void createButtons() { + + returnButton = new JButton("Return"); + returnButton.setBounds(105, 195, 120, 25); + returnButton.addActionListener(e -> close()); + + this.add(returnButton); + + saveButton = new JButton("Save"); + saveButton.setBounds(235, 195, 120, 25); + saveButton.addActionListener(e -> { XMLFileChooser fc = new XMLFileChooser(); if (fc.showSaveDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { @@ -98,112 +87,102 @@ private void createButtons() if (controller.saveConfiguration(path)) { System.out.println("SAVE CONFIG SUCCESSFULL"); - } - else { + } else { System.out.println("SAVE CONFIG FAILED"); } } }); - this.add(saveButton); - } - - /** - * Creates the JTable for displaying the input archives. - */ - private void createItemTable() - { - itemTable = new JTable(controller.getConfigErrors()); - itemTable.setSelectionMode(ListSelectionModel.SINGLE_SELECTION); - - itemScrollPane = new JScrollPane(itemTable); - itemScrollPane.setBounds(10, 10, 470, 180); - - this.add(itemScrollPane); - } - - /** - * empty method - */ - @Override - public void relocate() - { - - } - - /** - * A call of this method should validate the positions of the panels - * components. 
- */ - @Override - public void validate() - { - - ConfigVerification verification = controller.getConfigErrors(); - if (verification != null) { - saveButton.setEnabled(!verification.hasFailed()); - } - else { - saveButton.setEnabled(false); - } - } - - /** - * empty method - * - * @deprecated - * @throws UnsupportedOperationException - */ - @Deprecated - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - throw new UnsupportedOperationException(); - } - - /** - * empty method - * - * @deprecated - * @throws UnsupportedOperationException - */ - @Deprecated - @Override - public void applyConfig(final ConfigSettings config) - { - throw new UnsupportedOperationException(); - } - } - - /** - * (Constructor) Creates a new ConfigDialog. - * - * @param controller - * Reference to the controller - */ - public ConfigDialog(final ConfigController controller) - { - super(controller.getRegistry().getGUI(), true); - - this.setTitle("Verification"); - - setSize(500, 250); - setResizable(false); - - Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); - setLocation((d.width - getSize().width) / 2, - (d.height - getSize().height) / 2); - - this.setContentPane(new ConfigDialogPanel(controller)); - } - - /** - * Closes the dialog. - */ - public void close() - { - this.setVisible(true); - this.dispose(); - } + this.add(saveButton); + } + + /** + * Creates the JTable for displaying the input archives. + */ + private void createItemTable() { + itemTable = new JTable(controller.getConfigErrors()); + itemTable.setSelectionMode(ListSelectionModel.SINGLE_SELECTION); + + itemScrollPane = new JScrollPane(itemTable); + itemScrollPane.setBounds(10, 10, 470, 180); + + this.add(itemScrollPane); + } + + /** + * empty method + */ + @Override + public void relocate() { + + } + + /** + * A call of this method should validate the positions of the panels + * components. 
+ */ + @Override + public void validate() { + + ConfigVerification verification = controller.getConfigErrors(); + if (verification != null) { + saveButton.setEnabled(!verification.hasFailed()); + } else { + saveButton.setEnabled(false); + } + } + + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + throw new UnsupportedOperationException(); + } + + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void applyConfig(final ConfigSettings config) { + throw new UnsupportedOperationException(); + } + } + + /** + * (Constructor) Creates a new ConfigDialog. + * + * @param controller Reference to the controller + */ + public ConfigDialog(final ConfigController controller) { + super(controller.getRegistry().getGUI(), true); + + this.setTitle("Verification"); + + setSize(500, 250); + setResizable(false); + + Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); + setLocation((d.width - getSize().width) / 2, + (d.height - getSize().height) / 2); + + this.setContentPane(new ConfigDialogPanel(controller)); + } + + /** + * Closes the dialog. + */ + public void close() { + this.setVisible(true); + this.dispose(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/InputDialog.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/InputDialog.java index 7e499f6a..70358836 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/InputDialog.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/InputDialog.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -37,70 +37,59 @@ /** * InputDialog - Dialog to specify input archives. - * - * - * */ @SuppressWarnings("serial") public class InputDialog - extends JDialog -{ - - /** - * Panel of the InputDialog - * - * - * - */ - private class InputDialogPanel - extends AbstractPanel - { - - /** - * (Constructor) Creates the InputDialogPanel. - * - * @param controller - * Reference to the controller - */ - public InputDialogPanel(final ConfigController controller) - { - super(controller); - createPathSettings(); - createTypeChooser(); - createButtons(); - createStartLabel(); - } - - private JLabel pathLabel; - private JTextField pathField; - private JButton searchButton; - - private JLabel typeLabel; - private JComboBox typeChooser; - - private JLabel startLabel; - private JTextField startPosition; - - private JButton addButton; - private JButton cancelButton; - - /** - * Creates the path input components. 
- */ - private void createPathSettings() - { - pathLabel = new JLabel("Please enter the path: "); - pathLabel.setBounds(10, 10, 150, 25); - this.add(pathLabel); - - pathField = new JTextField(); - pathField.setBounds(10, 40, 250, 25); - this.add(pathField); - - searchButton = new JButton("Search"); - searchButton.setBounds(180, 10, 80, 25); - - searchButton.addActionListener(e -> { + extends JDialog { + + /** + * Panel of the InputDialog + */ + private class InputDialogPanel + extends AbstractPanel { + + /** + * (Constructor) Creates the InputDialogPanel. + * + * @param controller Reference to the controller + */ + public InputDialogPanel(final ConfigController controller) { + super(controller); + createPathSettings(); + createTypeChooser(); + createButtons(); + createStartLabel(); + } + + private JLabel pathLabel; + private JTextField pathField; + private JButton searchButton; + + private JLabel typeLabel; + private JComboBox typeChooser; + + private JLabel startLabel; + private JTextField startPosition; + + private JButton addButton; + private JButton cancelButton; + + /** + * Creates the path input components. + */ + private void createPathSettings() { + pathLabel = new JLabel("Please enter the path: "); + pathLabel.setBounds(10, 10, 150, 25); + this.add(pathLabel); + + pathField = new JTextField(); + pathField.setBounds(10, 40, 250, 25); + this.add(pathField); + + searchButton = new JButton("Search"); + searchButton.setBounds(180, 10, 80, 25); + + searchButton.addActionListener(e -> { JFileChooser fc = new JFileChooser(); if (fc.showOpenDialog(new JPanel()) == JFileChooser.APPROVE_OPTION) { @@ -108,56 +97,53 @@ private void createPathSettings() } }); - this.add(searchButton); - } + this.add(searchButton); + } - /** - * Creates the start input components. - */ - private void createStartLabel() - { + /** + * Creates the start input components. 
+ */ + private void createStartLabel() { - startLabel = new JLabel("Ignore all bytes before:"); - startLabel.setBounds(10, 120, 130, 25); - this.add(startLabel); + startLabel = new JLabel("Ignore all bytes before:"); + startLabel.setBounds(10, 120, 130, 25); + this.add(startLabel); - startPosition = new JTextField(); - startPosition.setBounds(150, 120, 110, 25); - this.add(startPosition); - } + startPosition = new JTextField(); + startPosition.setBounds(150, 120, 110, 25); + this.add(startPosition); + } - /** - * Creates the input type chooser. - */ - private void createTypeChooser() - { + /** + * Creates the input type chooser. + */ + private void createTypeChooser() { - typeLabel = new JLabel("Input type: "); - typeLabel.setBounds(10, 80, 130, 25); - this.add(typeLabel); + typeLabel = new JLabel("Input type: "); + typeLabel.setBounds(10, 80, 130, 25); + this.add(typeLabel); - typeChooser = new JComboBox<>(); - typeChooser.setBounds(150, 80, 110, 25); + typeChooser = new JComboBox<>(); + typeChooser.setBounds(150, 80, 110, 25); - typeChooser.addItem(InputType.XML); + typeChooser.addItem(InputType.XML); - if (this.controller.is7ZipEnabled()) { - typeChooser.addItem(InputType.SEVENZIP); - } + if (this.controller.is7ZipEnabled()) { + typeChooser.addItem(InputType.SEVENZIP); + } - typeChooser.addItem(InputType.BZIP2); + typeChooser.addItem(InputType.BZIP2); - this.add(typeChooser); - } + this.add(typeChooser); + } - /** - * Creates the buttons of the dialog panel. - */ - private void createButtons() - { - addButton = new JButton("Add"); - addButton.setBounds(10, 170, 120, 25); - addButton.addActionListener(e -> { + /** + * Creates the buttons of the dialog panel. 
+ */ + private void createButtons() { + addButton = new JButton("Add"); + addButton.setBounds(10, 170, 120, 25); + addButton.addActionListener(e -> { String path = pathField.getText(); if (path.length() == 0) { return; @@ -170,105 +156,98 @@ private void createButtons() close(); }); - this.add(addButton); - - cancelButton = new JButton("Cancel"); - cancelButton.setBounds(140, 170, 120, 25); - cancelButton.addActionListener(e -> close()); - - this.add(cancelButton); - } - - /** - * empty method - */ - @Override - public void validate() - { - - } - - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() - { - - int w = 250, h = 185; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - pathLabel.setLocation(x, y); - pathField.setLocation(x, y + 30); - searchButton.setLocation(x + 170, y); - - typeLabel.setLocation(x, y + 70); - typeChooser.setLocation(x + 140, y + 70); - - startLabel.setLocation(x, y + 110); - startPosition.setLocation(x + 140, y + 110); - - addButton.setLocation(x, y + 160); - cancelButton.setLocation(x + 130, y + 160); - } - - /** - * empty method - * - * @deprecated - * @throws UnsupportedOperationException - */ - @Deprecated - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - throw new UnsupportedOperationException(); - } - - /** - * empty method - * - * @deprecated - * @throws UnsupportedOperationException - */ - @Deprecated - @Override - public void applyConfig(final ConfigSettings config) - { - throw new UnsupportedOperationException(); - } - } - - /** - * (Constructor) Creates a new InputDialog. 
- * - * @param controller - * Reference to the controller - */ - public InputDialog(final ConfigController controller) - { - super(controller.getRegistry().getGUI(), true); - - this.setTitle("Add an input file"); - - setSize(300, 250); - setResizable(false); - - Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); - setLocation((d.width - getSize().width) / 2, - (d.height - getSize().height) / 2); - - this.setContentPane(new InputDialogPanel(controller)); - } - - /** - * Closes the dialog. - */ - public void close() - { - this.setVisible(true); - this.dispose(); - } + this.add(addButton); + + cancelButton = new JButton("Cancel"); + cancelButton.setBounds(140, 170, 120, 25); + cancelButton.addActionListener(e -> close()); + + this.add(cancelButton); + } + + /** + * empty method + */ + @Override + public void validate() { + + } + + /** + * A call of this method should validate the positions of the panels + * components. + */ + @Override + public void relocate() { + + int w = 250, h = 185; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + pathLabel.setLocation(x, y); + pathField.setLocation(x, y + 30); + searchButton.setLocation(x + 170, y); + + typeLabel.setLocation(x, y + 70); + typeChooser.setLocation(x + 140, y + 70); + + startLabel.setLocation(x, y + 110); + startPosition.setLocation(x + 140, y + 110); + + addButton.setLocation(x, y + 160); + cancelButton.setLocation(x + 130, y + 160); + } + + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + throw new UnsupportedOperationException(); + } + + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void applyConfig(final ConfigSettings config) { + throw new UnsupportedOperationException(); + } + } + + /** + * (Constructor) Creates a new InputDialog. 
+ * + * @param controller Reference to the controller + */ + public InputDialog(final ConfigController controller) { + super(controller.getRegistry().getGUI(), true); + + this.setTitle("Add an input file"); + + setSize(300, 250); + setResizable(false); + + Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); + setLocation((d.width - getSize().width) / 2, + (d.height - getSize().height) / 2); + + this.setContentPane(new InputDialogPanel(controller)); + } + + /** + * Closes the dialog. + */ + public void close() { + this.setVisible(true); + this.dispose(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/XMLFileChooser.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/XMLFileChooser.java index 88750eee..76ce433c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/XMLFileChooser.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/XMLFileChooser.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,49 +24,41 @@ /** * This object represents a xml file filter. 
- * - * - * */ @SuppressWarnings("serial") public class XMLFileChooser - extends JFileChooser -{ + extends JFileChooser { - /** - * (Constructor) Creates an FileChooser with a xml file filter. - */ - public XMLFileChooser() - { + /** + * (Constructor) Creates an FileChooser with a xml file filter. + */ + public XMLFileChooser() { - setFileFilter(new FileFilter() - { + setFileFilter(new FileFilter() { - @Override - public String getDescription() - { - return ".xml"; - } + @Override + public String getDescription() { + return ".xml"; + } @Override - public boolean accept(final File f) - { + public boolean accept(final File f) { - // Always accept directories - if (f.isDirectory()) { - return true; - } + // Always accept directories + if (f.isDirectory()) { + return true; + } - int p = f.getName().indexOf("."); + int p = f.getName().indexOf("."); - // Files need a ending - if (p == -1) { - return false; - } + // Files need a ending + if (p == -1) { + return false; + } - // Verify the ending - return f.getName().substring(p).equals(".xml"); - } - }); - } + // Verify the ending + return f.getName().substring(p).equals(".xml"); + } + }); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/AbstractPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/AbstractPanel.java index e21a4a0f..5be25e12 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/AbstractPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/AbstractPanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,79 +30,70 @@ *

* All panels (which contain configuration parameters) will inherit from this * class. - * - * - * */ @SuppressWarnings("serial") public abstract class AbstractPanel - extends JPanel -{ + extends JPanel { - /** Reference to the controller */ - protected final ConfigController controller; + /** + * Reference to the controller + */ + protected final ConfigController controller; - /** - * (Constructor) Creates an AbstractPanel object. - * - * @param controller - * Reference to the controller - */ - public AbstractPanel(final ConfigController controller) - { - this.controller = controller; - this.setLayout(null); - } + /** + * (Constructor) Creates an AbstractPanel object. + * + * @param controller Reference to the controller + */ + public AbstractPanel(final ConfigController controller) { + this.controller = controller; + this.setLayout(null); + } - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public abstract void validate(); + /** + * A call of this method should validate the status of the panels + * components. + */ + @Override + public abstract void validate(); - /** - * A call of this method should validate the positions of the panels - * components. - */ - public abstract void relocate(); + /** + * A call of this method should validate the positions of the panels + * components. + */ + public abstract void relocate(); - /** - * The default paint method was expanded with calls of the validate() and - * relocate() methods. - * - * @param g - * Graphics - */ - @Override - public void paint(final Graphics g) - { + /** + * The default paint method was expanded with calls of the validate() and + * relocate() methods. + * + * @param g Graphics + */ + @Override + public void paint(final Graphics g) { - validate(); - relocate(); + validate(); + relocate(); - super.paint(g); - } + super.paint(g); + } - /** - * Adds the xml description of the panels content to the StringBuilder. 
- * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - public abstract void toXML(final StringBuilder builder, - final ConfigVerification errors); + /** + * Adds the xml description of the panels content to the StringBuilder. + * Errors which occur during the xml transformation will be added to the + * ConfigVerification. + * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + public abstract void toXML(final StringBuilder builder, + final ConfigVerification errors); - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config - * Reference to the ConfigSettings object - */ - public abstract void applyConfig(final ConfigSettings config); + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. + * + * @param config Reference to the ConfigSettings object + */ + public abstract void applyConfig(final ConfigSettings config); } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/CachePanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/CachePanel.java index 92e1f81f..9079f9ab 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/CachePanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/CachePanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -35,275 +35,251 @@ *

* This panel contains all components for setting configuration parameters * related to the cache. - * - * - * */ @SuppressWarnings("serial") public class CachePanel - extends AbstractPanel -{ - - private JLabel taskLimitationsLabel; - - private JLabel articleTaskLabel; - private JTextField articleTaskLimitField; - - private JLabel diffTaskLabel; - private JTextField diffTaskLimitField; - - private JLabel sqlProducerLimitationsLabel; - - private JLabel maxAllowedPacketLabel; - private JTextField maxAllowedPacketField; - - /** - * (Constructor) Creates a new CachePanel. - * - * @param controller - * Reference to the controller - */ - public CachePanel(final ConfigController controller) - { - - super(controller); - controller.register(PanelKeys.PANEL_CACHE, this); - - createTaskSettings(); - createSQLProducerSettings(); - } - - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// - - private void createTaskSettings() - { - - taskLimitationsLabel = new JLabel("Task Limitations (in byte)"); - taskLimitationsLabel.setBounds(10, 10, 250, 25); - this.add(taskLimitationsLabel); - - articleTaskLabel = new JLabel("Article-Task: "); - articleTaskLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - articleTaskLabel.setBounds(10, 40, 100, 25); - this.add(articleTaskLabel); - - articleTaskLimitField = new JTextField(); - articleTaskLimitField.setBounds(120, 40, 200, 25); - this.add(articleTaskLimitField); - - diffTaskLabel = new JLabel("Diff-Task: "); - diffTaskLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - diffTaskLabel.setBounds(10, 70, 100, 25); - this.add(diffTaskLabel); - - diffTaskLimitField = new JTextField(); - diffTaskLimitField.setBounds(120, 70, 200, 25); - this.add(diffTaskLimitField); - } - - - private void createSQLProducerSettings() - { - - sqlProducerLimitationsLabel = new JLabel( - "SQLProducer Limitations 
(in byte)"); - sqlProducerLimitationsLabel.setBounds(10, 210, 250, 25); - this.add(sqlProducerLimitationsLabel); - - maxAllowedPacketLabel = new JLabel("MAX_ALLOWED_PACKET"); - maxAllowedPacketLabel - .setBorder(BorderFactory.createRaisedBevelBorder()); - maxAllowedPacketLabel.setBounds(10, 240, 160, 25); - this.add(maxAllowedPacketLabel); - - maxAllowedPacketField = new JTextField(); - maxAllowedPacketField.setBounds(180, 240, 140, 25); - this.add(maxAllowedPacketField); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() - { - - } - - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() - { - - int w = 310, h = 255; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - taskLimitationsLabel.setLocation(x, y); - articleTaskLabel.setLocation(x, y + 30); - articleTaskLimitField.setLocation(x + 110, y + 30); - diffTaskLabel.setLocation(x, y + 60); - diffTaskLimitField.setLocation(x + 110, y + 60); - - sqlProducerLimitationsLabel.setLocation(x, y + 100); - maxAllowedPacketLabel.setLocation(x, y + 130); - maxAllowedPacketField.setLocation(x + 170, y + 130); - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. 
- * - * @param config - * Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) - { - - Object o = config - .getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS); - if (o != null) { - this.articleTaskLimitField.setText(Long.toString((Long) o)); - } - else { - this.articleTaskLimitField.setText(""); - } - - o = config.getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS); - if (o != null) { - this.diffTaskLimitField.setText(Long.toString((Long) o)); - } - else { - this.diffTaskLimitField.setText(""); - } - - o = config - .getConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET); - if (o != null) { - this.maxAllowedPacketField.setText(Long.toString((Long) o)); - } - else { - this.maxAllowedPacketField.setText(""); - } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - - long tasksizeRevisions = -1, tasksizeDiffs = -1, maxAllowedPacket = -1; - - // Check the ArticleTask size input - String text = this.articleTaskLimitField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The value for the size of ArticleTasks" + " is missing.")); - } - else { - try { - tasksizeRevisions = Long.parseLong(text); - if (tasksizeRevisions < 1000000) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The value for the size of an " - + "ArticleTask has to be at least " - + "1000000 Byte.")); - } - } - catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - 
"NumberFormatException for the size of" - + " ArticleTasks")); - } - } - - // Check the DiffTask size input - text = this.diffTaskLimitField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The value for the size of DiffTasks" + " is missing.")); - } - else { - try { - tasksizeDiffs = Long.parseLong(text); - if (tasksizeDiffs < 1000000) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The value for the size of a DiffTask " - + "has to be at least 1000000 Byte.")); - } - } - catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for the size of" + " DiffTasks")); - } - } - - // Check the SQLProducer MaxAllowedPacket input - text = this.maxAllowedPacketField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The value for SQLProducer MaxAllowedPacket" - + " is missing.")); - } - else { - try { - maxAllowedPacket = Long.parseLong(text); - if (maxAllowedPacket < 1000000) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The value for SQLProducer " - + "MaxAllowedPacket should be at least" - + " 1000000 Byte.")); - } - } - catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for the size of" - + " SQLProducer MaxAllowedPacket")); - } - } - - builder.append("\t\r\n"); - builder.append("\t\t" + tasksizeRevisions - + "\r\n"); - builder.append("\t\t" + tasksizeDiffs - + "\r\n"); - builder.append("\t\t" - + maxAllowedPacket - + "\r\n"); - - builder.append("\t\r\n"); - } + extends AbstractPanel { + + private JLabel taskLimitationsLabel; + + private JLabel articleTaskLabel; + private JTextField articleTaskLimitField; + + private JLabel diffTaskLabel; + private 
JTextField diffTaskLimitField; + + private JLabel sqlProducerLimitationsLabel; + + private JLabel maxAllowedPacketLabel; + private JTextField maxAllowedPacketField; + + /** + * (Constructor) Creates a new CachePanel. + * + * @param controller Reference to the controller + */ + public CachePanel(final ConfigController controller) { + + super(controller); + controller.register(PanelKeys.PANEL_CACHE, this); + + createTaskSettings(); + createSQLProducerSettings(); + } + + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// + + private void createTaskSettings() { + + taskLimitationsLabel = new JLabel("Task Limitations (in byte)"); + taskLimitationsLabel.setBounds(10, 10, 250, 25); + this.add(taskLimitationsLabel); + + articleTaskLabel = new JLabel("Article-Task: "); + articleTaskLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + articleTaskLabel.setBounds(10, 40, 100, 25); + this.add(articleTaskLabel); + + articleTaskLimitField = new JTextField(); + articleTaskLimitField.setBounds(120, 40, 200, 25); + this.add(articleTaskLimitField); + + diffTaskLabel = new JLabel("Diff-Task: "); + diffTaskLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + diffTaskLabel.setBounds(10, 70, 100, 25); + this.add(diffTaskLabel); + + diffTaskLimitField = new JTextField(); + diffTaskLimitField.setBounds(120, 70, 200, 25); + this.add(diffTaskLimitField); + } + + + private void createSQLProducerSettings() { + + sqlProducerLimitationsLabel = new JLabel( + "SQLProducer Limitations (in byte)"); + sqlProducerLimitationsLabel.setBounds(10, 210, 250, 25); + this.add(sqlProducerLimitationsLabel); + + maxAllowedPacketLabel = new JLabel("MAX_ALLOWED_PACKET"); + maxAllowedPacketLabel + .setBorder(BorderFactory.createRaisedBevelBorder()); + maxAllowedPacketLabel.setBounds(10, 240, 160, 25); + this.add(maxAllowedPacketLabel); + + maxAllowedPacketField = 
new JTextField(); + maxAllowedPacketField.setBounds(180, 240, 140, 25); + this.add(maxAllowedPacketField); + } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the panels + * components. + */ + @Override + public void validate() { + + } + + /** + * A call of this method should validate the positions of the panels + * components. + */ + @Override + public void relocate() { + + int w = 310, h = 255; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + taskLimitationsLabel.setLocation(x, y); + articleTaskLabel.setLocation(x, y + 30); + articleTaskLimitField.setLocation(x + 110, y + 30); + diffTaskLabel.setLocation(x, y + 60); + diffTaskLimitField.setLocation(x + 110, y + 60); + + sqlProducerLimitationsLabel.setLocation(x, y + 100); + maxAllowedPacketLabel.setLocation(x, y + 130); + maxAllowedPacketField.setLocation(x + 170, y + 130); + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. 
+ * + * @param config Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) { + + Object o = config + .getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS); + if (o != null) { + this.articleTaskLimitField.setText(Long.toString((Long) o)); + } else { + this.articleTaskLimitField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS); + if (o != null) { + this.diffTaskLimitField.setText(Long.toString((Long) o)); + } else { + this.diffTaskLimitField.setText(""); + } + + o = config + .getConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET); + if (o != null) { + this.maxAllowedPacketField.setText(Long.toString((Long) o)); + } else { + this.maxAllowedPacketField.setText(""); + } + } + + /** + * Adds the xml description of the panels content to the StringBuilder. + * Errors which occur during the xml transformation will be added to the + * ConfigVerification. + * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + + long tasksizeRevisions = -1, tasksizeDiffs = -1, maxAllowedPacket = -1; + + // Check the ArticleTask size input + String text = this.articleTaskLimitField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, + "The value for the size of ArticleTasks" + " is missing.")); + } else { + try { + tasksizeRevisions = Long.parseLong(text); + if (tasksizeRevisions < 1000000) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The value for the size of an " + + "ArticleTask has to be at least " + + "1000000 Byte.")); + } + } catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for the 
size of" + + " ArticleTasks")); + } + } + + // Check the DiffTask size input + text = this.diffTaskLimitField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, + "The value for the size of DiffTasks" + " is missing.")); + } else { + try { + tasksizeDiffs = Long.parseLong(text); + if (tasksizeDiffs < 1000000) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The value for the size of a DiffTask " + + "has to be at least 1000000 Byte.")); + } + } catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for the size of" + " DiffTasks")); + } + } + + // Check the SQLProducer MaxAllowedPacket input + text = this.maxAllowedPacketField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, + "The value for SQLProducer MaxAllowedPacket" + + " is missing.")); + } else { + try { + maxAllowedPacket = Long.parseLong(text); + if (maxAllowedPacket < 1000000) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The value for SQLProducer " + + "MaxAllowedPacket should be at least" + + " 1000000 Byte.")); + } + } catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for the size of" + + " SQLProducer MaxAllowedPacket")); + } + } + + builder.append("\t\r\n"); + builder.append("\t\t" + tasksizeRevisions + + "\r\n"); + builder.append("\t\t" + tasksizeDiffs + + "\r\n"); + builder.append("\t\t" + + maxAllowedPacket + + "\r\n"); + + builder.append("\t\r\n"); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ConfigPanel.java 
b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ConfigPanel.java index 6ff2826b..5b79dbe1 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ConfigPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ConfigPanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,162 +27,148 @@ /** * Panel of the ConfigGUI Contains a tabbed panel with reference to all the * other panels. - * - * - * */ @SuppressWarnings("serial") public class ConfigPanel - extends AbstractPanel -{ + extends AbstractPanel { - private JTabbedPane tabs; + private JTabbedPane tabs; - private JButton importButton; - private JButton verifyButton; - private JButton exportButton; + private JButton importButton; + private JButton verifyButton; + private JButton exportButton; - /** - * (Constructor) Creates a new ConfigPanel. - * - * @param controller - * Reference to the controller - */ - public ConfigPanel(final ConfigController controller) - { + /** + * (Constructor) Creates a new ConfigPanel. 
+ * + * @param controller Reference to the controller + */ + public ConfigPanel(final ConfigController controller) { - super(controller); + super(controller); - createTabbedPane(); + createTabbedPane(); - createImportButton(); - createVerifyButton(); - createExportButton(); - } + createImportButton(); + createVerifyButton(); + createExportButton(); + } - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// - private void createTabbedPane() - { + private void createTabbedPane() { - tabs = new JTabbedPane(); - tabs.setBounds(5, 5, 580, 300); + tabs = new JTabbedPane(); + tabs.setBounds(5, 5, 580, 300); - tabs.add("Mode", new ModePanel(controller)); - tabs.add("Externals", new ExternalProgramsPanel(controller)); - tabs.add("Input", new InputPanel(controller)); - tabs.add("Output", new OutputPanel(controller)); - tabs.add("Database", new SQLPanel(controller)); - tabs.add("Cache", new CachePanel(controller)); - tabs.add("Logging", new LoggingPanel(controller)); - tabs.add("Debug", new DebugPanel(controller)); - tabs.add("Filter", new FilterPanel(controller)); + tabs.add("Mode", new ModePanel(controller)); + tabs.add("Externals", new ExternalProgramsPanel(controller)); + tabs.add("Input", new InputPanel(controller)); + tabs.add("Output", new OutputPanel(controller)); + tabs.add("Database", new SQLPanel(controller)); + tabs.add("Cache", new CachePanel(controller)); + tabs.add("Logging", new LoggingPanel(controller)); + tabs.add("Debug", new DebugPanel(controller)); + tabs.add("Filter", new FilterPanel(controller)); - this.add(tabs); + this.add(tabs); - } + } - private void createImportButton() - { + private void createImportButton() { - 
importButton = new JButton("Import"); - importButton.setBounds(5, 310, 190, 25); + importButton = new JButton("Import"); + importButton.setBounds(5, 310, 190, 25); - importButton.addActionListener(e -> { + importButton.addActionListener(e -> { controller.loadConfiguration(); repaint(); }); - this.add(importButton); + this.add(importButton); - } + } - private void createVerifyButton() - { + private void createVerifyButton() { - verifyButton = new JButton("Verify Settings"); - verifyButton.setBounds(200, 310, 190, 25); + verifyButton = new JButton("Verify Settings"); + verifyButton.setBounds(200, 310, 190, 25); - verifyButton.addActionListener(e -> { + verifyButton.addActionListener(e -> { controller.createConfigurationXML(); repaint(); }); - this.add(verifyButton); - } + this.add(verifyButton); + } - private void createExportButton() - { + private void createExportButton() { - exportButton = new JButton("Export"); - exportButton.setBounds(395, 310, 190, 25); + exportButton = new JButton("Export"); + exportButton.setBounds(395, 310, 190, 25); - exportButton.addActionListener(e -> { + exportButton.addActionListener(e -> { controller.saveConfiguration(); repaint(); }); - this.add(exportButton); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * empty method - */ - @Override - public void validate() - { - - } - - /** - * A call of this method should validate the positions of the panels - * components. 
- */ - @Override - public void relocate() - { - - int w = 575, h = 330; - - int x = (this.getWidth() - w) / 2; - int y = (this.getHeight() - h) / 2; - - tabs.setLocation(x, y); - - importButton.setLocation(x, y + 305); - verifyButton.setLocation(x + 195, y + 305); - exportButton.setLocation(x + 390, y + 305); - - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * @deprecated - */ - @Deprecated - @Override - public void applyConfig(final ConfigSettings config) - { - throw new UnsupportedOperationException(); - } - - /** - * @deprecated - */ - @Deprecated - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - throw new UnsupportedOperationException(); - } + this.add(exportButton); + } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * empty method + */ + @Override + public void validate() { + + } + + /** + * A call of this method should validate the positions of the panels + * components. 
+ */ + @Override + public void relocate() { + + int w = 575, h = 330; + + int x = (this.getWidth() - w) / 2; + int y = (this.getHeight() - h) / 2; + + tabs.setLocation(x, y); + + importButton.setLocation(x, y + 305); + verifyButton.setLocation(x + 195, y + 305); + exportButton.setLocation(x + 390, y + 305); + + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * @deprecated + */ + @Deprecated + @Override + public void applyConfig(final ConfigSettings config) { + throw new UnsupportedOperationException(); + } + + /** + * @deprecated + */ + @Deprecated + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + throw new UnsupportedOperationException(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/DebugPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/DebugPanel.java index 618a9d96..ea21ea18 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/DebugPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/DebugPanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -38,48 +38,41 @@ *

* This panel contains all components for setting configuration parameters * related to the debug purposes. - * - * - * */ @SuppressWarnings("serial") public class DebugPanel - extends AbstractPanel -{ + extends AbstractPanel { - private JCheckBox verifyDiffCheckBox; - private JCheckBox verifyEncodingCheckBox; + private JCheckBox verifyDiffCheckBox; + private JCheckBox verifyEncodingCheckBox; - private JCheckBox debugOuputCheckBox; - private JLabel debugOutputLabel; - private JTextField debugOutputField; + private JCheckBox debugOuputCheckBox; + private JLabel debugOutputLabel; + private JTextField debugOutputField; - private JCheckBox statsOutputCheckBox; + private JCheckBox statsOutputCheckBox; - /** - * (Constructor) Creates a new DebugPanel. - * - * @param controller - * Reference to the controller - */ - public DebugPanel(final ConfigController controller) - { + /** + * (Constructor) Creates a new DebugPanel. + * + * @param controller Reference to the controller + */ + public DebugPanel(final ConfigController controller) { - super(controller); - controller.register(PanelKeys.PANEL_DEBUG, this); + super(controller); + controller.register(PanelKeys.PANEL_DEBUG, this); - createVerificationSettings(); - createStatsOutputSettings(); - createDebugSettings(); - } + createVerificationSettings(); + createStatsOutputSettings(); + createDebugSettings(); + } - public void createVerificationSettings() - { + public void createVerificationSettings() { - verifyDiffCheckBox = new JCheckBox("Activate Diff Verification"); - verifyDiffCheckBox.setBounds(10, 10, 200, 25); + verifyDiffCheckBox = new JCheckBox("Activate Diff Verification"); + verifyDiffCheckBox.setBounds(10, 10, 200, 25); - verifyDiffCheckBox.addActionListener(e -> { + verifyDiffCheckBox.addActionListener(e -> { boolean flag = !controller.isDiffVerificationEnabled(); controller.setEnableDiffVerification(flag); @@ -87,12 +80,12 @@ public void createVerificationSettings() validateDebugSettings(); }); - 
this.add(verifyDiffCheckBox); + this.add(verifyDiffCheckBox); - verifyEncodingCheckBox = new JCheckBox("Activate Encoding Verification"); - verifyEncodingCheckBox.setBounds(10, 40, 200, 25); + verifyEncodingCheckBox = new JCheckBox("Activate Encoding Verification"); + verifyEncodingCheckBox.setBounds(10, 40, 200, 25); - verifyEncodingCheckBox.addActionListener(e -> { + verifyEncodingCheckBox.addActionListener(e -> { boolean flag = !controller.isEncodingVerificationEnabled(); controller.setEnableEncodingVerification(flag); @@ -100,32 +93,30 @@ public void createVerificationSettings() validateDebugSettings(); }); - this.add(verifyEncodingCheckBox); - } + this.add(verifyEncodingCheckBox); + } - private void createStatsOutputSettings() - { - statsOutputCheckBox = new JCheckBox( - "Activate Article Information Output"); - statsOutputCheckBox.setBounds(10, 80, 250, 25); + private void createStatsOutputSettings() { + statsOutputCheckBox = new JCheckBox( + "Activate Article Information Output"); + statsOutputCheckBox.setBounds(10, 80, 250, 25); - statsOutputCheckBox.addActionListener(e -> { + statsOutputCheckBox.addActionListener(e -> { boolean flag = !controller.isStatsOutputEnabled(); controller.setEnableStatsOutput(flag); }); - this.add(statsOutputCheckBox); - } + this.add(statsOutputCheckBox); + } - private void createDebugSettings() - { + private void createDebugSettings() { - debugOuputCheckBox = new JCheckBox("Activate Debug Output"); - debugOuputCheckBox.setBounds(10, 120, 200, 25); - this.add(debugOuputCheckBox); + debugOuputCheckBox = new JCheckBox("Activate Debug Output"); + debugOuputCheckBox.setBounds(10, 120, 200, 25); + this.add(debugOuputCheckBox); - debugOuputCheckBox.addActionListener(e -> { + debugOuputCheckBox.addActionListener(e -> { boolean flag = !controller.isDebugOutputEnabled(); controller.setEnableDebugOutput(flag); @@ -133,186 +124,174 @@ private void createDebugSettings() validateDebugSettings(); }); - debugOutputLabel = new JLabel("Debug 
Folder: "); - debugOutputLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - debugOutputLabel.setBounds(10, 150, 100, 25); - this.add(debugOutputLabel); - - debugOutputField = new JTextField(); - debugOutputField.setBounds(120, 150, 250, 25); - this.add(debugOutputField); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() - { - validateDebugSettings(); - } - - /** - * Validates the debug settings. - */ - private void validateDebugSettings() - { - - verifyDiffCheckBox.setSelected(controller.isDiffVerificationEnabled()); - verifyEncodingCheckBox.setSelected(controller - .isEncodingVerificationEnabled()); - statsOutputCheckBox.setSelected(controller.isStatsOutputEnabled()); - - boolean flagA = controller.isDiffVerificationEnabled() - || controller.isEncodingVerificationEnabled(); - - debugOuputCheckBox.setEnabled(flagA); - debugOuputCheckBox.setSelected(controller.isDebugOutputEnabled()); - - boolean flagB = controller.isDebugOutputEnabled(); - debugOutputLabel.setEnabled(flagA && flagB); - debugOutputField.setEnabled(flagA && flagB); - - } - - /** - * A call of this method should validate the positions of the panels - * components. 
- */ - @Override - public void relocate() - { - - int w = 360, h = 165; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - verifyDiffCheckBox.setLocation(x, y); - verifyEncodingCheckBox.setLocation(x, y + 30); - - statsOutputCheckBox.setLocation(x, y + 70); - - debugOuputCheckBox.setLocation(x, y + 110); - debugOutputLabel.setLocation(x, y + 140); - debugOutputField.setLocation(x + 110, y + 140); - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config - * Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) - { - Object o = config - .getConfigParameter(ConfigurationKeys.VERIFICATION_DIFF); - if (o != null) { - controller.setEnableDiffVerification((Boolean) o); - } - else { - controller.setEnableDiffVerification(false); - } - - o = config.getConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING); - if (o != null) { - controller.setEnableEncodingVerification((Boolean) o); - } - else { - controller.setEnableEncodingVerification(false); - } - - o = config - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - if (o != null) { - controller.setEnableStatsOutput((Boolean) o); - } - else { - controller.setEnableStatsOutput(false); - } - - o = config.getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); - if (o != null) { - controller.setEnableDebugOutput(true); - this.debugOutputField.setText((String) o); - } - else { - controller.setEnableDebugOutput(false); - this.debugOutputField.setText(""); - } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. 
- * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - - boolean verifyDiff = controller.isDiffVerificationEnabled(); - boolean verifyEncoding = controller.isEncodingVerificationEnabled(); - boolean statsOutput = controller.isStatsOutputEnabled(); - boolean debugOutput = controller.isDebugOutputEnabled(); - - if (verifyDiff || verifyEncoding || statsOutput || debugOutput) { - - builder.append("\t\r\n"); - - if (verifyDiff) { - builder.append("\t\t" + verifyDiff - + "\r\n"); - } - - if (verifyEncoding) { - builder.append("\t\t" + verifyEncoding - + "\r\n"); - } - - if (statsOutput) { - builder.append("\t\t" + statsOutput - + "\r\n"); - } - - builder.append("\t\t\r\n"); // \"" + path + - // "\"\r\n"); - builder.append("\t\t\t" + debugOutput + "\r\n"); - - if (debugOutput) { - - String path = debugOutputField.getText(); - if (path.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.PATH_NOT_SET, - "The folder of the debug output is not specified.")); - } - if (!path.endsWith(File.separator) - && path.contains(File.separator)) { - path += File.separator; - } - - builder.append("\t\t\t\"" + path + "\"\r\n"); - } - - builder.append("\t\t\r\n"); - builder.append("\t\r\n"); - } - } + debugOutputLabel = new JLabel("Debug Folder: "); + debugOutputLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + debugOutputLabel.setBounds(10, 150, 100, 25); + this.add(debugOutputLabel); + + debugOutputField = new JTextField(); + debugOutputField.setBounds(120, 150, 250, 25); + this.add(debugOutputField); + } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the 
panels + * components. + */ + @Override + public void validate() { + validateDebugSettings(); + } + + /** + * Validates the debug settings. + */ + private void validateDebugSettings() { + + verifyDiffCheckBox.setSelected(controller.isDiffVerificationEnabled()); + verifyEncodingCheckBox.setSelected(controller + .isEncodingVerificationEnabled()); + statsOutputCheckBox.setSelected(controller.isStatsOutputEnabled()); + + boolean flagA = controller.isDiffVerificationEnabled() + || controller.isEncodingVerificationEnabled(); + + debugOuputCheckBox.setEnabled(flagA); + debugOuputCheckBox.setSelected(controller.isDebugOutputEnabled()); + + boolean flagB = controller.isDebugOutputEnabled(); + debugOutputLabel.setEnabled(flagA && flagB); + debugOutputField.setEnabled(flagA && flagB); + + } + + /** + * A call of this method should validate the positions of the panels + * components. + */ + @Override + public void relocate() { + + int w = 360, h = 165; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + verifyDiffCheckBox.setLocation(x, y); + verifyEncodingCheckBox.setLocation(x, y + 30); + + statsOutputCheckBox.setLocation(x, y + 70); + + debugOuputCheckBox.setLocation(x, y + 110); + debugOutputLabel.setLocation(x, y + 140); + debugOutputField.setLocation(x + 110, y + 140); + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. 
+ * + * @param config Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) { + Object o = config + .getConfigParameter(ConfigurationKeys.VERIFICATION_DIFF); + if (o != null) { + controller.setEnableDiffVerification((Boolean) o); + } else { + controller.setEnableDiffVerification(false); + } + + o = config.getConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING); + if (o != null) { + controller.setEnableEncodingVerification((Boolean) o); + } else { + controller.setEnableEncodingVerification(false); + } + + o = config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + if (o != null) { + controller.setEnableStatsOutput((Boolean) o); + } else { + controller.setEnableStatsOutput(false); + } + + o = config.getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); + if (o != null) { + controller.setEnableDebugOutput(true); + this.debugOutputField.setText((String) o); + } else { + controller.setEnableDebugOutput(false); + this.debugOutputField.setText(""); + } + } + + /** + * Adds the xml description of the panels content to the StringBuilder. + * Errors which occur during the xml transformation will be added to the + * ConfigVerification. 
+ * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + + boolean verifyDiff = controller.isDiffVerificationEnabled(); + boolean verifyEncoding = controller.isEncodingVerificationEnabled(); + boolean statsOutput = controller.isStatsOutputEnabled(); + boolean debugOutput = controller.isDebugOutputEnabled(); + + if (verifyDiff || verifyEncoding || statsOutput || debugOutput) { + + builder.append("\t\r\n"); + + if (verifyDiff) { + builder.append("\t\t" + verifyDiff + + "\r\n"); + } + + if (verifyEncoding) { + builder.append("\t\t" + verifyEncoding + + "\r\n"); + } + + if (statsOutput) { + builder.append("\t\t" + statsOutput + + "\r\n"); + } + + builder.append("\t\t\r\n"); // \"" + path + + // "\"\r\n"); + builder.append("\t\t\t" + debugOutput + "\r\n"); + + if (debugOutput) { + + String path = debugOutputField.getText(); + if (path.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.PATH_NOT_SET, + "The folder of the debug output is not specified.")); + } + if (!path.endsWith(File.separator) + && path.contains(File.separator)) { + path += File.separator; + } + + builder.append("\t\t\t\"" + path + "\"\r\n"); + } + + builder.append("\t\t\r\n"); + builder.append("\t\r\n"); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ExternalProgramsPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ExternalProgramsPanel.java index 5a536dcb..e34339fa 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ExternalProgramsPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ExternalProgramsPanel.java @@ -2,13 +2,13 @@ * Licensed to the 
Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -39,70 +39,63 @@ *

* This panel contains all components for setting configuration parameters * related to the use of external programs. - * - * - * */ @SuppressWarnings("serial") public class ExternalProgramsPanel - extends AbstractPanel -{ + extends AbstractPanel { - private JLabel executablePathLabel; + private JLabel executablePathLabel; - private JCheckBox sevenZipEnableBox; - private JLabel sevenZipLabel; - private JTextField sevenZipPathField; - private JButton sevenZipSearchButton; + private JCheckBox sevenZipEnableBox; + private JLabel sevenZipLabel; + private JTextField sevenZipPathField; + private JButton sevenZipSearchButton; - /** - * (Constructor) Creates a new ExternalProgramPanel. - * - * @param controller - * Reference to the controller - */ - public ExternalProgramsPanel(final ConfigController controller) - { + /** + * (Constructor) Creates a new ExternalProgramPanel. + * + * @param controller Reference to the controller + */ + public ExternalProgramsPanel(final ConfigController controller) { - super(controller); - controller.register(PanelKeys.PANEL_EXTERNALS, this); + super(controller); + controller.register(PanelKeys.PANEL_EXTERNALS, this); - createExecutableSettings(); - } + createExecutableSettings(); + } - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// - private void createExecutableSettings() - { + private void createExecutableSettings() { - executablePathLabel = new JLabel("Path to executables: "); - executablePathLabel.setBounds(10, 10, 250, 25); - this.add(executablePathLabel); + executablePathLabel = new JLabel("Path to executables: "); + executablePathLabel.setBounds(10, 10, 250, 25); + this.add(executablePathLabel); - // 
------------------------------------------------------------------// - // 7ZIP / P7ZIP SETTINGS // - // ------------------------------------------------------------------// + // ------------------------------------------------------------------// + // 7ZIP / P7ZIP SETTINGS // + // ------------------------------------------------------------------// - sevenZipEnableBox = new JCheckBox(); - sevenZipEnableBox.setBounds(10, 45, 25, 25); + sevenZipEnableBox = new JCheckBox(); + sevenZipEnableBox.setBounds(10, 45, 25, 25); - this.add(sevenZipEnableBox); + this.add(sevenZipEnableBox); - sevenZipLabel = new JLabel("7Zip Executable: "); - sevenZipLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sevenZipLabel.setBounds(40, 45, 120, 25); - this.add(sevenZipLabel); + sevenZipLabel = new JLabel("7Zip Executable: "); + sevenZipLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sevenZipLabel.setBounds(40, 45, 120, 25); + this.add(sevenZipLabel); - sevenZipPathField = new JTextField(); - sevenZipPathField.setBounds(170, 45, 300, 25); - this.add(sevenZipPathField); + sevenZipPathField = new JTextField(); + sevenZipPathField.setBounds(170, 45, 300, 25); + this.add(sevenZipPathField); - sevenZipSearchButton = new JButton("Search"); - sevenZipSearchButton.setBounds(480, 45, 80, 25); + sevenZipSearchButton = new JButton("Search"); + sevenZipSearchButton.setBounds(480, 45, 80, 25); - sevenZipSearchButton.addActionListener(e -> { + sevenZipSearchButton.addActionListener(e -> { JFileChooser fc = new JFileChooser(); if (fc.showOpenDialog(new JPanel()) == JFileChooser.APPROVE_OPTION) { @@ -110,9 +103,9 @@ private void createExecutableSettings() } }); - this.add(sevenZipSearchButton); + this.add(sevenZipSearchButton); - sevenZipEnableBox.addActionListener(e -> { + sevenZipEnableBox.addActionListener(e -> { boolean flag = !controller.is7ZipEnabled(); controller.setEnable7Zip(flag); @@ -121,120 +114,111 @@ private void createExecutableSettings() 
sevenZipSearchButton.setEnabled(flag); }); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() - { - validate7ZipSettings(); - } - - /** - * Validates the 7Zip settings - */ - private void validate7ZipSettings() - { - boolean flag = controller.is7ZipEnabled(); - - sevenZipEnableBox.setSelected(flag); - sevenZipLabel.setEnabled(flag); - sevenZipPathField.setEnabled(flag); - sevenZipSearchButton.setEnabled(flag); - } - - - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() - { - - int w = 550, h = 210; - - int x = (this.getWidth() - w) / 2; - int y = (this.getHeight() - h) / 2; - - // 10, 10 <-> 580, 185 - executablePathLabel.setLocation(x, y); - - sevenZipEnableBox.setLocation(x, y + 35); - sevenZipLabel.setLocation(x + 30, y + 35); - sevenZipPathField.setLocation(x + 160, y + 35); - sevenZipSearchButton.setLocation(x + 470, y + 35); - - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. 
- * - * @param config - * Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) - { - Object o = config - .getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); - if (o != null) { - controller.setEnable7Zip(true); - sevenZipPathField.setText((String) o); - } - else { - controller.setEnable7Zip(false); - sevenZipPathField.setText(""); - } - - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - - boolean sevenzip = controller.is7ZipEnabled(); - - if (sevenzip) { - - String cmd; - builder.append("\t\r\n"); - - if (sevenzip) { - cmd = sevenZipPathField.getText(); - if (cmd.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.PATH_NOT_SET, - "The path to the 7Zip executable" + " is missing.")); - } - - builder.append("\t\t\"" + cmd + "\"\r\n"); - } - - builder.append("\t\r\n"); - } - } + } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the panels + * components. + */ + @Override + public void validate() { + validate7ZipSettings(); + } + + /** + * Validates the 7Zip settings + */ + private void validate7ZipSettings() { + boolean flag = controller.is7ZipEnabled(); + + sevenZipEnableBox.setSelected(flag); + sevenZipLabel.setEnabled(flag); + sevenZipPathField.setEnabled(flag); + sevenZipSearchButton.setEnabled(flag); + } + + + /** + * A call of this method should validate the positions of the panels + * components. 
+ */ + @Override + public void relocate() { + + int w = 550, h = 210; + + int x = (this.getWidth() - w) / 2; + int y = (this.getHeight() - h) / 2; + + // 10, 10 <-> 580, 185 + executablePathLabel.setLocation(x, y); + + sevenZipEnableBox.setLocation(x, y + 35); + sevenZipLabel.setLocation(x + 30, y + 35); + sevenZipPathField.setLocation(x + 160, y + 35); + sevenZipSearchButton.setLocation(x + 470, y + 35); + + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. + * + * @param config Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) { + Object o = config + .getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); + if (o != null) { + controller.setEnable7Zip(true); + sevenZipPathField.setText((String) o); + } else { + controller.setEnable7Zip(false); + sevenZipPathField.setText(""); + } + + } + + /** + * Adds the xml description of the panels content to the StringBuilder. + * Errors which occur during the xml transformation will be added to the + * ConfigVerification. 
+ * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + + boolean sevenzip = controller.is7ZipEnabled(); + + if (sevenzip) { + + String cmd; + builder.append("\t\r\n"); + + if (sevenzip) { + cmd = sevenZipPathField.getText(); + if (cmd.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.PATH_NOT_SET, + "The path to the 7Zip executable" + " is missing.")); + } + + builder.append("\t\t\"" + cmd + "\"\r\n"); + } + + builder.append("\t\r\n"); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/FilterPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/FilterPanel.java index 9b438dcb..85aac50a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/FilterPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/FilterPanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -40,219 +40,200 @@ */ @SuppressWarnings("serial") public class FilterPanel - extends AbstractPanel -{ - // table with namespaces to filter - private JTable namespaces; - - /** - * (Constructor) Creates a new SurrogatePanel - * - * @param controller - * Reference to the controller - */ - public FilterPanel(ConfigController controller) - { - super(controller); - - controller.register(PanelKeys.PANEL_FILTER, this); - - initTable(); - - initButtons(); - - // init label - JLabel hint = new JLabel(); - hint.setText("If nothing is selected,
all namespaces are allowed."); - hint.setBounds(385, 70, 180, 60); - this.add(hint); - } - - /** - * Initialize JTable that contains namespaces - */ - private void initTable() - { - namespaces = new JTable(new FilterTableModel()); - - namespaces.removeColumn(namespaces.getColumn("#")); - - namespaces.setFillsViewportHeight(true); - namespaces.setPreferredScrollableViewportSize(new Dimension(500, 70)); - - // Create the scroll pane and add the table to it. - JScrollPane scrollPane = new JScrollPane(namespaces); - - scrollPane.setBounds(70, 10, 300, 200); - this.add(scrollPane); - } - - /** - * Initialize two buttons: SelectAll and UnselectAll - */ - private void initButtons() - { - JButton selectAll = new JButton("Select all"); - selectAll.addActionListener(arg0 -> { + extends AbstractPanel { + // table with namespaces to filter + private JTable namespaces; + + /** + * (Constructor) Creates a new SurrogatePanel + * + * @param controller Reference to the controller + */ + public FilterPanel(ConfigController controller) { + super(controller); + + controller.register(PanelKeys.PANEL_FILTER, this); + + initTable(); + + initButtons(); + + // init label + JLabel hint = new JLabel(); + hint.setText("If nothing is selected,
all namespaces are allowed."); + hint.setBounds(385, 70, 180, 60); + this.add(hint); + } + + /** + * Initialize JTable that contains namespaces + */ + private void initTable() { + namespaces = new JTable(new FilterTableModel()); + + namespaces.removeColumn(namespaces.getColumn("#")); + + namespaces.setFillsViewportHeight(true); + namespaces.setPreferredScrollableViewportSize(new Dimension(500, 70)); + + // Create the scroll pane and add the table to it. + JScrollPane scrollPane = new JScrollPane(namespaces); + + scrollPane.setBounds(70, 10, 300, 200); + this.add(scrollPane); + } + + /** + * Initialize two buttons: SelectAll and UnselectAll + */ + private void initButtons() { + JButton selectAll = new JButton("Select all"); + selectAll.addActionListener(arg0 -> { for (int i = 0; i < 22; i++) { namespaces.getModel().setValueAt(true, i, 1); } }); - selectAll.setBounds(380, 10, 120, 25); - this.add(selectAll); + selectAll.setBounds(380, 10, 120, 25); + this.add(selectAll); - JButton unselectAll = new JButton("Unselect all"); + JButton unselectAll = new JButton("Unselect all"); - unselectAll.addActionListener(e -> { + unselectAll.addActionListener(e -> { for (int i = 0; i < 22; i++) { namespaces.getModel().setValueAt(false, i, 1); } }); - unselectAll.setBounds(380, 40, 120, 25); - this.add(unselectAll); - } - - @Override - public void validate() - { - - } - - @Override - public void relocate() - { - - } - - @Override - public void toXML(StringBuilder builder, ConfigVerification errors) - { - builder.append("\t\r\n"); - builder.append("\t\t\r\n"); - int rows = this.namespaces.getModel().getRowCount(); - for (int j = 0; j < rows; j++) { - - if (this.namespaces.getModel().getValueAt(j, 1).equals(true)) { - builder.append("\t\t\t"); - builder.append(this.namespaces.getModel().getValueAt(j, 2)); - builder.append("\r\n"); - } - - } - - builder.append("\t\t\r\n"); - builder.append("\t\r\n"); - - } - - @Override - public void applyConfig(ConfigSettings config) - { - 
@SuppressWarnings("unchecked") - Set namespaces = (Set) config - .getConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP); - - if (namespaces != null) { - - int rows = this.namespaces.getModel().getRowCount(); - for (int j = 0; j < rows; j++) { - if (namespaces.contains((this.namespaces.getModel().getValueAt( - j, 2)))) { - this.namespaces.getModel().setValueAt(true, j, - 1); - } - else { - this.namespaces.getModel().setValueAt(false, - j, 1); - } - - } - - } - - } - - /** - * Custom model for JTable that contains a list of namespaces to filter - * - */ - class FilterTableModel - extends AbstractTableModel - { - private final String[] columnNames = { "Namespace", "Allow", "#" }; - - private final Object[][] data = { { "main(0)", false, 0 }, - { "talk(1)", false, 1 }, - { "user(2)", false, 2 }, - { "user talk(3)", false, 3 }, - { "wikipedia(4)", false, 4 }, - { "wikipedia talk(5)", false, 5 }, - { "file(6)", false, 6 }, - { "file talk(7)", false, 7 }, - { "mediawiki(8)", false, 8 }, - { "mediawiki talk(9)", false, 9 }, - { "template(10)", false, 10 }, - { "template talk(11)", false, 11 }, - { "help(12)", false, 12 }, - { "help talk(13)", false, 13 }, - { "category(14)", false, 14 }, - { "category talk(15)", false, 15 }, - { "portal(100)", false, 100 }, - { "portal talk(101)", false, 101 }, - { "book(108)", false, 108 }, - { "book talk(109)", false, 109 }, - { "special(-1)", false, -1 }, - { "media(-2)", false, -2 } - - }; - - @Override - public int getColumnCount() - { - return columnNames.length; - } - - @Override - public int getRowCount() - { - return data.length; - } - - @Override - public String getColumnName(int col) - { - return columnNames[col]; - } - - @Override - public Object getValueAt(int row, int col) - { - return data[row][col]; - } - - @SuppressWarnings({ "unchecked", "rawtypes" }) - @Override - public Class getColumnClass(int c) - { - return getValueAt(0, c).getClass(); - } - - @Override - public boolean isCellEditable(int row, int col) - { - 
return true; - } - - @Override - public void setValueAt(Object value, int row, int col) - { - data[row][col] = value; - fireTableCellUpdated(row, col); - } - - } + unselectAll.setBounds(380, 40, 120, 25); + this.add(unselectAll); + } + + @Override + public void validate() { + + } + + @Override + public void relocate() { + + } + + @Override + public void toXML(StringBuilder builder, ConfigVerification errors) { + builder.append("\t\r\n"); + builder.append("\t\t\r\n"); + int rows = this.namespaces.getModel().getRowCount(); + for (int j = 0; j < rows; j++) { + + if (this.namespaces.getModel().getValueAt(j, 1).equals(true)) { + builder.append("\t\t\t"); + builder.append(this.namespaces.getModel().getValueAt(j, 2)); + builder.append("\r\n"); + } + + } + + builder.append("\t\t\r\n"); + builder.append("\t\r\n"); + + } + + @Override + public void applyConfig(ConfigSettings config) { + @SuppressWarnings("unchecked") + Set namespaces = (Set) config + .getConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP); + + if (namespaces != null) { + + int rows = this.namespaces.getModel().getRowCount(); + for (int j = 0; j < rows; j++) { + if (namespaces.contains((this.namespaces.getModel().getValueAt( + j, 2)))) { + this.namespaces.getModel().setValueAt(true, j, + 1); + } else { + this.namespaces.getModel().setValueAt(false, + j, 1); + } + + } + + } + + } + + /** + * Custom model for JTable that contains a list of namespaces to filter + */ + class FilterTableModel + extends AbstractTableModel { + private final String[] columnNames = {"Namespace", "Allow", "#"}; + + private final Object[][] data = {{"main(0)", false, 0}, + {"talk(1)", false, 1}, + {"user(2)", false, 2}, + {"user talk(3)", false, 3}, + {"wikipedia(4)", false, 4}, + {"wikipedia talk(5)", false, 5}, + {"file(6)", false, 6}, + {"file talk(7)", false, 7}, + {"mediawiki(8)", false, 8}, + {"mediawiki talk(9)", false, 9}, + {"template(10)", false, 10}, + {"template talk(11)", false, 11}, + {"help(12)", false, 12}, + {"help 
talk(13)", false, 13}, + {"category(14)", false, 14}, + {"category talk(15)", false, 15}, + {"portal(100)", false, 100}, + {"portal talk(101)", false, 101}, + {"book(108)", false, 108}, + {"book talk(109)", false, 109}, + {"special(-1)", false, -1}, + {"media(-2)", false, -2} + + }; + + @Override + public int getColumnCount() { + return columnNames.length; + } + + @Override + public int getRowCount() { + return data.length; + } + + @Override + public String getColumnName(int col) { + return columnNames[col]; + } + + @Override + public Object getValueAt(int row, int col) { + return data[row][col]; + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + @Override + public Class getColumnClass(int c) { + return getValueAt(0, c).getClass(); + } + + @Override + public boolean isCellEditable(int row, int col) { + return true; + } + + @Override + public void setValueAt(Object value, int row, int col) { + data[row][col] = value; + fireTableCellUpdated(row, col); + } + + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/InputPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/InputPanel.java index d5e4dd86..17ebcdbd 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/InputPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/InputPanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -45,82 +45,71 @@ *

* This panel contains all components for setting configuration parameters * related to the input data. - * - * - * */ @SuppressWarnings("serial") public class InputPanel - extends AbstractPanel -{ - - /** - * Subpanel of the InputPanel - *

- * Contains the settings related to the surrogate mode - * - * - * - */ - private class SurrogatePanel - extends AbstractPanel - { - - private JLabel surrogateLabel; - private JRadioButton replaceSurrogatesRadioButton; - private JRadioButton faultySurrogatesRadioButton; - private JRadioButton discardSurrogatesRevisionRadioButton; - private JRadioButton discardSurrogatesArticleRadioButton; - - /** - * (Constructor) Creates a new SurrogatePanel - * - * @param controller - * Reference to the controller - */ - public SurrogatePanel(final ConfigController controller) - { - super(controller); - createButtons(); - } - - private void createButtons() - { - surrogateLabel = new JLabel("Surrogate Characters"); - surrogateLabel.setBounds(10, 10, 130, 25); - this.add(surrogateLabel); - - /* - * DEFAULT MODE - */ - discardSurrogatesRevisionRadioButton = new JRadioButton( - "Discard revision"); - discardSurrogatesRevisionRadioButton.setBounds(10, 90, 120, 25); - - discardSurrogatesRevisionRadioButton - .addActionListener(e -> { - - if (controller.getSurrogates() != SurrogateModes.DISCARD_REVISION) { - controller - .setSurrogates(SurrogateModes.DISCARD_REVISION); - } - - validateSurrogateSettings(); - }); - - // pre-activate default mode - discardSurrogatesRevisionRadioButton.setSelected(true); - - this.add(discardSurrogatesRevisionRadioButton); - - /* - * REPLACE-Mode - */ - - replaceSurrogatesRadioButton = new JRadioButton("Replace them"); - replaceSurrogatesRadioButton.setBounds(10, 40, 120, 25); - - replaceSurrogatesRadioButton.addActionListener(e -> { + extends AbstractPanel { + + /** + * Subpanel of the InputPanel + *

+ * Contains the settings related to the surrogate mode + */ + private class SurrogatePanel + extends AbstractPanel { + + private JLabel surrogateLabel; + private JRadioButton replaceSurrogatesRadioButton; + private JRadioButton faultySurrogatesRadioButton; + private JRadioButton discardSurrogatesRevisionRadioButton; + private JRadioButton discardSurrogatesArticleRadioButton; + + /** + * (Constructor) Creates a new SurrogatePanel + * + * @param controller Reference to the controller + */ + public SurrogatePanel(final ConfigController controller) { + super(controller); + createButtons(); + } + + private void createButtons() { + surrogateLabel = new JLabel("Surrogate Characters"); + surrogateLabel.setBounds(10, 10, 130, 25); + this.add(surrogateLabel); + + /* + * DEFAULT MODE + */ + discardSurrogatesRevisionRadioButton = new JRadioButton( + "Discard revision"); + discardSurrogatesRevisionRadioButton.setBounds(10, 90, 120, 25); + + discardSurrogatesRevisionRadioButton + .addActionListener(e -> { + + if (controller.getSurrogates() != SurrogateModes.DISCARD_REVISION) { + controller + .setSurrogates(SurrogateModes.DISCARD_REVISION); + } + + validateSurrogateSettings(); + }); + + // pre-activate default mode + discardSurrogatesRevisionRadioButton.setSelected(true); + + this.add(discardSurrogatesRevisionRadioButton); + + /* + * REPLACE-Mode + */ + + replaceSurrogatesRadioButton = new JRadioButton("Replace them"); + replaceSurrogatesRadioButton.setBounds(10, 40, 120, 25); + + replaceSurrogatesRadioButton.addActionListener(e -> { if (controller.getSurrogates() != SurrogateModes.REPLACE) { controller.setSurrogates(SurrogateModes.REPLACE); @@ -128,16 +117,16 @@ private void createButtons() validateSurrogateSettings(); }); - this.add(replaceSurrogatesRadioButton); + this.add(replaceSurrogatesRadioButton); - /* - * THROW_ERROR-Mode - */ + /* + * THROW_ERROR-Mode + */ - faultySurrogatesRadioButton = new JRadioButton("Throw an error"); - faultySurrogatesRadioButton.setBounds(10, 
65, 120, 25); + faultySurrogatesRadioButton = new JRadioButton("Throw an error"); + faultySurrogatesRadioButton.setBounds(10, 65, 120, 25); - faultySurrogatesRadioButton.addActionListener(e -> { + faultySurrogatesRadioButton.addActionListener(e -> { if (controller.getSurrogates() != SurrogateModes.THROW_ERROR) { controller.setSurrogates(SurrogateModes.THROW_ERROR); @@ -145,50 +134,48 @@ private void createButtons() validateSurrogateSettings(); }); - this.add(faultySurrogatesRadioButton); + this.add(faultySurrogatesRadioButton); - /* - * DISCARD_REST-Mode - */ + /* + * DISCARD_REST-Mode + */ - discardSurrogatesArticleRadioButton = new JRadioButton( - "Discard rest"); - discardSurrogatesArticleRadioButton.setBounds(10, 115, 120, 25); + discardSurrogatesArticleRadioButton = new JRadioButton( + "Discard rest"); + discardSurrogatesArticleRadioButton.setBounds(10, 115, 120, 25); - discardSurrogatesArticleRadioButton - .addActionListener(e -> { + discardSurrogatesArticleRadioButton + .addActionListener(e -> { - if (controller.getSurrogates() != SurrogateModes.DISCARD_REST) { - controller - .setSurrogates(SurrogateModes.DISCARD_REST); - } + if (controller.getSurrogates() != SurrogateModes.DISCARD_REST) { + controller + .setSurrogates(SurrogateModes.DISCARD_REST); + } - validateSurrogateSettings(); - }); - this.add(discardSurrogatesArticleRadioButton); + validateSurrogateSettings(); + }); + this.add(discardSurrogatesArticleRadioButton); - } + } - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() - { - validateSurrogateSettings(); - } + /** + * A call of this method should validate the status of the panels + * components. + */ + @Override + public void validate() { + validateSurrogateSettings(); + } - /** - * Validates the surrogate settings. - */ - private void validateSurrogateSettings() - { + /** + * Validates the surrogate settings. 
+ */ + private void validateSurrogateSettings() { - /* - * TODO Uncomment this code as soon as the surrogate modes are reactivated - */ + /* + * TODO Uncomment this code as soon as the surrogate modes are reactivated + */ // SurrogateModes sur = controller.getSurrogates(); // @@ -201,321 +188,303 @@ private void validateSurrogateSettings() // discardSurrogatesArticleRadioButton // .setSelected(sur == SurrogateModes.DISCARD_REST); - /* - * DEACTIVATE UNSUPPORTED MODES - * TODO: remove config options for - * unsupported surrogates mode. Can be activated again as soon as - * the implementation of these modes have been checked. - * Then also uncomment the original code above. - */ - //BEGIN WORK AROUND FOR DEACTIVATED SURROGATE MODES - faultySurrogatesRadioButton.setEnabled(false); - discardSurrogatesArticleRadioButton.setEnabled(false); - replaceSurrogatesRadioButton.setEnabled(false); - discardSurrogatesRevisionRadioButton.setSelected(true); - //END WORK AROUND FOR DEACTIVATED SURROGATE MODES - - - } - - /** - * A call of this method should validate the positions of the panels - * components. 
- */ - @Override - public void relocate() - { - - int w = 120, h = 130; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - surrogateLabel.setLocation(10, 10); - faultySurrogatesRadioButton.setLocation(x, y + 55); - replaceSurrogatesRadioButton.setLocation(x, y + 30); - discardSurrogatesRevisionRadioButton.setLocation(x, y + 80); - discardSurrogatesArticleRadioButton.setLocation(x, y + 105); - } - - /** - * empty method - * - * @deprecated - * @throws UnsupportedOperationException - */ - @Deprecated - @Override - public void applyConfig(final ConfigSettings config) - { - throw new UnsupportedOperationException(); - } - - /** - * empty method - * - * @deprecated - * @throws UnsupportedOperationException - */ - @Deprecated - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - throw new UnsupportedOperationException(); - } - } - - // --------------------------------------------------------------------------// - // FIELDS & CONSTRUCTORS // - // --------------------------------------------------------------------------// - - private JTable archiveTable; - private JScrollPane archiveScrollPane; - - private JButton addArchiveButton; - private JButton removeArchiveButton; - - private JLabel encodingLabel; - private JTextField encodingField; - - private SurrogatePanel surrogatePanel; - - /** - * (Constructor) Creates a new InputPanel. 
- * - * @param controller - * Reference to the controller - */ - public InputPanel(final ConfigController controller) - { - - super(controller); - controller.register(PanelKeys.PANEL_INPUT, this); - - createArchiveTable(); - createControllButtons(); - createEncodingSettings(); - createSurrogateSettings(); - } - - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// - - private void createArchiveTable() - { - archiveTable = new JTable(controller.getArchives()); - archiveTable.setSelectionMode(ListSelectionModel.SINGLE_SELECTION); - - archiveScrollPane = new JScrollPane(archiveTable); - archiveScrollPane.setBounds(10, 10, 410, 210); - - this.add(archiveScrollPane); - } - - private void createControllButtons() - { - addArchiveButton = new JButton("Add"); - addArchiveButton.setBounds(445, 20, 100, 25); - - addArchiveButton.addActionListener(e -> { + /* + * DEACTIVATE UNSUPPORTED MODES + * TODO: remove config options for + * unsupported surrogates mode. Can be activated again as soon as + * the implementation of these modes have been checked. + * Then also uncomment the original code above. + */ + //BEGIN WORK AROUND FOR DEACTIVATED SURROGATE MODES + faultySurrogatesRadioButton.setEnabled(false); + discardSurrogatesArticleRadioButton.setEnabled(false); + replaceSurrogatesRadioButton.setEnabled(false); + discardSurrogatesRevisionRadioButton.setSelected(true); + //END WORK AROUND FOR DEACTIVATED SURROGATE MODES + + + } + + /** + * A call of this method should validate the positions of the panels + * components. 
+ */ + @Override + public void relocate() { + + int w = 120, h = 130; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + surrogateLabel.setLocation(10, 10); + faultySurrogatesRadioButton.setLocation(x, y + 55); + replaceSurrogatesRadioButton.setLocation(x, y + 30); + discardSurrogatesRevisionRadioButton.setLocation(x, y + 80); + discardSurrogatesArticleRadioButton.setLocation(x, y + 105); + } + + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void applyConfig(final ConfigSettings config) { + throw new UnsupportedOperationException(); + } + + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + throw new UnsupportedOperationException(); + } + } + + // --------------------------------------------------------------------------// + // FIELDS & CONSTRUCTORS // + // --------------------------------------------------------------------------// + + private JTable archiveTable; + private JScrollPane archiveScrollPane; + + private JButton addArchiveButton; + private JButton removeArchiveButton; + + private JLabel encodingLabel; + private JTextField encodingField; + + private SurrogatePanel surrogatePanel; + + /** + * (Constructor) Creates a new InputPanel. 
+ * + * @param controller Reference to the controller + */ + public InputPanel(final ConfigController controller) { + + super(controller); + controller.register(PanelKeys.PANEL_INPUT, this); + + createArchiveTable(); + createControllButtons(); + createEncodingSettings(); + createSurrogateSettings(); + } + + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// + + private void createArchiveTable() { + archiveTable = new JTable(controller.getArchives()); + archiveTable.setSelectionMode(ListSelectionModel.SINGLE_SELECTION); + + archiveScrollPane = new JScrollPane(archiveTable); + archiveScrollPane.setBounds(10, 10, 410, 210); + + this.add(archiveScrollPane); + } + + private void createControllButtons() { + addArchiveButton = new JButton("Add"); + addArchiveButton.setBounds(445, 20, 100, 25); + + addArchiveButton.addActionListener(e -> { new InputDialog(controller).setVisible(true); repaint(); }); - this.add(addArchiveButton); + this.add(addArchiveButton); - removeArchiveButton = new JButton("Remove"); - removeArchiveButton.setBounds(445, 50, 100, 25); + removeArchiveButton = new JButton("Remove"); + removeArchiveButton.setBounds(445, 50, 100, 25); - removeArchiveButton.addActionListener(e -> { + removeArchiveButton.addActionListener(e -> { controller.removeArchive(archiveTable.getSelectedRow()); archiveTable.revalidate(); repaint(); }); - this.add(removeArchiveButton); - } - - private void createEncodingSettings() - { - encodingLabel = new JLabel("Wikipedia Character Encoding: "); - encodingLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - encodingLabel.setBounds(10, 230, 200, 25); - this.add(encodingLabel); - - encodingField = new JTextField(); - encodingField.setBounds(220, 230, 200, 25); - this.add(encodingField); - } - - private void createSurrogateSettings() - { - surrogatePanel = new SurrogatePanel(controller); - 
surrogatePanel.setBorder(BorderFactory.createLoweredBevelBorder()); - surrogatePanel.setBounds(425, 95, 140, 160); - this.add(surrogatePanel); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() - { - - this.archiveTable.revalidate(); - this.surrogatePanel.validate(); - } - - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() - { - - int w = 555, h = 235; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - archiveScrollPane.setLocation(x, y); - - addArchiveButton.setLocation(x + 435, y + 10); - removeArchiveButton.setLocation(x + 435, y + 40); - - encodingLabel.setLocation(x, y + 220); - encodingField.setLocation(x + 210, y + 220); - - surrogatePanel.setLocation(x + 415, y + 85); - - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. 
- * - * @param config - * Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) - { - - Object o = config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - if (o != null) { - encodingField.setText((String) o); - } - else { - encodingField.setText(""); - } - - o = config.getConfigParameter(ConfigurationKeys.MODE_SURROGATES); - if (o != null) { - controller.setSurrogates((SurrogateModes) o); - } - else { - controller.setSurrogates(SurrogateModes.DISCARD_REVISION); - } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - - SurrogateModes surMode = controller.getSurrogates(); - - String wikiEncoding = encodingField.getText(); - if (wikiEncoding.length() == 0) { - - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.MISSING_VALUE, - "The CharacterEncoding was not set.")); - } - - builder.append("\t\r\n"); - builder.append("\t\t" + surMode - + "\r\n"); - builder.append("\t\t" + wikiEncoding - + "\r\n"); - - ArchiveRegistry reg = controller.getArchives(); - - int size = reg.getRowCount(); - - ArchiveDescription archive; - InputType type; - String archivePath; - long start; - - if(size==0){ - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.MISSING_VALUE, - "No source file has been set.")); - } - - for (int i = 0; i < size; i++) { - - archive = reg.get(i); - - type = archive.getType(); - switch (type) { - case XML: - break; - case BZIP2: - //bzip is always enabled - nothing to check here - break; - case SEVENZIP: - if (!controller.is7ZipEnabled()) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - 
ConfigErrorKeys.ILLEGAL_INPUT_FILE, - "The SevenUip mode is not " + "activated")); - } - break; - } - - archivePath = archive.getPath(); - if (archivePath.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.PATH_NOT_SET, - "The archive path is missing")); - } - - start = archive.getStartPosition(); - if (start < 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The archive start value should be at least 0")); - } - - builder.append("\t\t\r\n"); - builder.append("\t\t\t" + type + "\r\n"); - builder.append("\t\t\t\"" + archivePath + "\"\r\n"); - builder.append("\t\t\t" + start + "\r\n"); - builder.append("\t\t\r\n"); - } - builder.append("\t\r\n"); - } + this.add(removeArchiveButton); + } + + private void createEncodingSettings() { + encodingLabel = new JLabel("Wikipedia Character Encoding: "); + encodingLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + encodingLabel.setBounds(10, 230, 200, 25); + this.add(encodingLabel); + + encodingField = new JTextField(); + encodingField.setBounds(220, 230, 200, 25); + this.add(encodingField); + } + + private void createSurrogateSettings() { + surrogatePanel = new SurrogatePanel(controller); + surrogatePanel.setBorder(BorderFactory.createLoweredBevelBorder()); + surrogatePanel.setBounds(425, 95, 140, 160); + this.add(surrogatePanel); + } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the panels + * components. + */ + @Override + public void validate() { + + this.archiveTable.revalidate(); + this.surrogatePanel.validate(); + } + + /** + * A call of this method should validate the positions of the panels + * components. 
+ */ + @Override + public void relocate() { + + int w = 555, h = 235; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + archiveScrollPane.setLocation(x, y); + + addArchiveButton.setLocation(x + 435, y + 10); + removeArchiveButton.setLocation(x + 435, y + 40); + + encodingLabel.setLocation(x, y + 220); + encodingField.setLocation(x + 210, y + 220); + + surrogatePanel.setLocation(x + 415, y + 85); + + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. + * + * @param config Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) { + + Object o = config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + if (o != null) { + encodingField.setText((String) o); + } else { + encodingField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.MODE_SURROGATES); + if (o != null) { + controller.setSurrogates((SurrogateModes) o); + } else { + controller.setSurrogates(SurrogateModes.DISCARD_REVISION); + } + } + + /** + * Adds the xml description of the panels content to the StringBuilder. + * Errors which occur during the xml transformation will be added to the + * ConfigVerification. 
+ * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + + SurrogateModes surMode = controller.getSurrogates(); + + String wikiEncoding = encodingField.getText(); + if (wikiEncoding.length() == 0) { + + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.MISSING_VALUE, + "The CharacterEncoding was not set.")); + } + + builder.append("\t\r\n"); + builder.append("\t\t" + surMode + + "\r\n"); + builder.append("\t\t" + wikiEncoding + + "\r\n"); + + ArchiveRegistry reg = controller.getArchives(); + + int size = reg.getRowCount(); + + ArchiveDescription archive; + InputType type; + String archivePath; + long start; + + if (size == 0) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.MISSING_VALUE, + "No source file has been set.")); + } + + for (int i = 0; i < size; i++) { + + archive = reg.get(i); + + type = archive.getType(); + switch (type) { + case XML: + break; + case BZIP2: + //bzip is always enabled - nothing to check here + break; + case SEVENZIP: + if (!controller.is7ZipEnabled()) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.ILLEGAL_INPUT_FILE, + "The SevenUip mode is not " + "activated")); + } + break; + } + + archivePath = archive.getPath(); + if (archivePath.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.PATH_NOT_SET, + "The archive path is missing")); + } + + start = archive.getStartPosition(); + if (start < 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The archive start value should be at least 0")); + } + + builder.append("\t\t\r\n"); + builder.append("\t\t\t" + type + "\r\n"); + builder.append("\t\t\t\"" + archivePath + "\"\r\n"); + builder.append("\t\t\t" + start + "\r\n"); + builder.append("\t\t\r\n"); + } + builder.append("\t\r\n"); + 
} } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/LoggingPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/LoggingPanel.java index dd6e4130..b9b1c007 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/LoggingPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/LoggingPanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -39,154 +39,141 @@ *

* This panel contains all components for setting configuration parameters * related to the logging. - * */ @SuppressWarnings("serial") public class LoggingPanel - extends AbstractPanel -{ - - private JLabel diffToolLabel; - private JTextField diffToolField; - private JComboBox diffToolLogLevelComboBox; - - /** - * (Constructor) Creates a new LoggingPanel. - * - * @param controller - * Reference to the controller - */ - public LoggingPanel(final ConfigController controller) - { - - super(controller); - controller.register(PanelKeys.PANEL_LOGGING, this); - - createDiffToolLoggingSettings(); - } - - private void createDiffToolLoggingSettings() - { - diffToolLabel = new JLabel("Logging Root Folder: "); - diffToolLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - diffToolLabel.setBounds(10, 10, 150, 25); - this.add(diffToolLabel); - - diffToolField = new JTextField(); - diffToolField.setBounds(170, 10, 200, 25); - this.add(diffToolField); - - diffToolLogLevelComboBox = new JComboBox<>(); - diffToolLogLevelComboBox.setBounds(390, 10, 100, 25); - - diffToolLogLevelComboBox.addItem(Level.ERROR); - diffToolLogLevelComboBox.addItem(Level.WARN); - diffToolLogLevelComboBox.addItem(Level.INFO); - diffToolLogLevelComboBox.addItem(Level.DEBUG); - diffToolLogLevelComboBox.addItem(Level.TRACE); - - this.add(diffToolLogLevelComboBox); - } - - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() - { - - } - - - /** - * A call of this method should validate the positions of the panels - * components. 
- */ - @Override - public void relocate() - { - - int w = 480, h = 245; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - diffToolLabel.setLocation(x, y); - diffToolField.setLocation(x + 160, y); - diffToolLogLevelComboBox.setLocation(x + 380, y); - - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config - * Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) - { - - Object o = config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); - if (o != null) { - this.diffToolField.setText((String) o); - } - else { - this.diffToolField.setText(""); - } - - o = config - .getConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); - if (o != null) { - this.diffToolLogLevelComboBox.setSelectedItem(o); - } - - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. 
- * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - - builder.append("\t\r\n"); - - // DIFFTOOL - String pathDiffTool = diffToolField.getText(); - if (pathDiffTool.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.PATH_NOT_SET, - "The root folder for all logs and debug" - + " information has not been set.")); - } - if (!pathDiffTool.endsWith(File.separator) - && pathDiffTool.contains(File.separator)) { - pathDiffTool += File.separator; - } - - builder.append("\t\t\"").append(pathDiffTool).append( "\"\r\n"); - builder.append("\t\t\r\n"); - builder.append("\t\t\t").append(diffToolLogLevelComboBox.getSelectedItem() ).append("\r\n"); - builder.append("\t\t\r\n"); - builder.append("\t\r\n"); - } + extends AbstractPanel { + + private JLabel diffToolLabel; + private JTextField diffToolField; + private JComboBox diffToolLogLevelComboBox; + + /** + * (Constructor) Creates a new LoggingPanel. 
+ * + * @param controller Reference to the controller + */ + public LoggingPanel(final ConfigController controller) { + + super(controller); + controller.register(PanelKeys.PANEL_LOGGING, this); + + createDiffToolLoggingSettings(); + } + + private void createDiffToolLoggingSettings() { + diffToolLabel = new JLabel("Logging Root Folder: "); + diffToolLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + diffToolLabel.setBounds(10, 10, 150, 25); + this.add(diffToolLabel); + + diffToolField = new JTextField(); + diffToolField.setBounds(170, 10, 200, 25); + this.add(diffToolField); + + diffToolLogLevelComboBox = new JComboBox<>(); + diffToolLogLevelComboBox.setBounds(390, 10, 100, 25); + + diffToolLogLevelComboBox.addItem(Level.ERROR); + diffToolLogLevelComboBox.addItem(Level.WARN); + diffToolLogLevelComboBox.addItem(Level.INFO); + diffToolLogLevelComboBox.addItem(Level.DEBUG); + diffToolLogLevelComboBox.addItem(Level.TRACE); + + this.add(diffToolLogLevelComboBox); + } + + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the panels + * components. + */ + @Override + public void validate() { + + } + + + /** + * A call of this method should validate the positions of the panels + * components. 
+ */ + @Override + public void relocate() { + + int w = 480, h = 245; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + diffToolLabel.setLocation(x, y); + diffToolField.setLocation(x + 160, y); + diffToolLogLevelComboBox.setLocation(x + 380, y); + + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. + * + * @param config Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) { + + Object o = config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); + if (o != null) { + this.diffToolField.setText((String) o); + } else { + this.diffToolField.setText(""); + } + + o = config + .getConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); + if (o != null) { + this.diffToolLogLevelComboBox.setSelectedItem(o); + } + + } + + /** + * Adds the xml description of the panels content to the StringBuilder. + * Errors which occur during the xml transformation will be added to the + * ConfigVerification. 
+ * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + + builder.append("\t\r\n"); + + // DIFFTOOL + String pathDiffTool = diffToolField.getText(); + if (pathDiffTool.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.PATH_NOT_SET, + "The root folder for all logs and debug" + + " information has not been set.")); + } + if (!pathDiffTool.endsWith(File.separator) + && pathDiffTool.contains(File.separator)) { + pathDiffTool += File.separator; + } + + builder.append("\t\t\"").append(pathDiffTool).append("\"\r\n"); + builder.append("\t\t\r\n"); + builder.append("\t\t\t").append(diffToolLogLevelComboBox.getSelectedItem()).append("\r\n"); + builder.append("\t\t\r\n"); + builder.append("\t\r\n"); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ModePanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ModePanel.java index 43483788..279125d3 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ModePanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ModePanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -35,221 +35,198 @@ *

* This panel contains all components for setting configuration parameters * related to the diff calculation. - * - * - * */ @SuppressWarnings("serial") public class ModePanel - extends AbstractPanel -{ - - private JLabel fullRevisionLabel; - private JTextField fullRevisionField; - - private JLabel minimumCommonSequenceLabel; - private JTextField minimumCommonSequenceField; - - /** - * (Constructor) Creates a new ModePanel. - * - * @param controller - * Reference to the controller - */ - public ModePanel(final ConfigController controller) - { - - super(controller); - controller.register(PanelKeys.PANEL_VALUES, this); - - createFullRevisionSettings(); - createMinimumCommonSequenceSettings(); - } - - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// - - private void createFullRevisionSettings() - { - - fullRevisionLabel = new JLabel( - "Every n-th revision will be a full revision:"); - fullRevisionLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - fullRevisionLabel.setBounds(10, 10, 270, 25); - this.add(fullRevisionLabel); - - fullRevisionField = new JTextField(); - fullRevisionField.setBounds(290, 10, 100, 25); - this.add(fullRevisionField); - } - - private void createMinimumCommonSequenceSettings() - { - - minimumCommonSequenceLabel = new JLabel( - "Min lenght of a common subsequence:"); - minimumCommonSequenceLabel.setBorder(BorderFactory - .createRaisedBevelBorder()); - minimumCommonSequenceLabel.setBounds(10, 50, 270, 25); - this.add(minimumCommonSequenceLabel); - - minimumCommonSequenceField = new JTextField(); - minimumCommonSequenceField.setBounds(290, 50, 100, 25); - this.add(minimumCommonSequenceField); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * 
empty method - */ - @Override - public void validate() - { - - } - - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() - { - - int w = 380, h = 65; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - fullRevisionLabel.setLocation(x, y); - fullRevisionField.setLocation(x + 280, y); - - minimumCommonSequenceLabel.setLocation(x, y + 40); - minimumCommonSequenceField.setLocation(x + 280, y + 40); - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config - * Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) - { - Object o = config - .getConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION); - if (o != null) { - this.fullRevisionField.setText(Integer.toString((Integer) o)); - } - else { - this.fullRevisionField.setText(""); - } - - o = config - .getConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING); - if (o != null) { - this.minimumCommonSequenceField.setText(Integer - .toString((Integer) o)); - } - else { - this.minimumCommonSequenceField.setText(""); - } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. 
- * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - @Override - public void toXML(StringBuilder builder, final ConfigVerification errors) - { - - int minLCS = -1, fullRevCounter = -1; - - // Check the FullRevisionCounter input - String text = this.minimumCommonSequenceField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, "The value for minimum " - + "LongestCommonSubsequence is missing.")); - } - else { - try { - minLCS = Integer.parseInt(text); - if (minLCS < 7) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The value of the minimum " - + " LongestCommonSubsequence has to be" - + " at least 7.")); - } - else if (minLCS < 12) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "A value smaller than 12 for the " - + "minimum LongestCommonSubsequence" - + " is not recommended.")); - } - } - catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for " - + "ArticleProducer TaskLimit")); - } - } - - // Check the FullRevisionCounter input - text = this.fullRevisionField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The value for FullRevision Counter" + " is missing.")); - } - else { - try { - fullRevCounter = Integer.parseInt(text); - if (fullRevCounter < 1) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The FullRevision Counter has to " - + "be at least 1.")); - } - else if (fullRevCounter < 100) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "A FullRevision Counter with a" - + " value smaller than 100 is not" - + " recommended.")); - } - } - catch 
(NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for " - + "ArticleProducer TaskLimit")); - } - } - - builder.append("\t\r\n"); - builder.append("\t\t" + minLCS - + "\r\n"); - builder.append("\t\t" + fullRevCounter - + "\r\n"); - builder.append("\t\r\n"); - } + extends AbstractPanel { + + private JLabel fullRevisionLabel; + private JTextField fullRevisionField; + + private JLabel minimumCommonSequenceLabel; + private JTextField minimumCommonSequenceField; + + /** + * (Constructor) Creates a new ModePanel. + * + * @param controller Reference to the controller + */ + public ModePanel(final ConfigController controller) { + + super(controller); + controller.register(PanelKeys.PANEL_VALUES, this); + + createFullRevisionSettings(); + createMinimumCommonSequenceSettings(); + } + + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// + + private void createFullRevisionSettings() { + + fullRevisionLabel = new JLabel( + "Every n-th revision will be a full revision:"); + fullRevisionLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + fullRevisionLabel.setBounds(10, 10, 270, 25); + this.add(fullRevisionLabel); + + fullRevisionField = new JTextField(); + fullRevisionField.setBounds(290, 10, 100, 25); + this.add(fullRevisionField); + } + + private void createMinimumCommonSequenceSettings() { + + minimumCommonSequenceLabel = new JLabel( + "Min lenght of a common subsequence:"); + minimumCommonSequenceLabel.setBorder(BorderFactory + .createRaisedBevelBorder()); + minimumCommonSequenceLabel.setBounds(10, 50, 270, 25); + this.add(minimumCommonSequenceLabel); + + minimumCommonSequenceField = new JTextField(); + minimumCommonSequenceField.setBounds(290, 50, 100, 25); + this.add(minimumCommonSequenceField); + } + + // 
--------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * empty method + */ + @Override + public void validate() { + + } + + /** + * A call of this method should validate the positions of the panels + * components. + */ + @Override + public void relocate() { + + int w = 380, h = 65; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + fullRevisionLabel.setLocation(x, y); + fullRevisionField.setLocation(x + 280, y); + + minimumCommonSequenceLabel.setLocation(x, y + 40); + minimumCommonSequenceField.setLocation(x + 280, y + 40); + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. + * + * @param config Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) { + Object o = config + .getConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION); + if (o != null) { + this.fullRevisionField.setText(Integer.toString((Integer) o)); + } else { + this.fullRevisionField.setText(""); + } + + o = config + .getConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING); + if (o != null) { + this.minimumCommonSequenceField.setText(Integer + .toString((Integer) o)); + } else { + this.minimumCommonSequenceField.setText(""); + } + } + + /** + * Adds the xml description of the panels content to the StringBuilder. + * Errors which occur during the xml transformation will be added to the + * ConfigVerification. 
+ * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + @Override + public void toXML(StringBuilder builder, final ConfigVerification errors) { + + int minLCS = -1, fullRevCounter = -1; + + // Check the FullRevisionCounter input + String text = this.minimumCommonSequenceField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, "The value for minimum " + + "LongestCommonSubsequence is missing.")); + } else { + try { + minLCS = Integer.parseInt(text); + if (minLCS < 7) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The value of the minimum " + + " LongestCommonSubsequence has to be" + + " at least 7.")); + } else if (minLCS < 12) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "A value smaller than 12 for the " + + "minimum LongestCommonSubsequence" + + " is not recommended.")); + } + } catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for " + + "ArticleProducer TaskLimit")); + } + } + + // Check the FullRevisionCounter input + text = this.fullRevisionField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, + "The value for FullRevision Counter" + " is missing.")); + } else { + try { + fullRevCounter = Integer.parseInt(text); + if (fullRevCounter < 1) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The FullRevision Counter has to " + + "be at least 1.")); + } else if (fullRevCounter < 100) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "A FullRevision Counter with a" + + " value smaller than 100 is not" + + " recommended.")); + } + } catch (NumberFormatException nfe) { + 
errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for " + + "ArticleProducer TaskLimit")); + } + } + + builder.append("\t\r\n"); + builder.append("\t\t" + minLCS + + "\r\n"); + builder.append("\t\t" + fullRevCounter + + "\r\n"); + builder.append("\t\r\n"); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/OutputPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/OutputPanel.java index fd1b7f01..84598750 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/OutputPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/OutputPanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -41,70 +41,62 @@ *

* This panel contains all components for setting configuration parameters * related to the file output. - * - * - * */ @SuppressWarnings("serial") public class OutputPanel - extends AbstractPanel -{ + extends AbstractPanel { - private JLabel outputLabel; - private JTextField outputPathField; + private JLabel outputLabel; + private JTextField outputPathField; - private JCheckBox enableZipEncodingCompression; - private JCheckBox activateDataFileOutput; - private JLabel outputCompression; - private JRadioButton disableOutputCompression; - private JRadioButton enable7ZipOutputCompression; - private JRadioButton enableBZip2OutputCompression; + private JCheckBox enableZipEncodingCompression; + private JCheckBox activateDataFileOutput; + private JLabel outputCompression; + private JRadioButton disableOutputCompression; + private JRadioButton enable7ZipOutputCompression; + private JRadioButton enableBZip2OutputCompression; - private JCheckBox enableMultipleOutputFiles; - private JLabel outputSizeLimitLabel; - private JTextField outputSizeLimitField; + private JCheckBox enableMultipleOutputFiles; + private JLabel outputSizeLimitLabel; + private JTextField outputSizeLimitField; - /** - * (Constructor) Create the OutputPanel object. - * - * @param controller - * Reference to the controller - */ - public OutputPanel(final ConfigController controller) - { + /** + * (Constructor) Create the OutputPanel object. 
+ * + * @param controller Reference to the controller + */ + public OutputPanel(final ConfigController controller) { - super(controller); - controller.register(PanelKeys.PANEL_OUTPUT, this); + super(controller); + controller.register(PanelKeys.PANEL_OUTPUT, this); - createOutputPathSettings(); - createOutputSizeSettings(); - createOutputSettings(); - } + createOutputPathSettings(); + createOutputSizeSettings(); + createOutputSettings(); + } - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// - private void createOutputPathSettings() - { + private void createOutputPathSettings() { - outputLabel = new JLabel("Output Folder: "); - outputLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - outputLabel.setBounds(10, 10, 150, 25); - this.add(outputLabel); + outputLabel = new JLabel("Output Folder: "); + outputLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + outputLabel.setBounds(10, 10, 150, 25); + this.add(outputLabel); - outputPathField = new JTextField(); - outputPathField.setBounds(170, 10, 200, 25); - this.add(outputPathField); - } + outputPathField = new JTextField(); + outputPathField.setBounds(170, 10, 200, 25); + this.add(outputPathField); + } - private void createOutputSettings() - { + private void createOutputSettings() { - enableZipEncodingCompression = new JCheckBox("Activate Zip Encoding"); - enableZipEncodingCompression.setBounds(120, 50, 150, 25); + enableZipEncodingCompression = new JCheckBox("Activate Zip Encoding"); + enableZipEncodingCompression.setBounds(120, 50, 150, 25); - enableZipEncodingCompression.addActionListener(e -> { + enableZipEncodingCompression.addActionListener(e -> { boolean flag = 
!controller.isZipCompressionEnabled(); controller.setEnableZipCompression(flag); @@ -112,17 +104,17 @@ private void createOutputSettings() validate(); }); - this.add(enableZipEncodingCompression); + this.add(enableZipEncodingCompression); - outputCompression = new JLabel("Output Compression:"); - outputCompression.setBounds(120, 85, 250, 25); - this.add(outputCompression); + outputCompression = new JLabel("Output Compression:"); + outputCompression.setBounds(120, 85, 250, 25); + this.add(outputCompression); - disableOutputCompression = new JRadioButton("None"); - disableOutputCompression.setBounds(120, 110, 250, 20); - this.add(disableOutputCompression); + disableOutputCompression = new JRadioButton("None"); + disableOutputCompression.setBounds(120, 110, 250, 20); + this.add(disableOutputCompression); - disableOutputCompression.addActionListener(e -> { + disableOutputCompression.addActionListener(e -> { OutputCompressionEnum oce = controller.getOutputCompression(); if (oce != OutputCompressionEnum.None) { @@ -132,69 +124,68 @@ private void createOutputSettings() validate(); }); - enable7ZipOutputCompression = new JRadioButton("7Zip Compression"); - enable7ZipOutputCompression.setBounds(120, 130, 250, 20); - this.add(enable7ZipOutputCompression); + enable7ZipOutputCompression = new JRadioButton("7Zip Compression"); + enable7ZipOutputCompression.setBounds(120, 130, 250, 20); + this.add(enable7ZipOutputCompression); - enable7ZipOutputCompression.addActionListener(e -> { + enable7ZipOutputCompression.addActionListener(e -> { OutputCompressionEnum oce = controller.getOutputCompression(); if (oce != OutputCompressionEnum.SevenZip) { controller - .setOutputCompression(OutputCompressionEnum.SevenZip); + .setOutputCompression(OutputCompressionEnum.SevenZip); } validate(); }); - enableBZip2OutputCompression = new JRadioButton("BZip2 Compression"); - enableBZip2OutputCompression.setBounds(120, 150, 250, 20); - this.add(enableBZip2OutputCompression); + 
enableBZip2OutputCompression = new JRadioButton("BZip2 Compression"); + enableBZip2OutputCompression.setBounds(120, 150, 250, 20); + this.add(enableBZip2OutputCompression); - enableBZip2OutputCompression.addActionListener(e -> { + enableBZip2OutputCompression.addActionListener(e -> { OutputCompressionEnum oce = controller.getOutputCompression(); if (oce != OutputCompressionEnum.BZip2) { controller - .setOutputCompression(OutputCompressionEnum.BZip2); + .setOutputCompression(OutputCompressionEnum.BZip2); } validate(); }); - activateDataFileOutput = new JCheckBox("DataFile Output"); - activateDataFileOutput.setBounds(120, 50, 170, 25); - activateDataFileOutput.setVisible(true); - activateDataFileOutput.addActionListener(e -> { + activateDataFileOutput = new JCheckBox("DataFile Output"); + activateDataFileOutput.setBounds(120, 50, 170, 25); + activateDataFileOutput.setVisible(true); + activateDataFileOutput.addActionListener(e -> { boolean flag = !controller.isEnableDataFileOutput(); controller.setEnableDataFileOutput(flag); validate(); }); - this.add(activateDataFileOutput); + this.add(activateDataFileOutput); - } + } - private void createOutputSizeSettings() - { + private void createOutputSizeSettings() { - enableMultipleOutputFiles = new JCheckBox( - "Allow multiple output files per consumer"); - enableMultipleOutputFiles.setBounds(10, 200, 250, 25); - this.add(enableMultipleOutputFiles); + enableMultipleOutputFiles = new JCheckBox( + "Allow multiple output files per consumer"); + enableMultipleOutputFiles.setBounds(10, 200, 250, 25); + this.add(enableMultipleOutputFiles); - outputSizeLimitLabel = new JLabel("File Size Limit (in byte): "); - outputSizeLimitLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - outputSizeLimitLabel.setBounds(10, 230, 150, 25); - this.add(outputSizeLimitLabel); + outputSizeLimitLabel = new JLabel("File Size Limit (in byte): "); + outputSizeLimitLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + 
outputSizeLimitLabel.setBounds(10, 230, 150, 25); + this.add(outputSizeLimitLabel); - outputSizeLimitField = new JTextField(); - outputSizeLimitField.setBounds(170, 230, 200, 25); - this.add(outputSizeLimitField); + outputSizeLimitField = new JTextField(); + outputSizeLimitField.setBounds(170, 230, 200, 25); + this.add(outputSizeLimitField); - enableMultipleOutputFiles.addActionListener(e -> { + enableMultipleOutputFiles.addActionListener(e -> { boolean flag = !controller.isMultipleOutputFiles(); controller.setMultipleOutputFiles(flag); @@ -202,279 +193,266 @@ private void createOutputSizeSettings() outputSizeLimitLabel.setEnabled(flag); outputSizeLimitField.setEnabled(flag); }); - } + } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// + /** + * A call of this method should validate the status of the panels + * components. + */ + @Override + public void validate() { - /** - * A call of this method should validate the status of the panels - * components. 
- */ - @Override - public void validate() - { + boolean flagA = !controller.isEnableSQLDatabaseOutput(); + boolean flagB = controller.isMultipleOutputFiles(); - boolean flagA = !controller.isEnableSQLDatabaseOutput(); - boolean flagB = controller.isMultipleOutputFiles(); + OutputCompressionEnum oce = controller.getOutputCompression(); - OutputCompressionEnum oce = controller.getOutputCompression(); + enableZipEncodingCompression.setSelected(controller + .isZipCompressionEnabled()); - enableZipEncodingCompression.setSelected(controller - .isZipCompressionEnabled()); + disableOutputCompression.setSelected(oce == OutputCompressionEnum.None); - disableOutputCompression.setSelected(oce == OutputCompressionEnum.None); - - enableBZip2OutputCompression - .setSelected(oce == OutputCompressionEnum.BZip2); + enableBZip2OutputCompression + .setSelected(oce == OutputCompressionEnum.BZip2); + + activateDataFileOutput.setSelected(controller.isEnableDataFileOutput()); + + outputLabel.setEnabled(flagA); + outputPathField.setEnabled(flagA); + + enableZipEncodingCompression.setEnabled(flagA); + + outputCompression.setEnabled(flagA); + disableOutputCompression.setEnabled(flagA); + + enable7ZipOutputCompression.setEnabled(flagA + && controller.is7ZipEnabled()); + + enable7ZipOutputCompression + .setSelected(oce == OutputCompressionEnum.SevenZip); + + enableBZip2OutputCompression.setEnabled(flagA); + + //Enable multiple output files only for uncompressed output + enableMultipleOutputFiles.setEnabled(flagA && (oce == OutputCompressionEnum.None)); + enableMultipleOutputFiles.setSelected(flagB); + + outputSizeLimitLabel.setEnabled(flagA && flagB && (oce == OutputCompressionEnum.None)); + outputSizeLimitField.setEnabled(flagA && flagB && (oce == OutputCompressionEnum.None)); + + + } + + /** + * A call of this method should validate the positions of the panels + * components. 
+ */ + @Override + public void relocate() { + + int w = 360, h = 245; + + int x = (this.getWidth() - w) / 2; + int y = (this.getHeight() - h) / 2; + + outputLabel.setLocation(x, y); + outputPathField.setLocation(x + 160, y); + + enableZipEncodingCompression.setLocation(x + 110, y + 40); + outputCompression.setLocation(x + 110, y + 75); + disableOutputCompression.setLocation(x + 110, y + 100); + enableBZip2OutputCompression.setLocation(x + 110, y + 120); + enable7ZipOutputCompression.setLocation(x + 110, y + 140); + activateDataFileOutput.setLocation(x + 110, y + 160); + + enableMultipleOutputFiles.setLocation(x, y + 190); + outputSizeLimitLabel.setLocation(x, y + 220); + outputSizeLimitField.setLocation(x + 160, y + 220); + + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. 
+ * + * @param config Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) { + + Object o = config + .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + if (o != null) { + this.outputPathField.setText((String) o); + } else { + this.outputPathField.setText(""); + } + + o = config + .getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); + if (o != null) { + controller.setEnableZipCompression((Boolean) o); + } else { + controller.setEnableZipCompression(false); + } + + o = config + .getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT); + if (o != null) { + controller.setEnableDataFileOutput((Boolean) o); + } else { + controller.setEnableDataFileOutput(false); + } + + o = config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); + if (o != null) { + switch ((OutputType) o) { + case UNCOMPRESSED: + controller.setEnableSQLDatabaseOutput(false); + controller.setOutputCompression(OutputCompressionEnum.None); + + o = config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); + break; + case SEVENZIP: + controller.setEnableSQLDatabaseOutput(false); + controller.setOutputCompression(OutputCompressionEnum.SevenZip); + + o = config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); + break; + case BZIP2: + controller.setEnableSQLDatabaseOutput(false); + controller.setOutputCompression(OutputCompressionEnum.BZip2); + + o = config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); + break; + case DATABASE: + controller.setEnableSQLDatabaseOutput(true); + controller.setOutputCompression(OutputCompressionEnum.None); + + o = null; + break; + } + } + + if (o != null) { + controller.setMultipleOutputFiles(true); + this.outputSizeLimitField.setText(Long.toString((Long) o)); + } else { + controller.setMultipleOutputFiles(false); + this.outputSizeLimitField.setText(""); + } + } + + /** + * Adds the xml description of the panels content to the StringBuilder. 
+ * Errors which occur during the xml transformation will be added to the + * ConfigVerification. + * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + + if (!controller.isEnableSQLDatabaseOutput()) { + + boolean zipComp = controller.isZipCompressionEnabled(); + boolean multiFile = controller.isMultipleOutputFiles(); + + builder.append("\t\r\n"); + builder.append("\t\t"); + + OutputCompressionEnum comp = controller.getOutputCompression(); + switch (comp) { + case None: + builder.append(OutputType.UNCOMPRESSED); + break; + case BZip2: + builder.append(OutputType.BZIP2); + break; + case SevenZip: + builder.append(OutputType.SEVENZIP); + break; + default: + throw new RuntimeException("Illegal Output Compression Mode"); + } + + builder.append("\r\n"); + + String path = this.outputPathField.getText(); + + if (path == null || path.equals("")) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.MISSING_VALUE, + "No output path has been set.")); + } + + if (!path.endsWith(File.separator) && path.contains(File.separator)) { + path += File.separator; + } + + builder.append("\t\t\"" + path + "\"\r\n"); + + if (multiFile) { + + long sizeLimit = -1; + + String text = outputSizeLimitField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, + "The output limit is missing.")); + } else { + try { + sizeLimit = Long.parseLong(text); + if (sizeLimit < 100 * 1024 * 1024) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The output limit has to be at" + + " least 100MB")); + } + } catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for the" + + " output limit")); + } + } + + switch (comp) { + 
case None: + builder.append("\t\t" + sizeLimit + + "\r\n"); + break; + default: + builder.append("\t\t" + sizeLimit + + "\r\n"); + break; + } + } + + builder.append("\t\t" + zipComp + + "\r\n"); + + if (controller.isEnableDataFileOutput()) { + builder.append("\t\ttrue\r\n"); + } else { + builder.append("\t\tfalse\r\n"); + } - activateDataFileOutput.setSelected(controller.isEnableDataFileOutput()); - - outputLabel.setEnabled(flagA); - outputPathField.setEnabled(flagA); - - enableZipEncodingCompression.setEnabled(flagA); - - outputCompression.setEnabled(flagA); - disableOutputCompression.setEnabled(flagA); - - enable7ZipOutputCompression.setEnabled(flagA - && controller.is7ZipEnabled()); - - enable7ZipOutputCompression - .setSelected(oce == OutputCompressionEnum.SevenZip); - - enableBZip2OutputCompression.setEnabled(flagA); - - //Enable multiple output files only for uncompressed output - enableMultipleOutputFiles.setEnabled(flagA&&(oce == OutputCompressionEnum.None)); - enableMultipleOutputFiles.setSelected(flagB); - - outputSizeLimitLabel.setEnabled(flagA && flagB&&(oce == OutputCompressionEnum.None)); - outputSizeLimitField.setEnabled(flagA && flagB&&(oce == OutputCompressionEnum.None)); - - - } - - /** - * A call of this method should validate the positions of the panels - * components. 
- */ - @Override - public void relocate() - { - - int w = 360, h = 245; - - int x = (this.getWidth() - w) / 2; - int y = (this.getHeight() - h) / 2; - - outputLabel.setLocation(x, y); - outputPathField.setLocation(x + 160, y); - - enableZipEncodingCompression.setLocation(x + 110, y + 40); - outputCompression.setLocation(x + 110, y + 75); - disableOutputCompression.setLocation(x + 110, y + 100); - enableBZip2OutputCompression.setLocation(x + 110, y + 120); - enable7ZipOutputCompression.setLocation(x + 110, y + 140); - activateDataFileOutput.setLocation(x + 110, y + 160); - - enableMultipleOutputFiles.setLocation(x, y + 190); - outputSizeLimitLabel.setLocation(x, y + 220); - outputSizeLimitField.setLocation(x + 160, y + 220); - - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. 
- * - * @param config - * Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) - { - - Object o = config - .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - if (o != null) { - this.outputPathField.setText((String) o); - } - else { - this.outputPathField.setText(""); - } - - o = config - .getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); - if (o != null) { - controller.setEnableZipCompression((Boolean) o); - } - else { - controller.setEnableZipCompression(false); - } - - o = config - .getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT); - if (o != null) { - controller.setEnableDataFileOutput((Boolean) o); - } - else { - controller.setEnableDataFileOutput(false); - } - - o = config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); - if (o != null) { - switch ((OutputType) o) { - case UNCOMPRESSED: - controller.setEnableSQLDatabaseOutput(false); - controller.setOutputCompression(OutputCompressionEnum.None); - - o = config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); - break; - case SEVENZIP: - controller.setEnableSQLDatabaseOutput(false); - controller.setOutputCompression(OutputCompressionEnum.SevenZip); - - o = config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); - break; - case BZIP2: - controller.setEnableSQLDatabaseOutput(false); - controller.setOutputCompression(OutputCompressionEnum.BZip2); - - o = config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); - break; - case DATABASE: - controller.setEnableSQLDatabaseOutput(true); - controller.setOutputCompression(OutputCompressionEnum.None); - - o = null; - break; - } - } - - if (o != null) { - controller.setMultipleOutputFiles(true); - this.outputSizeLimitField.setText(Long.toString((Long) o)); - } - else { - controller.setMultipleOutputFiles(false); - this.outputSizeLimitField.setText(""); - } - } - - /** - * Adds the xml description of the panels content to the 
StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - - if (!controller.isEnableSQLDatabaseOutput()) { - - boolean zipComp = controller.isZipCompressionEnabled(); - boolean multiFile = controller.isMultipleOutputFiles(); - - builder.append("\t\r\n"); - builder.append("\t\t"); - - OutputCompressionEnum comp = controller.getOutputCompression(); - switch (comp) { - case None: - builder.append(OutputType.UNCOMPRESSED); - break; - case BZip2: - builder.append(OutputType.BZIP2); - break; - case SevenZip: - builder.append(OutputType.SEVENZIP); - break; - default: - throw new RuntimeException("Illegal Output Compression Mode"); - } - - builder.append("\r\n"); - - String path = this.outputPathField.getText(); - - if(path==null||path.equals("")){ - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.MISSING_VALUE, - "No output path has been set.")); - } - - if (!path.endsWith(File.separator) && path.contains(File.separator)) { - path += File.separator; - } - - builder.append("\t\t\"" + path + "\"\r\n"); - - if (multiFile) { - - long sizeLimit = -1; - - String text = outputSizeLimitField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The output limit is missing.")); - } - else { - try { - sizeLimit = Long.parseLong(text); - if (sizeLimit < 100 * 1024 * 1024) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The output limit has to be at" - + " least 100MB")); - } - } - catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for the" - + " output limit")); - } - } 
- - switch (comp) { - case None: - builder.append("\t\t" + sizeLimit - + "\r\n"); - break; - default: - builder.append("\t\t" + sizeLimit - + "\r\n"); - break; - } - } - - builder.append("\t\t" + zipComp - + "\r\n"); - - if (controller.isEnableDataFileOutput()) { - builder.append("\t\ttrue\r\n"); - }else{ - builder.append("\t\tfalse\r\n"); - } - - builder.append("\t\r\n"); - } - } + builder.append("\t\r\n"); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/SQLPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/SQLPanel.java index b95f8f40..33a35861 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/SQLPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/SQLPanel.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -37,107 +37,99 @@ *

* This panel contains all components for setting configuration parameters * related to the database output. - * - * - * */ @SuppressWarnings("serial") public class SQLPanel - extends AbstractPanel -{ - - /** - * (Constructor) Create the SQLPanel object. - * - * @param controller - * Reference to the controller - */ - public SQLPanel(final ConfigController controller) - { - - super(controller); - controller.register(PanelKeys.PANEL_SQL, this); - - createSQLFields(); - createOutputSettings(); - } - - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// - - private JCheckBox enableSQLDatabaseConnection; - private JLabel sqlHostLabel; - private JTextField sqlHostField; - private JLabel sqlDatabaseLabel; - private JTextField sqlDatabaseField; - private JLabel sqlUserLabel; - private JTextField sqlUserField; - private JLabel sqlPasswordLabel; - private JTextField sqlPasswordField; - - private JCheckBox enableZipEncodingCheckBox; - - private void createSQLFields() - { - - enableSQLDatabaseConnection = new JCheckBox( - "Activate Database Output"); - enableSQLDatabaseConnection.setBounds(10, 10, 200, 25); - - enableSQLDatabaseConnection.addActionListener(e -> { + extends AbstractPanel { + + /** + * (Constructor) Create the SQLPanel object. 
+ * + * @param controller Reference to the controller + */ + public SQLPanel(final ConfigController controller) { + + super(controller); + controller.register(PanelKeys.PANEL_SQL, this); + + createSQLFields(); + createOutputSettings(); + } + + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// + + private JCheckBox enableSQLDatabaseConnection; + private JLabel sqlHostLabel; + private JTextField sqlHostField; + private JLabel sqlDatabaseLabel; + private JTextField sqlDatabaseField; + private JLabel sqlUserLabel; + private JTextField sqlUserField; + private JLabel sqlPasswordLabel; + private JTextField sqlPasswordField; + + private JCheckBox enableZipEncodingCheckBox; + + private void createSQLFields() { + + enableSQLDatabaseConnection = new JCheckBox( + "Activate Database Output"); + enableSQLDatabaseConnection.setBounds(10, 10, 200, 25); + + enableSQLDatabaseConnection.addActionListener(e -> { boolean flag = !controller.isEnableSQLDatabaseOutput(); controller.setEnableSQLDatabaseOutput(flag); validateSQLFields(); }); - this.add(enableSQLDatabaseConnection); + this.add(enableSQLDatabaseConnection); - sqlHostLabel = new JLabel("Host"); - sqlHostLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sqlHostLabel.setBounds(10, 50, 100, 25); - this.add(sqlHostLabel); + sqlHostLabel = new JLabel("Host"); + sqlHostLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sqlHostLabel.setBounds(10, 50, 100, 25); + this.add(sqlHostLabel); - sqlHostField = new JTextField(); - sqlHostField.setBounds(120, 50, 100, 25); - this.add(sqlHostField); + sqlHostField = new JTextField(); + sqlHostField.setBounds(120, 50, 100, 25); + this.add(sqlHostField); - sqlDatabaseLabel = new JLabel("Database"); - sqlDatabaseLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sqlDatabaseLabel.setBounds(10, 50, 100, 25); - 
this.add(sqlDatabaseLabel); + sqlDatabaseLabel = new JLabel("Database"); + sqlDatabaseLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sqlDatabaseLabel.setBounds(10, 50, 100, 25); + this.add(sqlDatabaseLabel); - sqlDatabaseField = new JTextField(); - sqlDatabaseField.setBounds(120, 50, 100, 25); - this.add(sqlDatabaseField); + sqlDatabaseField = new JTextField(); + sqlDatabaseField.setBounds(120, 50, 100, 25); + this.add(sqlDatabaseField); - sqlUserLabel = new JLabel("User"); - sqlUserLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sqlUserLabel.setBounds(10, 80, 100, 25); - this.add(sqlUserLabel); + sqlUserLabel = new JLabel("User"); + sqlUserLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sqlUserLabel.setBounds(10, 80, 100, 25); + this.add(sqlUserLabel); - sqlUserField = new JTextField(); - sqlUserField.setBounds(120, 80, 100, 25); - this.add(sqlUserField); + sqlUserField = new JTextField(); + sqlUserField.setBounds(120, 80, 100, 25); + this.add(sqlUserField); - sqlPasswordLabel = new JLabel("Password"); - sqlPasswordLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sqlPasswordLabel.setBounds(10, 110, 100, 25); - this.add(sqlPasswordLabel); + sqlPasswordLabel = new JLabel("Password"); + sqlPasswordLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sqlPasswordLabel.setBounds(10, 110, 100, 25); + this.add(sqlPasswordLabel); - sqlPasswordField = new JTextField(); - sqlPasswordField.setBounds(120, 110, 100, 25); - this.add(sqlPasswordField); - } + sqlPasswordField = new JTextField(); + sqlPasswordField.setBounds(120, 110, 100, 25); + this.add(sqlPasswordField); + } - private void createOutputSettings() - { + private void createOutputSettings() { - enableZipEncodingCheckBox = new JCheckBox("Activate Zip Encoding"); - enableZipEncodingCheckBox.setBounds(10, 160, 200, 25); + enableZipEncodingCheckBox = new JCheckBox("Activate Zip Encoding"); + enableZipEncodingCheckBox.setBounds(10, 160, 200, 25); - 
enableZipEncodingCheckBox.addActionListener(e -> { + enableZipEncodingCheckBox.addActionListener(e -> { boolean flag = !controller.isZipCompressionEnabled(); controller.setEnableZipCompression(flag); @@ -145,203 +137,189 @@ private void createOutputSettings() validateSettings(); }); - this.add(enableZipEncodingCheckBox); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() - { - validateSQLFields(); - validateSettings(); - } - - /** - * Validates the Settings. - */ - private void validateSettings() - { - enableZipEncodingCheckBox.setSelected(controller - .isZipCompressionEnabled()); - } - - /** - * Validates the UNCOMPRESSED Settings. - */ - private void validateSQLFields() - { - - boolean flag = controller.isEnableSQLDatabaseOutput(); - - enableSQLDatabaseConnection.setSelected(flag); - - sqlHostLabel.setEnabled(flag); - sqlHostField.setEnabled(flag); - sqlDatabaseLabel.setEnabled(flag); - sqlDatabaseField.setEnabled(flag); - sqlUserLabel.setEnabled(flag); - sqlUserField.setEnabled(flag); - sqlPasswordLabel.setEnabled(flag); - sqlPasswordField.setEnabled(flag); - - enableZipEncodingCheckBox.setEnabled(flag); - } - - /** - * A call of this method should validate the positions of the panels - * components. 
- */ - @Override - public void relocate() - { - - int w = 200, h = 235; - - int x = (this.getWidth() - w) / 2; - int y = (this.getHeight() - h) / 2; - - enableSQLDatabaseConnection.setLocation(x, y); - sqlHostLabel.setLocation(x, y + 40); - sqlHostField.setLocation(x + 110, y + 40); - sqlDatabaseLabel.setLocation(x, y + 70); - sqlDatabaseField.setLocation(x + 110, y + 70); - sqlUserLabel.setLocation(x, y + 100); - sqlUserField.setLocation(x + 110, y + 100); - sqlPasswordLabel.setLocation(x, y + 130); - sqlPasswordField.setLocation(x + 110, y + 130); - enableZipEncodingCheckBox.setLocation(x, y + 180); - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config - * Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) - { - - Object o = config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); - if ((OutputType) o == OutputType.DATABASE) { - controller.setEnableSQLDatabaseOutput(true); - } - - o = config.getConfigParameter(ConfigurationKeys.SQL_HOST); - if (o != null) { - this.sqlHostField.setText((String) o); - } - else { - this.sqlHostField.setText(""); - } - - o = config.getConfigParameter(ConfigurationKeys.SQL_DATABASE); - if (o != null) { - this.sqlDatabaseField.setText((String) o); - } - else { - this.sqlDatabaseField.setText(""); - } - - o = config.getConfigParameter(ConfigurationKeys.SQL_USERNAME); - if (o != null) { - this.sqlUserField.setText((String) o); - } - else { - this.sqlUserField.setText(""); - } - - o = config.getConfigParameter(ConfigurationKeys.SQL_PASSWORD); - if (o != null) { - this.sqlPasswordField.setText((String) o); - } - else { - this.sqlPasswordField.setText(""); - } - - o = config - 
.getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); - if (o != null) { - controller.setEnableZipCompression((Boolean) o); - } - else { - controller.setEnableZipCompression(false); - } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder - * Reference to a StringBuilder object - * @param errors - * Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) - { - - if (controller.isEnableSQLDatabaseOutput()) { - - String database, user, password, host; - - host = sqlHostField.getText(); - if (host.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The name of the sqlproducer-host is missing.")); - } - - database = sqlDatabaseField.getText(); - if (database.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The name of the sqlproducer-database is missing.")); - } - - user = sqlUserField.getText(); - if (database.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The name of the sqlproducer-user is missing.")); - } - - password = sqlPasswordField.getText(); - if (password.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The password of the sqlproducer-user is missing.")); - } - - boolean zipComp = controller.isZipCompressionEnabled(); - - builder.append("\t\r\n"); - builder.append("\t\t" + OutputType.DATABASE - + "\r\n"); - builder.append("\t\t\t\r\n"); - builder.append("\t\t\t\t" + host + "\r\n"); - builder.append("\t\t\t\t" + database + "\r\n"); - builder.append("\t\t\t\t" + user + "\r\n"); - builder.append("\t\t\t\t" + password + "\r\n"); - builder.append("\t\t\t\r\n"); - builder.append("\t\t" + zipComp - 
+ "\r\n"); - builder.append("\t\r\n"); - } - } + this.add(enableZipEncodingCheckBox); + } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the panels + * components. + */ + @Override + public void validate() { + validateSQLFields(); + validateSettings(); + } + + /** + * Validates the Settings. + */ + private void validateSettings() { + enableZipEncodingCheckBox.setSelected(controller + .isZipCompressionEnabled()); + } + + /** + * Validates the UNCOMPRESSED Settings. + */ + private void validateSQLFields() { + + boolean flag = controller.isEnableSQLDatabaseOutput(); + + enableSQLDatabaseConnection.setSelected(flag); + + sqlHostLabel.setEnabled(flag); + sqlHostField.setEnabled(flag); + sqlDatabaseLabel.setEnabled(flag); + sqlDatabaseField.setEnabled(flag); + sqlUserLabel.setEnabled(flag); + sqlUserField.setEnabled(flag); + sqlPasswordLabel.setEnabled(flag); + sqlPasswordField.setEnabled(flag); + + enableZipEncodingCheckBox.setEnabled(flag); + } + + /** + * A call of this method should validate the positions of the panels + * components. 
+ */ + @Override + public void relocate() { + + int w = 200, h = 235; + + int x = (this.getWidth() - w) / 2; + int y = (this.getHeight() - h) / 2; + + enableSQLDatabaseConnection.setLocation(x, y); + sqlHostLabel.setLocation(x, y + 40); + sqlHostField.setLocation(x + 110, y + 40); + sqlDatabaseLabel.setLocation(x, y + 70); + sqlDatabaseField.setLocation(x + 110, y + 70); + sqlUserLabel.setLocation(x, y + 100); + sqlUserField.setLocation(x + 110, y + 100); + sqlPasswordLabel.setLocation(x, y + 130); + sqlPasswordField.setLocation(x + 110, y + 130); + enableZipEncodingCheckBox.setLocation(x, y + 180); + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the + * ConfigSettings and and sets the contained values. + * + * @param config Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) { + + Object o = config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); + if ((OutputType) o == OutputType.DATABASE) { + controller.setEnableSQLDatabaseOutput(true); + } + + o = config.getConfigParameter(ConfigurationKeys.SQL_HOST); + if (o != null) { + this.sqlHostField.setText((String) o); + } else { + this.sqlHostField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.SQL_DATABASE); + if (o != null) { + this.sqlDatabaseField.setText((String) o); + } else { + this.sqlDatabaseField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.SQL_USERNAME); + if (o != null) { + this.sqlUserField.setText((String) o); + } else { + this.sqlUserField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.SQL_PASSWORD); + if (o != null) { + this.sqlPasswordField.setText((String) o); + } else { + this.sqlPasswordField.setText(""); + } + + o = config + 
.getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); + if (o != null) { + controller.setEnableZipCompression((Boolean) o); + } else { + controller.setEnableZipCompression(false); + } + } + + /** + * Adds the xml description of the panels content to the StringBuilder. + * Errors which occur during the xml transformation will be added to the + * ConfigVerification. + * + * @param builder Reference to a StringBuilder object + * @param errors Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, + final ConfigVerification errors) { + + if (controller.isEnableSQLDatabaseOutput()) { + + String database, user, password, host; + + host = sqlHostField.getText(); + if (host.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, + "The name of the sqlproducer-host is missing.")); + } + + database = sqlDatabaseField.getText(); + if (database.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, + "The name of the sqlproducer-database is missing.")); + } + + user = sqlUserField.getText(); + if (database.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, + "The name of the sqlproducer-user is missing.")); + } + + password = sqlPasswordField.getText(); + if (password.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.MISSING_VALUE, + "The password of the sqlproducer-user is missing.")); + } + + boolean zipComp = controller.isZipCompressionEnabled(); + + builder.append("\t\r\n"); + builder.append("\t\t" + OutputType.DATABASE + + "\r\n"); + builder.append("\t\t\t\r\n"); + builder.append("\t\t\t\t" + host + "\r\n"); + builder.append("\t\t\t\t" + database + "\r\n"); + builder.append("\t\t\t\t" + user + "\r\n"); + builder.append("\t\t\t\t" + password + "\r\n"); + builder.append("\t\t\t\r\n"); + builder.append("\t\t" + zipComp + + "\r\n"); + 
builder.append("\t\r\n"); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/simpleconfig/SimpleConfig.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/simpleconfig/SimpleConfig.java index 68cf80f4..94dff615 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/simpleconfig/SimpleConfig.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/simpleconfig/SimpleConfig.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,20 +22,19 @@ /** * This class is an alternative to the ConfigGUI and can be used to produce * configuration files for the DiffTool. - * */ -public class SimpleConfig -{ - /** Reference to the ConfigController */ - private final ConfigController controller; +public class SimpleConfig { + /** + * Reference to the ConfigController + */ + private final ConfigController controller; - /** - * (Constructor) Creates a new ConfigGUI object. - */ - public SimpleConfig() - { - this.controller = new ConfigController(); - controller.defaultConfiguration(); - //TODO nothing here yet... - } + /** + * (Constructor) Creates a new ConfigGUI object. 
+ */ + public SimpleConfig() { + this.controller = new ConfigController(); + controller.defaultConfiguration(); + //TODO nothing here yet... + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/ArticleReaderInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/ArticleReaderInterface.java index 1c8da6e9..c5a1d50b 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/ArticleReaderInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/ArticleReaderInterface.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,49 +23,41 @@ /** * This interface represents the link to the input. - * - * - * */ -public interface ArticleReaderInterface -{ +public interface ArticleReaderInterface { - /** - * Determines whether another task is available or not. - *

- * This method has to be called before calling the next() method. - * - * @return TRUE | FALSE - * - * @throws ArticleReaderException - * if the parsing of the input fails - */ - boolean hasNext() - throws ArticleReaderException; + /** + * Determines whether another task is available or not. + *

+ * This method has to be called before calling the next() method. + * + * @return TRUE | FALSE + * @throws ArticleReaderException if the parsing of the input fails + */ + boolean hasNext() + throws ArticleReaderException; - /** - * Returns the next RevisionTask. - * - * @return RevisionTask. - * - * @throws ArticleReaderException - * if the parsing of the input fails - */ - Task next() - throws ArticleReaderException; + /** + * Returns the next RevisionTask. + * + * @return RevisionTask. + * @throws ArticleReaderException if the parsing of the input fails + */ + Task next() + throws ArticleReaderException; - /** - * Resets the task processing status of the ArticleReader. - *

- * This method has to be called if the hasNext() or next() methods throw an - * exception. - */ - void resetTaskCompleted(); + /** + * Resets the task processing status of the ArticleReader. + *

+ * This method has to be called if the hasNext() or next() methods throw an + * exception. + */ + void resetTaskCompleted(); - /** - * Returns the number of bytes that the ArticleReader has processed. - * - * @return number of bytes (current position in the file / archive) - */ - long getBytePosition(); + /** + * Returns the number of bytes that the ArticleReader has processed. + * + * @return number of bytes (current position in the file / archive) + */ + long getBytePosition(); } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/ArticleFilter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/ArticleFilter.java index 550eaf9f..c519c596 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/ArticleFilter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/ArticleFilter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -32,148 +32,134 @@ * The namespaces are read in from the {@code } of the Wikipedia dump. 
The * corresponding prefixes of the language version are then used by the filter to * determine whether an article is part of an unwanted namespace or not.
- * + *

* If the ArticleFilter is not initialized or given an empty list of namespaces, * nothing is filtered at all. - * - * */ -public class ArticleFilter -{ - private Map namespaceMap; - - private Set prefixesToAllow; - - private Set prefixesToReject; - - private final Collection allowedNamespaces; - - private boolean excludeMainNamespace; - - private final int MAIN_NAMESPACE = 0; - - private static ConfigurationManager config; - - static { - try { - config = ConfigurationManager.getInstance(); - } - catch (ConfigurationException e) { - // TODO logger - System.err.print(e); - } - } - - /** - * Creates an ArticleFilter that uses configuration file to filter prefixes - * - * @throws ConfigurationException - */ - @SuppressWarnings("unchecked") - public ArticleFilter() - throws ConfigurationException - { - this((Set) config - .getConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP)); - } - - /** - * Creates a new filter that filters all pages except the namespaces - * provided in the namespaceWhitelist - * - * @param namespaceWhitelist - * list of namespaces that should NOT be filtered - */ - public ArticleFilter(Collection namespaceWhitelist) - { - this.allowedNamespaces = namespaceWhitelist; - - if (!this.allowedNamespaces.contains(MAIN_NAMESPACE)) { - this.excludeMainNamespace = true; - } - - } - - /** - * Initialized the Namespace-Prefix mapping for the current language version - * of Wikipedia. - * - * @param namespaceMap - * mapping of namespace ids to the corresponding article title - * prefixes - */ - public void initializeNamespaces(Map namespaceMap) - { - this.namespaceMap = namespaceMap; - initializePrefixes(); - } - - /** - * Initialize allowed and restricted prefixes - */ - private void initializePrefixes() - { - if (namespaceMap == null) { - // TODO use logger - System.err - .println("Cannot use whitespace filter without initializing the namespace-prefix map for the current Wikipedia language version. 
DISABLING FILTER."); - } - else { - prefixesToAllow = new HashSet<>(); - prefixesToReject = new HashSet<>(); - - for (Entry namespace : namespaceMap.entrySet()) { - if (allowedNamespaces.contains(namespace.getKey())) { - prefixesToAllow.add(namespace.getValue() + ":"); - } - else { - prefixesToReject.add(namespace.getValue() + ":"); - } - } - } - } - - /** - * Filter any pages by title prefixes - * - * @param title - * the page title - * @return true, if the page should be used. false, else - */ - public boolean checkArticle(String title) - { - // if filter isn't initialized, do not filter at all - if (namespaceMap == null || namespaceMap.size() == 0 - || allowedNamespaces == null || allowedNamespaces.size() == 0) { - return true; - } - // else, do filter - else { - - // perform filtering - - // reject restricted titles - for (String str : prefixesToReject) { - if (title.startsWith(str)) { - return false; - } - } - - for (String str : prefixesToAllow) { - // allows allowed prefixes - if (title.startsWith(str)) { - return true; - } - // special case for Main Namespace(Main Namespace has not any - // prefixes) - if (excludeMainNamespace) { - return false; - } - - } - - return true; - } - } +public class ArticleFilter { + private Map namespaceMap; + + private Set prefixesToAllow; + + private Set prefixesToReject; + + private final Collection allowedNamespaces; + + private boolean excludeMainNamespace; + + private final int MAIN_NAMESPACE = 0; + + private static ConfigurationManager config; + + static { + try { + config = ConfigurationManager.getInstance(); + } catch (ConfigurationException e) { + // TODO logger + System.err.print(e); + } + } + + /** + * Creates an ArticleFilter that uses configuration file to filter prefixes + * + * @throws ConfigurationException + */ + @SuppressWarnings("unchecked") + public ArticleFilter() + throws ConfigurationException { + this((Set) config + .getConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP)); + } + + /** + * Creates a new 
filter that filters all pages except the namespaces + * provided in the namespaceWhitelist + * + * @param namespaceWhitelist list of namespaces that should NOT be filtered + */ + public ArticleFilter(Collection namespaceWhitelist) { + this.allowedNamespaces = namespaceWhitelist; + + if (!this.allowedNamespaces.contains(MAIN_NAMESPACE)) { + this.excludeMainNamespace = true; + } + + } + + /** + * Initialized the Namespace-Prefix mapping for the current language version + * of Wikipedia. + * + * @param namespaceMap mapping of namespace ids to the corresponding article title + * prefixes + */ + public void initializeNamespaces(Map namespaceMap) { + this.namespaceMap = namespaceMap; + initializePrefixes(); + } + + /** + * Initialize allowed and restricted prefixes + */ + private void initializePrefixes() { + if (namespaceMap == null) { + // TODO use logger + System.err + .println("Cannot use whitespace filter without initializing the namespace-prefix map for the current Wikipedia language version. DISABLING FILTER."); + } else { + prefixesToAllow = new HashSet<>(); + prefixesToReject = new HashSet<>(); + + for (Entry namespace : namespaceMap.entrySet()) { + if (allowedNamespaces.contains(namespace.getKey())) { + prefixesToAllow.add(namespace.getValue() + ":"); + } else { + prefixesToReject.add(namespace.getValue() + ":"); + } + } + } + } + + /** + * Filter any pages by title prefixes + * + * @param title the page title + * @return true, if the page should be used. 
false, else + */ + public boolean checkArticle(String title) { + // if filter isn't initialized, do not filter at all + if (namespaceMap == null || namespaceMap.size() == 0 + || allowedNamespaces == null || allowedNamespaces.size() == 0) { + return true; + } + // else, do filter + else { + + // perform filtering + + // reject restricted titles + for (String str : prefixesToReject) { + if (title.startsWith(str)) { + return false; + } + } + + for (String str : prefixesToAllow) { + // allows allowed prefixes + if (title.startsWith(str)) { + return true; + } + // special case for Main Namespace(Main Namespace has not any + // prefixes) + if (excludeMainNamespace) { + return false; + } + + } + + return true; + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/InputFactory.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/InputFactory.java index 6e1aff41..a6066d23 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/InputFactory.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/InputFactory.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -37,207 +37,183 @@ * This factory class contains methods to access a input medium. *

* TODO: Add support for alternative commandlines - * - * - * */ -public class InputFactory -{ - - /** Configuration parameter - Path to the 7Zip executable */ - private static String PATH_PROGRAM_7ZIP = null; - - /** Configuration parameter - Charset name of the input data */ - private static String WIKIPEDIA_ENCODING = null; - - private static ConfigurationManager config = null; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private static boolean MODE_STATISTICAL_OUTPUT = false; - - static { - try { - config = ConfigurationManager.getInstance(); - - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - MODE_STATISTICAL_OUTPUT = (Boolean) config - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - - } - catch (ConfigurationException e) { - e.printStackTrace(); - System.exit(-1); - } - } - - /** No object - Utility class */ - private InputFactory() - { - } - - /** - * Starts a decompression process using the 7Zip program. - * - * @param archivePath - * path to the archive - * @return InputStreamReader - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - private static InputStreamReader decompressWith7Zip(final String archivePath) - throws ConfigurationException - { - PATH_PROGRAM_7ZIP = (String) config - .getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); - - if (PATH_PROGRAM_7ZIP == null) { - throw ErrorFactory - .createConfigurationException(ErrorKeys.CONFIGURATION_PARAMETER_UNDEFINED); - } - - try { - Runtime runtime = Runtime.getRuntime(); - Process p = runtime.exec(PATH_PROGRAM_7ZIP + " e " + archivePath + " -so"); - - return new InputStreamReader(p.getInputStream(), WIKIPEDIA_ENCODING); - - } - catch (Exception e) { - throw new RuntimeException(e); - } - } - - /** - * Starts a decompression process using the BZip2 program. 
- * - * @param archivePath - * path to the archive - * @return InputStreamReader - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - private static InputStreamReader decompressWithBZip2( - final String archivePath) - throws ConfigurationException - { - - Bzip2Archiver archiver = new Bzip2Archiver(); - InputStreamReader reader = null; - try { - reader = archiver.getDecompressionStream(archivePath, WIKIPEDIA_ENCODING); - } - catch (IOException e) { - - e.printStackTrace(); - } - - return reader; - } - - /** - * Creates a reader for the xml file. - * - * @param archivePath - * path to the xml file - * @return InputStreamReader - */ - private static InputStreamReader readXMLFile(final String archivePath) - { - - try { - return new InputStreamReader(new BufferedInputStream(new FileInputStream(archivePath)), - WIKIPEDIA_ENCODING); - - } - catch (Exception e) { - throw new RuntimeException(e); - } - } - - /** - * Returns an ArticleReader which reads the specified input file. 
- * - * @param archive - * input file - * @return ArticleReaderInterface - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws ArticleReaderException - * if an error occurred while parsing the file - */ - public static ArticleReaderInterface getTaskReader( - final ArchiveDescription archive) - throws ConfigurationException, ArticleReaderException - { - Reader reader; - - switch (archive.getType()) { - case XML: - reader = readXMLFile(archive.getPath()); - break; - case SEVENZIP: - reader = decompressWith7Zip(archive.getPath()); - break; - case BZIP2: - reader = decompressWithBZip2(archive.getPath()); - break; - default: - throw ErrorFactory - .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE); - } - - if (MODE_STATISTICAL_OUTPUT) { - return new TimedWikipediaXMLReader(reader); - } - return new WikipediaXMLReader(reader); - } - - /** - * Returns an ArticleReader which reads the specified input file. 
- * - * @param archive - * input file - * @param checker - * the article filter - * @return ArticleReaderInterface - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws ArticleReaderException - * if an error occurred while parsing the file - */ - public static ArticleReaderInterface getTaskReader( - final ArchiveDescription archive, final ArticleFilter checker) - throws ConfigurationException, ArticleReaderException - { - Reader reader; - - //TODO add support for (compressed) XMLdumps that are stored in multiple archives - switch (archive.getType()) { - case XML: - reader = readXMLFile(archive.getPath()); - break; - case SEVENZIP: - reader = decompressWith7Zip(archive.getPath()); - break; - case BZIP2: - reader = decompressWithBZip2(archive.getPath()); - break; - default: - throw ErrorFactory - .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE); - } - - if (MODE_STATISTICAL_OUTPUT) { - return new TimedWikipediaXMLReader(reader, checker); - } - return new WikipediaXMLReader(reader, checker); - } +public class InputFactory { + + /** + * Configuration parameter - Path to the 7Zip executable + */ + private static String PATH_PROGRAM_7ZIP = null; + + /** + * Configuration parameter - Charset name of the input data + */ + private static String WIKIPEDIA_ENCODING = null; + + private static ConfigurationManager config = null; + + /** + * Configuration parameter - Flag, that indicates whether the statistical + * output is enabled or not + */ + private static boolean MODE_STATISTICAL_OUTPUT = false; + + static { + try { + config = ConfigurationManager.getInstance(); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + + } catch (ConfigurationException e) { + e.printStackTrace(); + System.exit(-1); + } + } + 
+ /** + * No object - Utility class + */ + private InputFactory() { + } + + /** + * Starts a decompression process using the 7Zip program. + * + * @param archivePath path to the archive + * @return InputStreamReader + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + private static InputStreamReader decompressWith7Zip(final String archivePath) + throws ConfigurationException { + PATH_PROGRAM_7ZIP = (String) config + .getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); + + if (PATH_PROGRAM_7ZIP == null) { + throw ErrorFactory + .createConfigurationException(ErrorKeys.CONFIGURATION_PARAMETER_UNDEFINED); + } + + try { + Runtime runtime = Runtime.getRuntime(); + Process p = runtime.exec(PATH_PROGRAM_7ZIP + " e " + archivePath + " -so"); + + return new InputStreamReader(p.getInputStream(), WIKIPEDIA_ENCODING); + + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Starts a decompression process using the BZip2 program. + * + * @param archivePath path to the archive + * @return InputStreamReader + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + private static InputStreamReader decompressWithBZip2( + final String archivePath) + throws ConfigurationException { + + Bzip2Archiver archiver = new Bzip2Archiver(); + InputStreamReader reader = null; + try { + reader = archiver.getDecompressionStream(archivePath, WIKIPEDIA_ENCODING); + } catch (IOException e) { + + e.printStackTrace(); + } + + return reader; + } + + /** + * Creates a reader for the xml file. 
+ * + * @param archivePath path to the xml file + * @return InputStreamReader + */ + private static InputStreamReader readXMLFile(final String archivePath) { + + try { + return new InputStreamReader(new BufferedInputStream(new FileInputStream(archivePath)), + WIKIPEDIA_ENCODING); + + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Returns an ArticleReader which reads the specified input file. + * + * @param archive input file + * @return ArticleReaderInterface + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws ArticleReaderException if an error occurred while parsing the file + */ + public static ArticleReaderInterface getTaskReader( + final ArchiveDescription archive) + throws ConfigurationException, ArticleReaderException { + Reader reader; + + switch (archive.getType()) { + case XML: + reader = readXMLFile(archive.getPath()); + break; + case SEVENZIP: + reader = decompressWith7Zip(archive.getPath()); + break; + case BZIP2: + reader = decompressWithBZip2(archive.getPath()); + break; + default: + throw ErrorFactory + .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE); + } + + if (MODE_STATISTICAL_OUTPUT) { + return new TimedWikipediaXMLReader(reader); + } + return new WikipediaXMLReader(reader); + } + + /** + * Returns an ArticleReader which reads the specified input file. 
+ * + * @param archive input file + * @param checker the article filter + * @return ArticleReaderInterface + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws ArticleReaderException if an error occurred while parsing the file + */ + public static ArticleReaderInterface getTaskReader( + final ArchiveDescription archive, final ArticleFilter checker) + throws ConfigurationException, ArticleReaderException { + Reader reader; + + //TODO add support for (compressed) XMLdumps that are stored in multiple archives + switch (archive.getType()) { + case XML: + reader = readXMLFile(archive.getPath()); + break; + case SEVENZIP: + reader = decompressWith7Zip(archive.getPath()); + break; + case BZIP2: + reader = decompressWithBZip2(archive.getPath()); + break; + default: + throw ErrorFactory + .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE); + } + + if (MODE_STATISTICAL_OUTPUT) { + return new TimedWikipediaXMLReader(reader, checker); + } + return new WikipediaXMLReader(reader, checker); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/TimedWikipediaXMLReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/TimedWikipediaXMLReader.java index 8000a969..a15b0e0a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/TimedWikipediaXMLReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/TimedWikipediaXMLReader.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,190 +30,172 @@ /** * This version of the WikipediaXMLReader collects statistical information * when it is running. Besides that, it does the same as WikipediaXMLReader. - * */ public class TimedWikipediaXMLReader - extends WikipediaXMLReader -{ - - /** Temporary variable - start position of the article */ - private long taskStartPosition; - - /** Temporary variable - time the parsing of the article started */ - private long startTime; - - /** Temporary variable - time needed to parse the article */ - private long processingTimeRead; - - /** Temporary variable - number of parsed revisions */ - private int readRevisionCounter; - - /** Temporary variable - The time the task entered the system */ - private long enteringTime; - - /** - * Temporary variable - Flag which indicates that the last task was - * completed - */ - private boolean lastTaskCompleted; - - /** - * (Constructor) Creates a new WikipediaXMLReader. - * - * @param input - * Reference to the reader - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - public TimedWikipediaXMLReader(final Reader input) - throws ConfigurationException - { - - super(input); - this.lastTaskCompleted = true; - } - - /** - * (Constructor) Creates a new TimedWikipediaXMLReader. 
- * - * @param input - * Reference to the reader - * @param articleNameChecker - * Reference to a name checker - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - public TimedWikipediaXMLReader(final Reader input, - final ArticleFilter articleNameChecker) - throws ConfigurationException - { - - super(input, articleNameChecker); - } - - /** - * Reads the header of an article. - * - * @return FALSE if the article was not accepted by the articleNameChecker - * TRUE if no name checker was used, or if the articleNameChecker - * accepted the ArticleName - * - * @throws IOException - * if an error occurs while reading from the input - * @throws ArticleReaderException - * if an error occurs while parsing the input - */ - @Override - protected boolean readHeader() - throws IOException, ArticleReaderException - { - this.enteringTime = startTime; - return super.readHeader(); - } - - /** - * Reads a single revision from an article. - * - * @return Revision - * - * @throws IOException - * if an error occurs while reading from the input - * @throws ArticleReaderException - * if an error occurs while parsing the input - */ - @Override - protected Revision readRevision() - throws IOException, ArticleReaderException - { - - Revision rev = super.readRevision(); - this.readRevisionCounter++; - return rev; - } - - /** - * Determines whether another task is available or not. - *

- * This method has to be called before calling the next() method. - * - * @return TRUE | FALSE - * - * @throws ArticleReaderException - * if the parsing of the input fails - */ - @Override - public boolean hasNext() - throws ArticleReaderException - { - - if (super.hasNext()) { - - if (lastTaskCompleted) { - this.taskStartPosition = this.getBytePosition(); - this.processingTimeRead = 0; - this.readRevisionCounter = 0; - this.lastTaskCompleted = false; - } - - return true; - } - return false; - } - - /** - * Returns the next RevisionTask. - * - * @return RevisionTask. - * - * @throws ArticleReaderException - * if the parsing of the input fails - */ - @Override - public Task next() - throws ArticleReaderException - { - this.startTime = System.currentTimeMillis(); - - Task task = super.next(); - - processingTimeRead += System.currentTimeMillis() - startTime; - - if (task != null) { - if (task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST - || task.getTaskType() == TaskTypes.TASK_FULL) { - - lastTaskCompleted = true; - - ArticleInformation info = task.getHeader(); - info.setEnteringTime(enteringTime); - info.setOriginalSize(this.getBytePosition() - taskStartPosition); - info.setProcessingTimeRead(processingTimeRead); - info.setReadRevisionCounter(readRevisionCounter); - - } - else { - lastTaskCompleted = false; - } - } - else { - lastTaskCompleted = true; - } - - return task; - } - - /** - * Resets the task processing status of the ArticleReader. - *

- * This method has to be called if the hasNext() or next() methods throw an - * exception. - */ - @Override - public void resetTaskCompleted() - { - lastTaskCompleted = true; - super.resetTaskCompleted(); - } + extends WikipediaXMLReader { + + /** + * Temporary variable - start position of the article + */ + private long taskStartPosition; + + /** + * Temporary variable - time the parsing of the article started + */ + private long startTime; + + /** + * Temporary variable - time needed to parse the article + */ + private long processingTimeRead; + + /** + * Temporary variable - number of parsed revisions + */ + private int readRevisionCounter; + + /** + * Temporary variable - The time the task entered the system + */ + private long enteringTime; + + /** + * Temporary variable - Flag which indicates that the last task was + * completed + */ + private boolean lastTaskCompleted; + + /** + * (Constructor) Creates a new WikipediaXMLReader. + * + * @param input Reference to the reader + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + public TimedWikipediaXMLReader(final Reader input) + throws ConfigurationException { + + super(input); + this.lastTaskCompleted = true; + } + + /** + * (Constructor) Creates a new TimedWikipediaXMLReader. + * + * @param input Reference to the reader + * @param articleNameChecker Reference to a name checker + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + public TimedWikipediaXMLReader(final Reader input, + final ArticleFilter articleNameChecker) + throws ConfigurationException { + + super(input, articleNameChecker); + } + + /** + * Reads the header of an article. 
+ * + * @return FALSE if the article was not accepted by the articleNameChecker + * TRUE if no name checker was used, or if the articleNameChecker + * accepted the ArticleName + * @throws IOException if an error occurs while reading from the input + * @throws ArticleReaderException if an error occurs while parsing the input + */ + @Override + protected boolean readHeader() + throws IOException, ArticleReaderException { + this.enteringTime = startTime; + return super.readHeader(); + } + + /** + * Reads a single revision from an article. + * + * @return Revision + * @throws IOException if an error occurs while reading from the input + * @throws ArticleReaderException if an error occurs while parsing the input + */ + @Override + protected Revision readRevision() + throws IOException, ArticleReaderException { + + Revision rev = super.readRevision(); + this.readRevisionCounter++; + return rev; + } + + /** + * Determines whether another task is available or not. + *

+ * This method has to be called before calling the next() method. + * + * @return TRUE | FALSE + * @throws ArticleReaderException if the parsing of the input fails + */ + @Override + public boolean hasNext() + throws ArticleReaderException { + + if (super.hasNext()) { + + if (lastTaskCompleted) { + this.taskStartPosition = this.getBytePosition(); + this.processingTimeRead = 0; + this.readRevisionCounter = 0; + this.lastTaskCompleted = false; + } + + return true; + } + return false; + } + + /** + * Returns the next RevisionTask. + * + * @return RevisionTask. + * @throws ArticleReaderException if the parsing of the input fails + */ + @Override + public Task next() + throws ArticleReaderException { + this.startTime = System.currentTimeMillis(); + + Task task = super.next(); + + processingTimeRead += System.currentTimeMillis() - startTime; + + if (task != null) { + if (task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST + || task.getTaskType() == TaskTypes.TASK_FULL) { + + lastTaskCompleted = true; + + ArticleInformation info = task.getHeader(); + info.setEnteringTime(enteringTime); + info.setOriginalSize(this.getBytePosition() - taskStartPosition); + info.setProcessingTimeRead(processingTimeRead); + info.setReadRevisionCounter(readRevisionCounter); + + } else { + lastTaskCompleted = false; + } + } else { + lastTaskCompleted = true; + } + + return task; + } + + /** + * Resets the task processing status of the ArticleReader. + *

+ * This method has to be called if the hasNext() or next() methods throw an + * exception. + */ + @Override + public void resetTaskCompleted() { + lastTaskCompleted = true; + super.resetTaskCompleted(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java index 20d6974d..1adaa2e8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -50,692 +50,663 @@ /** * This class parses the wikipedia xml format. 
- * - * - * */ public class WikipediaXMLReader - implements ArticleReaderInterface -{ - - /** Reference to the reader */ - private Reader input; - - /** Current position in the xml content */ - private long bytePosition; - - /** Reference to the xml keyword tree */ - private SingleKeywordTree keywords; - - /** Configuration parameter - Maximum size of a revision task */ - private final long LIMIT_TASK_SIZE_REVISIONS; - - /** Reference to the article filter */ - private ArticleFilter articleFilter; - - /** - * (Constructor) Creates a new WikipediaXMLReader. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - private WikipediaXMLReader() - throws ConfigurationException - { - - this.bytePosition = 0; - - this.taskHeader = null; - this.lastTaskCompleted = true; - - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_TASK_SIZE_REVISIONS = (Long) config - .getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS); - - initXMLKeys(); - - } - - /** - * (Constructor) Creates a new WikipediaXMLReader. - * - * @param input - * Reference to the reader - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - public WikipediaXMLReader(final Reader input) - throws ConfigurationException - { - - this(); - this.articleFilter = null; - this.input = input; - initNamespaces(); - } - - /** - * (Constructor) Creates a new WikipediaXMLReader. - * - * @param input - * Reference to the reader - * @param articleNameChecker - * Reference to a name checker - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - public WikipediaXMLReader(final Reader input, - final ArticleFilter articleNameChecker) - throws ConfigurationException - { - - this(); - this.articleFilter = articleNameChecker; - this.input = input; - initNamespaces(); - - } - - /** - * Creates and initializes the xml keyword tree. 
- */ - private void initXMLKeys() - { - this.keywords = new SingleKeywordTree<>(); - - keywords.addKeyword(WikipediaXMLKeys.KEY_START_PAGE.getKeyword(), - WikipediaXMLKeys.KEY_START_PAGE); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_PAGE.getKeyword(), - WikipediaXMLKeys.KEY_END_PAGE); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_TITLE.getKeyword(), - WikipediaXMLKeys.KEY_START_TITLE); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_TITLE.getKeyword(), - WikipediaXMLKeys.KEY_END_TITLE); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_ID.getKeyword(), - WikipediaXMLKeys.KEY_START_ID); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_ID.getKeyword(), - WikipediaXMLKeys.KEY_END_ID); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_REVISION.getKeyword(), - WikipediaXMLKeys.KEY_START_REVISION); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_REVISION.getKeyword(), - WikipediaXMLKeys.KEY_END_REVISION); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword(), - WikipediaXMLKeys.KEY_START_TIMESTAMP); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword(), - WikipediaXMLKeys.KEY_END_TIMESTAMP); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_TEXT.getKeyword(), - WikipediaXMLKeys.KEY_START_TEXT); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_TEXT.getKeyword(), - WikipediaXMLKeys.KEY_END_TEXT); - keywords.addKeyword(WikipediaXMLKeys.KEY_MINOR_FLAG.getKeyword(), - WikipediaXMLKeys.KEY_MINOR_FLAG); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_COMMENT.getKeyword(), - WikipediaXMLKeys.KEY_START_COMMENT); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_COMMENT.getKeyword(), - WikipediaXMLKeys.KEY_END_COMMENT); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_IP.getKeyword(), - WikipediaXMLKeys.KEY_START_IP); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_IP.getKeyword(), - WikipediaXMLKeys.KEY_END_IP); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_USERNAME.getKeyword(), - WikipediaXMLKeys.KEY_START_USERNAME); - 
keywords.addKeyword(WikipediaXMLKeys.KEY_END_USERNAME.getKeyword(), - WikipediaXMLKeys.KEY_END_USERNAME); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_CONTRIBUTOR.getKeyword(), - WikipediaXMLKeys.KEY_START_CONTRIBUTOR); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword(), - WikipediaXMLKeys.KEY_END_CONTRIBUTOR); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_NAMESPACES.getKeyword(), - WikipediaXMLKeys.KEY_START_NAMESPACES); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_NAMESPACES.getKeyword(), - WikipediaXMLKeys.KEY_END_NAMESPACES); - } - - /** - * Reads the namespaces from the siteinfo section and processes them - * in order to initialize the ArticleFilter - */ - private void initNamespaces(){ - Map namespaceMap = new HashMap<>(); - try{ - int b = read(); - - this.keywords.reset(); - StringBuilder buffer = null; - - while (b != -1) { + implements ArticleReaderInterface { + + /** + * Reference to the reader + */ + private Reader input; + + /** + * Current position in the xml content + */ + private long bytePosition; + + /** + * Reference to the xml keyword tree + */ + private SingleKeywordTree keywords; + + /** + * Configuration parameter - Maximum size of a revision task + */ + private final long LIMIT_TASK_SIZE_REVISIONS; + + /** + * Reference to the article filter + */ + private ArticleFilter articleFilter; + + /** + * (Constructor) Creates a new WikipediaXMLReader. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + private WikipediaXMLReader() + throws ConfigurationException { + + this.bytePosition = 0; + + this.taskHeader = null; + this.lastTaskCompleted = true; + + ConfigurationManager config = ConfigurationManager.getInstance(); + + LIMIT_TASK_SIZE_REVISIONS = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS); + + initXMLKeys(); + + } + + /** + * (Constructor) Creates a new WikipediaXMLReader. 
+ * + * @param input Reference to the reader + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + public WikipediaXMLReader(final Reader input) + throws ConfigurationException { + + this(); + this.articleFilter = null; + this.input = input; + initNamespaces(); + } + + /** + * (Constructor) Creates a new WikipediaXMLReader. + * + * @param input Reference to the reader + * @param articleNameChecker Reference to a name checker + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + public WikipediaXMLReader(final Reader input, + final ArticleFilter articleNameChecker) + throws ConfigurationException { + + this(); + this.articleFilter = articleNameChecker; + this.input = input; + initNamespaces(); + + } + + /** + * Creates and initializes the xml keyword tree. + */ + private void initXMLKeys() { + this.keywords = new SingleKeywordTree<>(); + + keywords.addKeyword(WikipediaXMLKeys.KEY_START_PAGE.getKeyword(), + WikipediaXMLKeys.KEY_START_PAGE); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_PAGE.getKeyword(), + WikipediaXMLKeys.KEY_END_PAGE); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_TITLE.getKeyword(), + WikipediaXMLKeys.KEY_START_TITLE); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_TITLE.getKeyword(), + WikipediaXMLKeys.KEY_END_TITLE); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_ID.getKeyword(), + WikipediaXMLKeys.KEY_START_ID); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_ID.getKeyword(), + WikipediaXMLKeys.KEY_END_ID); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_REVISION.getKeyword(), + WikipediaXMLKeys.KEY_START_REVISION); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_REVISION.getKeyword(), + WikipediaXMLKeys.KEY_END_REVISION); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword(), + WikipediaXMLKeys.KEY_START_TIMESTAMP); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword(), + WikipediaXMLKeys.KEY_END_TIMESTAMP); + 
keywords.addKeyword(WikipediaXMLKeys.KEY_START_TEXT.getKeyword(), + WikipediaXMLKeys.KEY_START_TEXT); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_TEXT.getKeyword(), + WikipediaXMLKeys.KEY_END_TEXT); + keywords.addKeyword(WikipediaXMLKeys.KEY_MINOR_FLAG.getKeyword(), + WikipediaXMLKeys.KEY_MINOR_FLAG); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_COMMENT.getKeyword(), + WikipediaXMLKeys.KEY_START_COMMENT); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_COMMENT.getKeyword(), + WikipediaXMLKeys.KEY_END_COMMENT); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_IP.getKeyword(), + WikipediaXMLKeys.KEY_START_IP); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_IP.getKeyword(), + WikipediaXMLKeys.KEY_END_IP); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_USERNAME.getKeyword(), + WikipediaXMLKeys.KEY_START_USERNAME); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_USERNAME.getKeyword(), + WikipediaXMLKeys.KEY_END_USERNAME); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_CONTRIBUTOR.getKeyword(), + WikipediaXMLKeys.KEY_START_CONTRIBUTOR); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword(), + WikipediaXMLKeys.KEY_END_CONTRIBUTOR); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_NAMESPACES.getKeyword(), + WikipediaXMLKeys.KEY_START_NAMESPACES); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_NAMESPACES.getKeyword(), + WikipediaXMLKeys.KEY_END_NAMESPACES); + } + + /** + * Reads the namespaces from the siteinfo section and processes them + * in order to initialize the ArticleFilter + */ + private void initNamespaces() { + Map namespaceMap = new HashMap<>(); + try { + int b = read(); + + this.keywords.reset(); + StringBuilder buffer = null; + + while (b != -1) { // System.out.print((char)b); - if (buffer != null) { - buffer.append((char) b); - } - - if (this.keywords.check((char) b)) { - switch (this.keywords.getValue()) { - - case KEY_START_NAMESPACES: - buffer = new StringBuilder(WikipediaXMLKeys.KEY_START_NAMESPACES.getKeyword()); - break; - - 
case KEY_END_NAMESPACES: - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setIgnoringElementContentWhitespace(true); - Document namespaces = factory.newDocumentBuilder().parse(new InputSource(new StringReader(buffer.toString()))); - - - NodeList nsList = namespaces.getChildNodes().item(0).getChildNodes(); - - for (int i = 0; i < nsList.getLength(); i++) { - Node curNamespace = nsList.item(i); - - //get the prefix for the current namespace - String prefix = curNamespace.getTextContent().trim(); - if(!prefix.isEmpty()){ - NamedNodeMap nsAttributes = curNamespace.getAttributes(); - String namespace = nsAttributes.getNamedItem("key").getTextContent(); - namespaceMap.put(Integer.parseInt(namespace), prefix); - } - } - - articleFilter.initializeNamespaces(namespaceMap); - return; //init done - - } - - this.keywords.reset(); - } - - b=read(); - } - }catch(IOException e){ - System.err.println("Error reading namespaces from xml dump."); - }catch(ParserConfigurationException | SAXException e){ - System.err.println("Error parsing namespace data."); - } - } - - /** - * Reads a single byte - * - * @return integer value of the byte or -1 if the end of the stream was - * reached - * - * @throws IOException - * if an error occurs while reading the input - */ - private int read() - throws IOException - { - this.bytePosition++; - return input.read(); - } - - /** Temporary variable - reference to the article information */ - private ArticleInformation taskHeader; - - /** - * Temporary variable - Flag which indicates that the last task was - * completed - */ - private boolean lastTaskCompleted; - - /** - * Temporary variable - Task part counter - */ - private int taskPartCounter; - - /** - * Temporary variable - Task revision counter - */ - private int taskRevisionCounter; - - /** - * Determines whether another task is available or not. - * - * This method has to be called before calling the next() method. 
- * - * @return TRUE | FALSE - * - * @throws ArticleReaderException - * if the parsing of the input fails - */ - public boolean hasNext() - throws ArticleReaderException - { - - try { - if (!this.lastTaskCompleted) { - return true; - } - - this.keywords.reset(); - - int b = read(); - while (b != -1) { - - if (keywords.check((char) b)) { - switch (keywords.getValue()) { - case KEY_START_PAGE: - // taskStartPosition = bytePosition; - return true; - } - keywords.reset(); - } - - b = read(); - } - - return false; - - } - catch (Exception e) { - throw new ArticleReaderException(e); - } - } - - /** - * Reads the header of an article. - * - * @return FALSE if the article was not accepted by the articleFilter - * TRUE if no name checker was used, or if the articleFilter - * accepted the ArticleName - * - * @throws IOException - * if an error occurs while reading from the input - * @throws ArticleReaderException - * if an error occurs while parsing the input - */ - protected boolean readHeader() - throws IOException, ArticleReaderException - { - - this.taskHeader = new ArticleInformation(); - - int size, r = read(); - StringBuilder buffer = null; - - while (r != -1) { - - if (buffer != null) { - buffer.append((char) r); - } - - if (this.keywords.check((char) r)) { - switch (this.keywords.getValue()) { - - case KEY_START_TITLE: - case KEY_START_ID: - buffer = new StringBuilder(); - break; - - case KEY_END_TITLE: - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_TITLE.getKeyword() - .length(), size); - - this.taskHeader.setArticleName(buffer.toString()); - if (this.articleFilter != null) { - if (!this.articleFilter - .checkArticle(this.taskHeader.getArticleName())) { - return false; - } - } - - buffer = null; - break; - - case KEY_END_ID: - size = buffer.length(); - buffer.delete( - size - - WikipediaXMLKeys.KEY_END_ID.getKeyword() - .length(), size); - - this.taskHeader.setArticleId(Integer.parseInt(buffer - .toString())); - buffer = null; - break; 
- - case KEY_START_REVISION: - this.keywords.reset(); - return true; - - default: - throw ErrorFactory - .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD); - } - - this.keywords.reset(); - } - - r = read(); - } - - throw ErrorFactory - .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); - } - - /** - * Reads a single revision from an article. - * - * @return Revision - * - * @throws IOException - * if an error occurs while reading from the input - * @throws ArticleReaderException - * if an error occurs while parsing the input - */ - protected Revision readRevision() - throws IOException, ArticleReaderException - { - - this.taskRevisionCounter++; - Revision revision = new Revision(this.taskRevisionCounter); - - int size, r = read(); - boolean hasId = false; - - StringBuilder buffer = null; - this.keywords.reset(); - - while (r != -1) { - - if (buffer != null) { - buffer.append((char) r); - } - - if (this.keywords.check((char) r)) { - switch (this.keywords.getValue()) { - - case KEY_START_TEXT: - - case KEY_START_TIMESTAMP: - - case KEY_START_COMMENT: - - case KEY_START_CONTRIBUTOR: - buffer = new StringBuilder(); - break; - - case KEY_START_ID: - if (!hasId) { - buffer = new StringBuilder(); - } - break; - - case KEY_END_ID: - if (!hasId) { - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_ID.getKeyword() - .length(), size); - - revision.setRevisionID(Integer.parseInt(buffer - .toString())); - buffer = null; - - hasId = true; - } - break; - - case KEY_END_TIMESTAMP: - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() - .length(), size); - - revision.setTimeStamp(buffer.toString()); - buffer = null; - break; - - case KEY_END_TEXT: - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_TEXT.getKeyword() - .length(), size); - - 
revision.setRevisionText(buffer.toString()); - buffer = null; - break; - - case KEY_END_COMMENT: - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_COMMENT.getKeyword() - .length(), size); - //escape comment string - revision.setComment(SQLEscape.escape(buffer.toString())); - buffer = null; - break; - - case KEY_END_CONTRIBUTOR: - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword() - .length(), size); - //escape id string - readContributor(revision, buffer.toString()); - buffer = null; - break; - - case KEY_MINOR_FLAG: - revision.setMinor(true); - buffer = null; - break; - - case KEY_END_REVISION: - this.keywords.reset(); - return revision; - - //the following cases are handeled in readContributor() - //they can be skipped here - case KEY_START_IP: - case KEY_END_IP: - case KEY_START_USERNAME: - case KEY_END_USERNAME: - break; - - default: - System.out.println(keywords.getValue()); - throw ErrorFactory - .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD); - } - - this.keywords.reset(); - } - - r = read(); - } - - throw ErrorFactory - .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); - } - - /** - * Parses the content within the contributor tags and adds the - * parsed info to the provided revision object. 
- * - * @param rev the revision object to store the parsed info in - * @param str the contributor data to be parsed - * @throws IOException - * @throws ArticleReaderException - */ - protected void readContributor(Revision rev, String str) throws IOException, ArticleReaderException - { - char[] contrChars = str.toCharArray(); - int size; - - StringBuilder buffer = null; - this.keywords.reset(); - - for(char curChar:contrChars){ - - if (buffer != null) { - buffer.append(curChar); - } - - if (this.keywords.check(curChar)) { - - switch (this.keywords.getValue()) { - - case KEY_START_ID: - case KEY_START_IP: - case KEY_START_USERNAME: - buffer = new StringBuilder(); - break; - - case KEY_END_IP: - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_IP.getKeyword() - .length(), size); - // escape id string - rev.setContributorName(SQLEscape.escape(buffer.toString())); - rev.setContributorIsRegistered(false); - buffer = null; - break; - - case KEY_END_USERNAME: - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_USERNAME.getKeyword() - .length(), size); - // escape id string - rev.setContributorName(SQLEscape.escape(buffer.toString())); - rev.setContributorIsRegistered(true); - buffer = null; - break; - - case KEY_END_ID: - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_ID.getKeyword() - .length(), size); - String id = buffer.toString(); - if(!id.isEmpty()){ - rev.setContributorId(Integer.parseInt(buffer.toString())); - } - buffer = null; - break; - } - } - } - } - - /** - * Returns the next RevisionTask. - * - * @return RevisionTask. 
- * - * @throws ArticleReaderException - * if the parsing of the input fails - */ - public Task next() - throws ArticleReaderException - { - - try { - this.keywords.reset(); - - // if new article read header, otherwise use old one - if (this.lastTaskCompleted) { - this.lastTaskCompleted = false; - - this.taskPartCounter = 1; - this.taskRevisionCounter = -1; - - if (!readHeader()) { - - this.lastTaskCompleted = true; - return null; - - } - } - else { - this.taskPartCounter++; - } - - Task task = new Task<>(this.taskHeader, + if (buffer != null) { + buffer.append((char) b); + } + + if (this.keywords.check((char) b)) { + switch (this.keywords.getValue()) { + + case KEY_START_NAMESPACES: + buffer = new StringBuilder(WikipediaXMLKeys.KEY_START_NAMESPACES.getKeyword()); + break; + + case KEY_END_NAMESPACES: + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setIgnoringElementContentWhitespace(true); + Document namespaces = factory.newDocumentBuilder().parse(new InputSource(new StringReader(buffer.toString()))); + + + NodeList nsList = namespaces.getChildNodes().item(0).getChildNodes(); + + for (int i = 0; i < nsList.getLength(); i++) { + Node curNamespace = nsList.item(i); + + //get the prefix for the current namespace + String prefix = curNamespace.getTextContent().trim(); + if (!prefix.isEmpty()) { + NamedNodeMap nsAttributes = curNamespace.getAttributes(); + String namespace = nsAttributes.getNamedItem("key").getTextContent(); + namespaceMap.put(Integer.parseInt(namespace), prefix); + } + } + + articleFilter.initializeNamespaces(namespaceMap); + return; //init done + + } + + this.keywords.reset(); + } + + b = read(); + } + } catch (IOException e) { + System.err.println("Error reading namespaces from xml dump."); + } catch (ParserConfigurationException | SAXException e) { + System.err.println("Error parsing namespace data."); + } + } + + /** + * Reads a single byte + * + * @return integer value of the byte or -1 if the end of the stream 
was + * reached + * @throws IOException if an error occurs while reading the input + */ + private int read() + throws IOException { + this.bytePosition++; + return input.read(); + } + + /** + * Temporary variable - reference to the article information + */ + private ArticleInformation taskHeader; + + /** + * Temporary variable - Flag which indicates that the last task was + * completed + */ + private boolean lastTaskCompleted; + + /** + * Temporary variable - Task part counter + */ + private int taskPartCounter; + + /** + * Temporary variable - Task revision counter + */ + private int taskRevisionCounter; + + /** + * Determines whether another task is available or not. + *

+ * This method has to be called before calling the next() method. + * + * @return TRUE | FALSE + * @throws ArticleReaderException if the parsing of the input fails + */ + public boolean hasNext() + throws ArticleReaderException { + + try { + if (!this.lastTaskCompleted) { + return true; + } + + this.keywords.reset(); + + int b = read(); + while (b != -1) { + + if (keywords.check((char) b)) { + switch (keywords.getValue()) { + case KEY_START_PAGE: + // taskStartPosition = bytePosition; + return true; + } + keywords.reset(); + } + + b = read(); + } + + return false; + + } catch (Exception e) { + throw new ArticleReaderException(e); + } + } + + /** + * Reads the header of an article. + * + * @return FALSE if the article was not accepted by the articleFilter + * TRUE if no name checker was used, or if the articleFilter + * accepted the ArticleName + * @throws IOException if an error occurs while reading from the input + * @throws ArticleReaderException if an error occurs while parsing the input + */ + protected boolean readHeader() + throws IOException, ArticleReaderException { + + this.taskHeader = new ArticleInformation(); + + int size, r = read(); + StringBuilder buffer = null; + + while (r != -1) { + + if (buffer != null) { + buffer.append((char) r); + } + + if (this.keywords.check((char) r)) { + switch (this.keywords.getValue()) { + + case KEY_START_TITLE: + case KEY_START_ID: + buffer = new StringBuilder(); + break; + + case KEY_END_TITLE: + size = buffer.length(); + buffer.delete(size + - WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + .length(), size); + + this.taskHeader.setArticleName(buffer.toString()); + if (this.articleFilter != null) { + if (!this.articleFilter + .checkArticle(this.taskHeader.getArticleName())) { + return false; + } + } + + buffer = null; + break; + + case KEY_END_ID: + size = buffer.length(); + buffer.delete( + size + - WikipediaXMLKeys.KEY_END_ID.getKeyword() + .length(), size); + + this.taskHeader.setArticleId(Integer.parseInt(buffer + 
.toString())); + buffer = null; + break; + + case KEY_START_REVISION: + this.keywords.reset(); + return true; + + default: + throw ErrorFactory + .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD); + } + + this.keywords.reset(); + } + + r = read(); + } + + throw ErrorFactory + .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); + } + + /** + * Reads a single revision from an article. + * + * @return Revision + * @throws IOException if an error occurs while reading from the input + * @throws ArticleReaderException if an error occurs while parsing the input + */ + protected Revision readRevision() + throws IOException, ArticleReaderException { + + this.taskRevisionCounter++; + Revision revision = new Revision(this.taskRevisionCounter); + + int size, r = read(); + boolean hasId = false; + + StringBuilder buffer = null; + this.keywords.reset(); + + while (r != -1) { + + if (buffer != null) { + buffer.append((char) r); + } + + if (this.keywords.check((char) r)) { + switch (this.keywords.getValue()) { + + case KEY_START_TEXT: + + case KEY_START_TIMESTAMP: + + case KEY_START_COMMENT: + + case KEY_START_CONTRIBUTOR: + buffer = new StringBuilder(); + break; + + case KEY_START_ID: + if (!hasId) { + buffer = new StringBuilder(); + } + break; + + case KEY_END_ID: + if (!hasId) { + size = buffer.length(); + buffer.delete(size + - WikipediaXMLKeys.KEY_END_ID.getKeyword() + .length(), size); + + revision.setRevisionID(Integer.parseInt(buffer + .toString())); + buffer = null; + + hasId = true; + } + break; + + case KEY_END_TIMESTAMP: + size = buffer.length(); + buffer.delete(size + - WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + .length(), size); + + revision.setTimeStamp(buffer.toString()); + buffer = null; + break; + + case KEY_END_TEXT: + size = buffer.length(); + buffer.delete(size + - WikipediaXMLKeys.KEY_END_TEXT.getKeyword() + .length(), size); + + 
revision.setRevisionText(buffer.toString()); + buffer = null; + break; + + case KEY_END_COMMENT: + size = buffer.length(); + buffer.delete(size + - WikipediaXMLKeys.KEY_END_COMMENT.getKeyword() + .length(), size); + //escape comment string + revision.setComment(SQLEscape.escape(buffer.toString())); + buffer = null; + break; + + case KEY_END_CONTRIBUTOR: + size = buffer.length(); + buffer.delete(size + - WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword() + .length(), size); + //escape id string + readContributor(revision, buffer.toString()); + buffer = null; + break; + + case KEY_MINOR_FLAG: + revision.setMinor(true); + buffer = null; + break; + + case KEY_END_REVISION: + this.keywords.reset(); + return revision; + + //the following cases are handeled in readContributor() + //they can be skipped here + case KEY_START_IP: + case KEY_END_IP: + case KEY_START_USERNAME: + case KEY_END_USERNAME: + break; + + default: + System.out.println(keywords.getValue()); + throw ErrorFactory + .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD); + } + + this.keywords.reset(); + } + + r = read(); + } + + throw ErrorFactory + .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); + } + + /** + * Parses the content within the contributor tags and adds the + * parsed info to the provided revision object. 
+ * + * @param rev the revision object to store the parsed info in + * @param str the contributor data to be parsed + * @throws IOException + * @throws ArticleReaderException + */ + protected void readContributor(Revision rev, String str) throws IOException, ArticleReaderException { + char[] contrChars = str.toCharArray(); + int size; + + StringBuilder buffer = null; + this.keywords.reset(); + + for (char curChar : contrChars) { + + if (buffer != null) { + buffer.append(curChar); + } + + if (this.keywords.check(curChar)) { + + switch (this.keywords.getValue()) { + + case KEY_START_ID: + case KEY_START_IP: + case KEY_START_USERNAME: + buffer = new StringBuilder(); + break; + + case KEY_END_IP: + size = buffer.length(); + buffer.delete(size + - WikipediaXMLKeys.KEY_END_IP.getKeyword() + .length(), size); + // escape id string + rev.setContributorName(SQLEscape.escape(buffer.toString())); + rev.setContributorIsRegistered(false); + buffer = null; + break; + + case KEY_END_USERNAME: + size = buffer.length(); + buffer.delete(size + - WikipediaXMLKeys.KEY_END_USERNAME.getKeyword() + .length(), size); + // escape id string + rev.setContributorName(SQLEscape.escape(buffer.toString())); + rev.setContributorIsRegistered(true); + buffer = null; + break; + + case KEY_END_ID: + size = buffer.length(); + buffer.delete(size + - WikipediaXMLKeys.KEY_END_ID.getKeyword() + .length(), size); + String id = buffer.toString(); + if (!id.isEmpty()) { + rev.setContributorId(Integer.parseInt(buffer.toString())); + } + buffer = null; + break; + } + } + } + } + + /** + * Returns the next RevisionTask. + * + * @return RevisionTask. 
+ * @throws ArticleReaderException if the parsing of the input fails + */ + public Task next() + throws ArticleReaderException { + + try { + this.keywords.reset(); + + // if new article read header, otherwise use old one + if (this.lastTaskCompleted) { + this.lastTaskCompleted = false; + + this.taskPartCounter = 1; + this.taskRevisionCounter = -1; + + if (!readHeader()) { + + this.lastTaskCompleted = true; + return null; + + } + } else { + this.taskPartCounter++; + } + + Task task = new Task<>(this.taskHeader, this.taskPartCounter); - task.add(readRevision()); - - int r = read(); - while (r != -1) { - if (this.keywords.check((char) r)) { - - switch (this.keywords.getValue()) { - - case KEY_START_REVISION: - - if (task.byteSize() >= LIMIT_TASK_SIZE_REVISIONS) { - this.lastTaskCompleted = false; - - if (this.taskPartCounter == 1) { - task.setTaskType(TaskTypes.TASK_PARTIAL_FIRST); - } - else { - task.setTaskType(TaskTypes.TASK_PARTIAL); - } - - return task; - } - - task.add(readRevision()); - break; - - case KEY_END_PAGE: - - this.lastTaskCompleted = true; - if (this.taskPartCounter > 1) { - task.setTaskType(TaskTypes.TASK_PARTIAL_LAST); - } - - return task; - - default: - throw new IOException(); - } - - this.keywords.reset(); - } - - r = read(); - } - - throw ErrorFactory - .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); - - } - catch (ArticleReaderException e) { - throw e; - } - catch (Exception e) { - throw new ArticleReaderException(e); - } - } - - /** - * Resets the task processing status of the ArticleReader. - * - * This method has to be called if the hasNext() or next() methods throw an - * exception. - */ - public void resetTaskCompleted() - { - this.lastTaskCompleted = true; - } - - /** - * Returns the number of bytes that the ArticleReader has processed. 
- * - * @return number of bytes (current position in the file / archive) - */ - public long getBytePosition() - { - return this.bytePosition; - } + task.add(readRevision()); + + int r = read(); + while (r != -1) { + if (this.keywords.check((char) r)) { + + switch (this.keywords.getValue()) { + + case KEY_START_REVISION: + + if (task.byteSize() >= LIMIT_TASK_SIZE_REVISIONS) { + this.lastTaskCompleted = false; + + if (this.taskPartCounter == 1) { + task.setTaskType(TaskTypes.TASK_PARTIAL_FIRST); + } else { + task.setTaskType(TaskTypes.TASK_PARTIAL); + } + + return task; + } + + task.add(readRevision()); + break; + + case KEY_END_PAGE: + + this.lastTaskCompleted = true; + if (this.taskPartCounter > 1) { + task.setTaskType(TaskTypes.TASK_PARTIAL_LAST); + } + + return task; + + default: + throw new IOException(); + } + + this.keywords.reset(); + } + + r = read(); + } + + throw ErrorFactory + .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); + + } catch (ArticleReaderException e) { + throw e; + } catch (Exception e) { + throw new ArticleReaderException(e); + } + } + + /** + * Resets the task processing status of the ArticleReader. + *

+ * This method has to be called if the hasNext() or next() methods throw an + * exception. + */ + public void resetTaskCompleted() { + this.lastTaskCompleted = true; + } + + /** + * Returns the number of bytes that the ArticleReader has processed. + * + * @return number of bytes (current position in the file / archive) + */ + public long getBytePosition() { + return this.bytePosition; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/DiffCalculatorInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/DiffCalculatorInterface.java index dd835033..2e7336fb 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/DiffCalculatorInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/DiffCalculatorInterface.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,51 +29,41 @@ /** * The DiffCalculatorInterface represents the interface to the diff processing * unit. - * + *

* Please notice that there is no default method to return the generated diff. * The currently implementation uses the TaskTransmitterInterface (given as * parameter of the constructor) to send the diffed data to the DiffProducer. - * - * - * */ -public interface DiffCalculatorInterface -{ +public interface DiffCalculatorInterface { - /** - * This method process the given task to generate the diff. - * - * @param task - * RevisionTask - * - * @throws DiffException - * if the diff process fails - * - * @throws TimeoutException - * if the TaskTransmitter times out during the transmission of - * the task to the DiffProducer. - * - * @throws UnsupportedEncodingException - * if the CharacterSet defined in the configuration is not - * supported by JAVA. - */ + /** + * This method process the given task to generate the diff. + * + * @param task RevisionTask + * @throws DiffException if the diff process fails + * @throws TimeoutException if the TaskTransmitter times out during the transmission of + * the task to the DiffProducer. + * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not + * supported by JAVA. + */ void process(final Task task) - throws DiffException, TimeoutException, UnsupportedEncodingException; + throws DiffException, TimeoutException, UnsupportedEncodingException; - /** - * This method is used to delete all information concerning the partial task - * processing. - * - * This method has to be called if the process method throws an exception. - */ + /** + * This method is used to delete all information concerning the partial task + * processing. + *

+ * This method has to be called if the process method throws an exception. + */ void reset(); - - - /** - * Close Stream of Transmitter - * @throws IOException - * @throws SQLException - */ + + + /** + * Close Stream of Transmitter + * + * @throws IOException + * @throws SQLException + */ void closeTransmitter() throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/TaskTransmitterInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/TaskTransmitterInterface.java index 56fba720..191f6f52 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/TaskTransmitterInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/TaskTransmitterInterface.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,47 +27,38 @@ /** * The TaskTransmitterInterface handles the transmission of DiffTasks to the * DiffProducer. - * - * - * */ -public interface TaskTransmitterInterface -{ +public interface TaskTransmitterInterface { + + /** + * Sends the given task to the DiffProducer - FullTaskPool. 
+ * + * @param result DiffTask of type TaskTypes.FULL_TASK or + * TaskTypes.PARTIAL_TASK_FIRST + * @throws TimeoutException if the TaskTransmitter times out during the transmission of + * the task to the DiffProducer. + */ + void transmitDiff(final Task result) + throws TimeoutException; + + /** + * Sends the given task to the DiffProducer - PartialTaskPool. + * + * @param result DiffTask of type TaskTypes.PARTIAL_TASK or + * TaskTypes.PARTIAL_TASK_LAST + * @throws TimeoutException if the TaskTransmitter times out during the transmission of + * the task to the DiffProducer. + */ + void transmitPartialDiff(final Task result) + throws TimeoutException; - /** - * Sends the given task to the DiffProducer - FullTaskPool. - * - * @param result - * DiffTask of type TaskTypes.FULL_TASK or - * TaskTypes.PARTIAL_TASK_FIRST - * - * @throws TimeoutException - * if the TaskTransmitter times out during the transmission of - * the task to the DiffProducer. - */ - void transmitDiff(final Task result) - throws TimeoutException; - /** - * Sends the given task to the DiffProducer - PartialTaskPool. - * - * @param result - * DiffTask of type TaskTypes.PARTIAL_TASK or - * TaskTypes.PARTIAL_TASK_LAST - * - * @throws TimeoutException - * if the TaskTransmitter times out during the transmission of - * the task to the DiffProducer. 
- */ - void transmitPartialDiff(final Task result) - throws TimeoutException; - - - /** - * Close stream - * @throws IOException - * @throws SQLException - */ + /** + * Close stream + * + * @throws IOException + * @throws SQLException + */ void close() throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagement.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagement.java index 0440b089..25275ff0 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagement.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagement.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -33,328 +33,300 @@ /** * The BlockManagement class is used to calculate the diff operations using the * blocks of the longest common substring search. 
- * - * - * */ public class BlockManagement - implements BlockManagementInterface -{ + implements BlockManagementInterface { + + /** + * Configuration parameter - Charset name of the input data + */ + private static String WIKIPEDIA_ENCODING; + + /** + * Temporary variable - Just in Time revision + */ + private StringBuilder version; + + /** + * Temporary variable - Diff + */ + private Diff diff; + + /** + * Temporary variable - Storage for intermediate blocks + */ + private Map bufferMap; + + /** + * Reference to the codec + */ + private RevisionCodecData codecData; + + /** + * (Constructor) Creates a BlockManagement object. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + public BlockManagement() + throws ConfigurationException { + + ConfigurationManager config = ConfigurationManager.getInstance(); + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + } + + /* + * (non-Javadoc) + * + * @see de.tudarmstadt.ukp.kulessa.delta.consumers.diff.calculation. 
+ * BlockManagementInterface#manage(char[], char[], java.util.ArrayList, + * java.util.ArrayList) + */ + public Diff manage(char[] revA, char[] revB, ArrayList queueA, + ArrayList queueB) + throws UnsupportedEncodingException { + + this.diff = new Diff(); + this.codecData = new RevisionCodecData(); + + this.bufferMap = new HashMap<>(); + this.version = new StringBuilder(); + + DiffBlock curA = null, curB = null; + while (!queueA.isEmpty() || !queueB.isEmpty() || curB != null) { + + if (!queueA.isEmpty() && curA == null) { + curA = queueA.remove(0); + } + if (!queueB.isEmpty() && curB == null) { + curB = queueB.remove(0); + } + + if (curA != null && curB != null) { + + if (curA.getId() == curB.getId()) { + + if (curA.getId() == -1) { + replace(revA, revB, curA, curB); + } else { + version.append(copy(revA, curA.getRevAStart(), + curA.getRevAEnd())); + } + + curA = null; + curB = null; + + } else if (curA.getId() == -1) { + + delete(curA); + curA = null; + + } else if (curB.getId() == -1) { + + insert(revB, curB); + curB = null; + + } else { + + // Difference :( + if (bufferMap.containsKey(curB.getId())) { + + paste(curB); + curB = null; + + } else { + + cut(revA, curA); + curA = null; + + // System.out.println("@TO CUT: " + curA.getId() + "\t<" + // + text + ">"); + } + } + + } else if (curA != null) { + + delete(curA); + curA = null; + + } else if (curB != null) { + + // Difference :( + if (bufferMap.containsKey(curB.getId())) { + + paste(curB); + curB = null; + + } else { + + insert(revB, curB); + curB = null; + } + + } else { + System.err.println("INVALID CASE"); + System.exit(-1); + } + } + + diff.setCodecData(codecData); + return diff; + } + + /*-PRIVATE-METHODS----------------------------------------------------------*/ + + /** + * Copies the specified interval of characters for the array. 
+ * + * @return specified interval + */ + private String copy(final char[] array, final int start, final int end) { + StringBuilder text = new StringBuilder(); + for (int j = start; j < end; j++) { + text.append(array[j]); + } + + return text.toString(); + } + + /** + * Creates an insert operation. + * + * @param revB revision B + * @param curB Reference to the block B + * @throws UnsupportedEncodingException if the character encoding is unsupported + */ + private void insert(final char[] revB, final DiffBlock curB) + throws UnsupportedEncodingException { + + String text = copy(revB, curB.getRevBStart(), curB.getRevBEnd()); + + // Insert (C S L T) + DiffPart action = new DiffPart(DiffAction.INSERT); + + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); + + // L T + action.setText(text); + codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); + + diff.add(action); + + version.append(text); + } + + /** + * Creates a delete operation. + * + * @param curA Reference to the block A + * @throws UnsupportedEncodingException if the character encoding is unsupported + */ + private void delete(final DiffBlock curA) { + + // Delete (C S E) + DiffPart action = new DiffPart(DiffAction.DELETE); + + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); + + // E + action.setLength(curA.getRevAEnd() - curA.getRevAStart()); + codecData.checkBlocksizeE(action.getLength()); + + diff.add(action); + } + + /** + * Creates a replace operation. 
+ * + * @param revA Reference to revision A + * @param revB Reference to revision B + * @param curA Reference to current block A + * @param curB Reference to current block B + * @throws UnsupportedEncodingException if the character encoding is unsupported + */ + private void replace(final char[] revA, final char[] revB, + final DiffBlock curA, final DiffBlock curB) + throws UnsupportedEncodingException { + + // Replace (C S E L T) + String text = copy(revB, curB.getRevBStart(), curB.getRevBEnd()); + + DiffPart action = new DiffPart(DiffAction.REPLACE); + + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); + + // E + action.setLength(curA.getRevAEnd() - curA.getRevAStart()); + codecData.checkBlocksizeE(action.getLength()); + + // L T + action.setText(text); + codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); + + diff.add(action); + + version.append(text); + } + + /** + * Creates a cut operation. + * + * @param revA Reference to revision A + * @param curA Reference to current block A + */ + private void cut(final char[] revA, final DiffBlock curA) { - /** Configuration parameter - Charset name of the input data */ - private static String WIKIPEDIA_ENCODING; + String text = copy(revA, curA.getRevAStart(), curA.getRevAEnd()); - /** Temporary variable - Just in Time revision */ - private StringBuilder version; + // Cut (C S E B) + DiffPart action = new DiffPart(DiffAction.CUT); - /** Temporary variable - Diff */ - private Diff diff; + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); - /** Temporary variable - Storage for intermediate blocks */ - private Map bufferMap; + // E + action.setLength(curA.getRevAEnd() - curA.getRevAStart()); + codecData.checkBlocksizeE(action.getLength()); - /** Reference to the codec */ - private RevisionCodecData codecData; + // B + action.setText(Integer.toString(curA.getId())); + codecData.checkBlocksizeB(curA.getId()); - /** - * (Constructor) 
Creates a BlockManagement object. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - public BlockManagement() - throws ConfigurationException - { + diff.add(action); - ConfigurationManager config = ConfigurationManager.getInstance(); - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + bufferMap.put(curA.getId(), text); + } - } + /** + * Creates a paste operation. + * + * @param curB Reference to current block B + */ + private void paste(final DiffBlock curB) { - /* - * (non-Javadoc) - * - * @see de.tudarmstadt.ukp.kulessa.delta.consumers.diff.calculation. - * BlockManagementInterface#manage(char[], char[], java.util.ArrayList, - * java.util.ArrayList) - */ - public Diff manage(char[] revA, char[] revB, ArrayList queueA, - ArrayList queueB) - throws UnsupportedEncodingException - { + String text = bufferMap.remove(curB.getId()); - this.diff = new Diff(); - this.codecData = new RevisionCodecData(); + // Paste (C S B) + DiffPart action = new DiffPart(DiffAction.PASTE); - this.bufferMap = new HashMap<>(); - this.version = new StringBuilder(); + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); - DiffBlock curA = null, curB = null; - while (!queueA.isEmpty() || !queueB.isEmpty() || curB != null) { + // B + action.setText(Integer.toString(curB.getId())); + codecData.checkBlocksizeB(curB.getId()); - if (!queueA.isEmpty() && curA == null) { - curA = queueA.remove(0); - } - if (!queueB.isEmpty() && curB == null) { - curB = queueB.remove(0); - } + diff.add(action); - if (curA != null && curB != null) { - - if (curA.getId() == curB.getId()) { - - if (curA.getId() == -1) { - replace(revA, revB, curA, curB); - } - else { - version.append(copy(revA, curA.getRevAStart(), - curA.getRevAEnd())); - } - - curA = null; - curB = null; - - } - else if (curA.getId() == -1) { - - delete(curA); - curA = null; - - } - else if (curB.getId() == -1) { - 
- insert(revB, curB); - curB = null; - - } - else { - - // Difference :( - if (bufferMap.containsKey(curB.getId())) { - - paste(curB); - curB = null; - - } - else { - - cut(revA, curA); - curA = null; - - // System.out.println("@TO CUT: " + curA.getId() + "\t<" - // + text + ">"); - } - } - - } - else if (curA != null) { - - delete(curA); - curA = null; - - } - else if (curB != null) { - - // Difference :( - if (bufferMap.containsKey(curB.getId())) { - - paste(curB); - curB = null; - - } - else { - - insert(revB, curB); - curB = null; - } - - } - else { - System.err.println("INVALID CASE"); - System.exit(-1); - } - } - - diff.setCodecData(codecData); - return diff; - } - - /*-PRIVATE-METHODS----------------------------------------------------------*/ - - /** - * Copies the specified interval of characters for the array. - * - * @return specified interval - */ - private String copy(final char[] array, final int start, final int end) - { - StringBuilder text = new StringBuilder(); - for (int j = start; j < end; j++) { - text.append(array[j]); - } - - return text.toString(); - } - - /** - * Creates an insert operation. - * - * @param revB - * revision B - * @param curB - * Reference to the block B - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - */ - private void insert(final char[] revB, final DiffBlock curB) - throws UnsupportedEncodingException - { - - String text = copy(revB, curB.getRevBStart(), curB.getRevBEnd()); - - // Insert (C S L T) - DiffPart action = new DiffPart(DiffAction.INSERT); - - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); - - // L T - action.setText(text); - codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); - - diff.add(action); - - version.append(text); - } - - /** - * Creates a delete operation. 
- * - * @param curA - * Reference to the block A - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - */ - private void delete(final DiffBlock curA) - { - - // Delete (C S E) - DiffPart action = new DiffPart(DiffAction.DELETE); - - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); - - // E - action.setLength(curA.getRevAEnd() - curA.getRevAStart()); - codecData.checkBlocksizeE(action.getLength()); - - diff.add(action); - } - - /** - * Creates a replace operation. - * - * @param revA - * Reference to revision A - * @param revB - * Reference to revision B - * @param curA - * Reference to current block A - * @param curB - * Reference to current block B - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - */ - private void replace(final char[] revA, final char[] revB, - final DiffBlock curA, final DiffBlock curB) - throws UnsupportedEncodingException - { - - // Replace (C S E L T) - String text = copy(revB, curB.getRevBStart(), curB.getRevBEnd()); - - DiffPart action = new DiffPart(DiffAction.REPLACE); - - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); - - // E - action.setLength(curA.getRevAEnd() - curA.getRevAStart()); - codecData.checkBlocksizeE(action.getLength()); - - // L T - action.setText(text); - codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); - - diff.add(action); - - version.append(text); - } - - /** - * Creates a cut operation. 
- * - * @param revA - * Reference to revision A - * @param curA - * Reference to current block A - */ - private void cut(final char[] revA, final DiffBlock curA) - { - - String text = copy(revA, curA.getRevAStart(), curA.getRevAEnd()); - - // Cut (C S E B) - DiffPart action = new DiffPart(DiffAction.CUT); - - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); - - // E - action.setLength(curA.getRevAEnd() - curA.getRevAStart()); - codecData.checkBlocksizeE(action.getLength()); - - // B - action.setText(Integer.toString(curA.getId())); - codecData.checkBlocksizeB(curA.getId()); - - diff.add(action); - - bufferMap.put(curA.getId(), text); - } - - /** - * Creates a paste operation. - * - * @param curB - * Reference to current block B - */ - private void paste(final DiffBlock curB) - { - - String text = bufferMap.remove(curB.getId()); - - // Paste (C S B) - DiffPart action = new DiffPart(DiffAction.PASTE); - - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); - - // B - action.setText(Integer.toString(curB.getId())); - codecData.checkBlocksizeB(curB.getId()); - - diff.add(action); - - version.append(text); - } + version.append(text); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagementInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagementInterface.java index b58d7902..ea25a1e9 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagementInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagementInterface.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,31 +24,21 @@ /** * Interface of the BlockManagement - * - * - * */ -public interface BlockManagementInterface -{ +public interface BlockManagementInterface { - /** - * Uses the substring blocks to create the diff operations. - * - * @param revA - * revision A - * @param revB - * revision B - * @param queueA - * queue A - * @param queueB - * queue B - * @return Diff - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - */ + /** + * Uses the substring blocks to create the diff operations. 
+ * + * @param revA revision A + * @param revB revision B + * @param queueA queue A + * @param queueB queue B + * @return Diff + * @throws UnsupportedEncodingException if the character encoding is unsupported + */ Diff manage(final char[] revA, final char[] revB, final ArrayList queueA, final ArrayList queueB) - throws UnsupportedEncodingException; + throws UnsupportedEncodingException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffBlock.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffBlock.java index c0e2698f..3a5c01b3 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffBlock.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffBlock.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,145 +19,134 @@ /** * Contains the information for a block. Used for the Diff Calculation. 
- * - * - * */ public class DiffBlock - implements Comparable -{ - - /** Block ID */ - private final int id; - - /** Start position in revision A */ - private final int revAStart; - - /** End position in revision A */ - private final int revAEnd; - - /** Start position in revision B */ - private final int revBStart; - - /** End position in revision B */ - private final int revBEnd; - - /** - * Flag, indicating the sorting order TRUE sorting after the start position - * of revision A FALSE sorting after the start position of revision B - */ - private final boolean ab; - - /** - * (DiffBlock) Creates a new DiffBlock. - * - * @param id - * ID of the block - * @param revAStart - * start position of revision A - * @param revAEnd - * end position of revision A - * @param revBStart - * start position of revision B - * @param revBEnd - * end position of revision B - * @param ab - * sorting order flag - */ - public DiffBlock(final int id, final int revAStart, final int revAEnd, - final int revBStart, final int revBEnd, final boolean ab) - { - this.id = id; - this.revAStart = revAStart; - this.revAEnd = revAEnd; - this.revBStart = revBStart; - this.revBEnd = revBEnd; - this.ab = ab; - } - - /** - * Compares the positions of both blocks. - * - * @param b - * Block - */ - public int compareTo(final DiffBlock b) - { - if (ab) { - return this.revAStart - b.revAStart; - } - else { - return this.revBStart - b.revBStart; - } - } - - /** - * Returns whether the block is valid or not. - * - * @return TRUE if the block has a ID of the value -1 FALSE otherwise - */ - public boolean isUnknown() - { - return (id == -1); - } - - /** - * Returns the ID of this block. - * - * @return string representation - */ - public String toString() - { - return Integer.toString(id); - } - - /** - * Returns the ID of this block. - * - * @return ID of this block - */ - public int getId() - { - return id; - } - - /** - * Returns the end position of the block in revision A. 
- * - * @return end position revision A - */ - public int getRevAEnd() - { - return revAEnd; - } - - /** - * Returns the start position of the block in revision A. - * - * @return start position revision A - */ - public int getRevAStart() - { - return revAStart; - } - - /** - * Returns the end position of the block in revision B. - * - * @return end position revision B - */ - public int getRevBEnd() - { - return revBEnd; - } - - /** - * Returns the start position of the block in revision B. - * - * @return start position revision B - */ - public int getRevBStart() - { - return revBStart; - } + implements Comparable { + + /** + * Block ID + */ + private final int id; + + /** + * Start position in revision A + */ + private final int revAStart; + + /** + * End position in revision A + */ + private final int revAEnd; + + /** + * Start position in revision B + */ + private final int revBStart; + + /** + * End position in revision B + */ + private final int revBEnd; + + /** + * Flag, indicating the sorting order TRUE sorting after the start position + * of revision A FALSE sorting after the start position of revision B + */ + private final boolean ab; + + /** + * (DiffBlock) Creates a new DiffBlock. + * + * @param id ID of the block + * @param revAStart start position of revision A + * @param revAEnd end position of revision A + * @param revBStart start position of revision B + * @param revBEnd end position of revision B + * @param ab sorting order flag + */ + public DiffBlock(final int id, final int revAStart, final int revAEnd, + final int revBStart, final int revBEnd, final boolean ab) { + this.id = id; + this.revAStart = revAStart; + this.revAEnd = revAEnd; + this.revBStart = revBStart; + this.revBEnd = revBEnd; + this.ab = ab; + } + + /** + * Compares the positions of both blocks. 
+ * + * @param b Block + */ + public int compareTo(final DiffBlock b) { + if (ab) { + return this.revAStart - b.revAStart; + } else { + return this.revBStart - b.revBStart; + } + } + + /** + * Returns whether the block is valid or not. + * + * @return TRUE if the block has a ID of the value -1 FALSE otherwise + */ + public boolean isUnknown() { + return (id == -1); + } + + /** + * Returns the ID of this block. + * + * @return string representation + */ + public String toString() { + return Integer.toString(id); + } + + /** + * Returns the ID of this block. + * + * @return ID of this block + */ + public int getId() { + return id; + } + + /** + * Returns the end position of the block in revision A. + * + * @return end position revision A + */ + public int getRevAEnd() { + return revAEnd; + } + + /** + * Returns the start position of the block in revision A. + * + * @return start position revision A + */ + public int getRevAStart() { + return revAStart; + } + + /** + * Returns the end position of the block in revision B. + * + * @return end position revision B + */ + public int getRevBEnd() { + return revBEnd; + } + + /** + * Returns the start position of the block in revision B. + * + * @return start position revision B + */ + public int getRevBStart() { + return revBStart; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffCalculator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffCalculator.java index 83af4c5a..c068bae8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffCalculator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffCalculator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -46,709 +46,702 @@ /** * Calculates the Diff. - * - * - * */ public class DiffCalculator - implements DiffCalculatorInterface -{ - - /** - * Configuration parameter - Flag, which indicates whether debug output is - * enabled or not - */ - private final boolean MODE_DEBUG_OUTPUT_ACTIVATED; - - /** Configuration parameter - Path for the DiffTool logger */ - private final String LOGGING_PATH_DIFFTOOL; - - /** Configuration parameter - Path for the debug logger */ - private final String LOGGING_PATH_DEBUG; - - /** Configuration parameter - Each x-th version is a full revision */ - private final int COUNTER_FULL_REVISION; - - /** Configuration parameter - Maximum size of a diff statement */ - private final long LIMIT_TASK_SIZE_DIFFS; - - /** Configuration parameter - Charset name of the input data */ - private final String WIKIPEDIA_ENCODING; - - /** - * Configuration parameter - Flag, which indicates whether the verification - * of the diff is enabled or not - */ - private final boolean VERIFICATION_DIFF; - - /** Configuration parameter - Value of the minimum legal substring */ - private final int VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING; - - /** Configuration parameter - Surrogate Mode */ - private final SurrogateModes MODE_SURROGATES; - - /** Reference to the TransTransmitter */ - private final TaskTransmitterInterface 
taskTransmitter; - - /** Reference to the BlockManager */ - private final BlockManagementInterface blocks; - - @Override - public void closeTransmitter() throws IOException, SQLException { - this.taskTransmitter.close(); - } - - /** - * (Constructor) Creates a new DiffCalculator object. - * - * @param taskTransmitter - * Reference to the TaskTransmitter - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - public DiffCalculator(final TaskTransmitterInterface taskTransmitter) - throws ConfigurationException - { - this.taskTransmitter = taskTransmitter; - this.blocks = new BlockManagement(); - - this.articleID = -1; - this.partCounter = 0; - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - MODE_DEBUG_OUTPUT_ACTIVATED = (Boolean) config - .getConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT); + implements DiffCalculatorInterface { + + /** + * Configuration parameter - Flag, which indicates whether debug output is + * enabled or not + */ + private final boolean MODE_DEBUG_OUTPUT_ACTIVATED; + + /** + * Configuration parameter - Path for the DiffTool logger + */ + private final String LOGGING_PATH_DIFFTOOL; + + /** + * Configuration parameter - Path for the debug logger + */ + private final String LOGGING_PATH_DEBUG; + + /** + * Configuration parameter - Each x-th version is a full revision + */ + private final int COUNTER_FULL_REVISION; + + /** + * Configuration parameter - Maximum size of a diff statement + */ + private final long LIMIT_TASK_SIZE_DIFFS; + + /** + * Configuration parameter - Charset name of the input data + */ + private final String WIKIPEDIA_ENCODING; + + /** + * Configuration parameter - Flag, which indicates whether the verification + * of the diff is enabled or not + */ + private final boolean VERIFICATION_DIFF; + + /** + * Configuration parameter - Value of the minimum legal substring + */ + private final int 
VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING; + + /** + * Configuration parameter - Surrogate Mode + */ + private final SurrogateModes MODE_SURROGATES; + + /** + * Reference to the TransTransmitter + */ + private final TaskTransmitterInterface taskTransmitter; + + /** + * Reference to the BlockManager + */ + private final BlockManagementInterface blocks; + + @Override + public void closeTransmitter() throws IOException, SQLException { + this.taskTransmitter.close(); + } + + /** + * (Constructor) Creates a new DiffCalculator object. + * + * @param taskTransmitter Reference to the TaskTransmitter + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + public DiffCalculator(final TaskTransmitterInterface taskTransmitter) + throws ConfigurationException { + this.taskTransmitter = taskTransmitter; + this.blocks = new BlockManagement(); + + this.articleID = -1; + this.partCounter = 0; + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + MODE_DEBUG_OUTPUT_ACTIVATED = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT); + + LOGGING_PATH_DIFFTOOL = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); + + LOGGING_PATH_DEBUG = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); + + COUNTER_FULL_REVISION = (Integer) config + .getConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION); + + LIMIT_TASK_SIZE_DIFFS = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + VERIFICATION_DIFF = (Boolean) config + .getConfigParameter(ConfigurationKeys.VERIFICATION_DIFF); + + VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING = (Integer) config + .getConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING); + + MODE_SURROGATES = (SurrogateModes) config + 
.getConfigParameter(ConfigurationKeys.MODE_SURROGATES); + } + + /*--------------------------------------------------------------------------*/ + + /** + * Temporary variable - ID of the currently processed article + */ + private int articleID; + + /** + * Temporary variable - Storage for the diffs + */ + private Task result; + + /** + * Temporary variable - Revision Counter + */ + private int revisionCounter; + + /** + * Temporary variable - Part Counter + */ + private int partCounter; + + /** + * Temporary variable - Diff Part + */ + private DiffPart part; + + /** + * Temporary variable - content + */ + private String text; + + /** + * Temporary variable - previous revision + */ + private char[] revPrevious; + + /** + * Temporary variable - current revision + */ + private char[] revCurrent; + + /** + * Temporary variable - temporary revision + */ + private char[] revTemp; + + /** + * Temporary variable - Block Counter + */ + private int blockCount; + + /** + * Temporary variable - Used to mark used characters of the previous + * revision + */ + private boolean[] revABlocked; + + /** + * Temporary variable - Used to mark used characters of the current revision + */ + private boolean[] revBBlocked; + + /** + * Temporary variable - Mapping of characters and their positions in the + * previous revision + */ + private HashMap> positions; + + /** + * Temporary variable - Queue for blocks of the previous revision + */ + private ArrayList queueA; + + /** + * Temporary variable - Queue for blocks of the current revision + */ + private ArrayList queueB; + + /** + * Temporary variable - size of the longest matching substring + */ + private int longestMatch_size; + + /** + * Temporary variable - start position of the longest matching substring + */ + private int longestMatch_start; + + /*--------------------------------------------------------------------------*/ + + /** + * Initializes the processing of a RevisionTask using a new DiffTask. 
+ * + * @param task Reference to the DiffTask + */ + private void init(final Task task) { + this.partCounter++; + this.result = new Task<>(task.getHeader(), partCounter); + } + + /** + * Initializes the processing of a new RevisionTask. + * + * @param taskID Article ID + */ + protected void initNewTask(final int taskID) { + + this.articleID = taskID; + + this.partCounter = 0; + this.revisionCounter = 0; + + this.revPrevious = null; + this.revCurrent = null; + } + + /** + * Generates a FullRevision. + * + * @param revision Reference to the revision + * @return Diff, containing a FullRevision + * @throws UnsupportedEncodingException if the character encoding is unsupported + */ + private Diff generateFullRevision(final Revision revision) + throws UnsupportedEncodingException { + + Diff diff = new Diff(); + RevisionCodecData codecData = new RevisionCodecData(); + + // FullRevisionUncompressed (C L T) + part = new DiffPart(DiffAction.FULL_REVISION_UNCOMPRESSED); + + // L T + text = revision.getRevisionText(); + revCurrent = text.toCharArray(); + + part.setText(text); + codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); + + diff.add(part); + + diff.setCodecData(codecData); + return diff; + } + + /** + * Transmits a partial DiffTask. + * + * @param result Reference to the DiffTask + * @throws TimeoutException if a timeout occurred + */ + protected void transmitPartialTask(final Task result) + throws TimeoutException { + + if (this.partCounter == 1) { + + this.result.setTaskType(TaskTypes.TASK_PARTIAL_FIRST); + this.taskTransmitter.transmitDiff(result); + + } else { + + this.result.setTaskType(TaskTypes.TASK_PARTIAL); + this.taskTransmitter.transmitPartialDiff(result); + } + } + + /** + * Transmits the DiffTask at the end of the RevisionTask processing. 
+ * + * @param task Reference to the RevisionTask + * @param result Reference to the DiffTask + * @throws TimeoutException if a timeout occurred + */ + protected void transmitAtEndOfTask(final Task task, + final Task result) + throws TimeoutException { + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.partCounter > 1) { + this.result.setTaskType(TaskTypes.TASK_PARTIAL_LAST); + this.taskTransmitter.transmitPartialDiff(result); + } else { + this.result.setTaskType(TaskTypes.TASK_FULL); + this.taskTransmitter.transmitDiff(result); + } + + this.result = null; + } + } + + /** + * Calculates the diff for the given revision. + * + * @param revision Reference to a revision + * @return Diff + * @throws UnsupportedEncodingException if the character encoding is unsupported + */ + protected Diff processRevision(final Revision revision) + throws UnsupportedEncodingException { + + // ----------------------------------------------------// + // ** HERE IS THE POINT TO INCLUDE ADDITIONAL FILTERS // + // TO REMOVE FAULTY REVISIONS FROM FURTHER PROCESSING // + // ----------------------------------------------------// - LOGGING_PATH_DIFFTOOL = (String) config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); - - LOGGING_PATH_DEBUG = (String) config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); - - COUNTER_FULL_REVISION = (Integer) config - .getConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION); - - LIMIT_TASK_SIZE_DIFFS = (Long) config - .getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS); - - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - VERIFICATION_DIFF = (Boolean) config - .getConfigParameter(ConfigurationKeys.VERIFICATION_DIFF); - - VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING = (Integer) config - .getConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING); - - MODE_SURROGATES = (SurrogateModes) 
config - .getConfigParameter(ConfigurationKeys.MODE_SURROGATES); - } - - /*--------------------------------------------------------------------------*/ - - /** Temporary variable - ID of the currently processed article */ - private int articleID; - - /** Temporary variable - Storage for the diffs */ - private Task result; - - /** Temporary variable - Revision Counter */ - private int revisionCounter; - - /** Temporary variable - Part Counter */ - private int partCounter; - - /** Temporary variable - Diff Part */ - private DiffPart part; - - /** Temporary variable - content */ - private String text; - - /** Temporary variable - previous revision */ - private char[] revPrevious; - - /** Temporary variable - current revision */ - private char[] revCurrent; - - /** Temporary variable - temporary revision */ - private char[] revTemp; - - /** Temporary variable - Block Counter */ - private int blockCount; - - /** - * Temporary variable - Used to mark used characters of the previous - * revision - */ - private boolean[] revABlocked; - - /** - * Temporary variable - Used to mark used characters of the current revision - */ - private boolean[] revBBlocked; - - /** - * Temporary variable - Mapping of characters and their positions in the - * previous revision - */ - private HashMap> positions; - - /** Temporary variable - Queue for blocks of the previous revision */ - private ArrayList queueA; - - /** Temporary variable - Queue for blocks of the current revision */ - private ArrayList queueB; - - /** Temporary variable - size of the longest matching substring */ - private int longestMatch_size; - - /** Temporary variable - start position of the longest matching substring */ - private int longestMatch_start; - - /*--------------------------------------------------------------------------*/ - - /** - * Initializes the processing of a RevisionTask using a new DiffTask. 
- * - * @param task - * Reference to the DiffTask - */ - private void init(final Task task) - { - this.partCounter++; - this.result = new Task<>(task.getHeader(), partCounter); - } - - /** - * Initializes the processing of a new RevisionTask. - * - * @param taskID - * Article ID - */ - protected void initNewTask(final int taskID) - { - - this.articleID = taskID; - - this.partCounter = 0; - this.revisionCounter = 0; - - this.revPrevious = null; - this.revCurrent = null; - } - - /** - * Generates a FullRevision. - * - * @param revision - * Reference to the revision - * @return Diff, containing a FullRevision - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - */ - private Diff generateFullRevision(final Revision revision) - throws UnsupportedEncodingException - { - - Diff diff = new Diff(); - RevisionCodecData codecData = new RevisionCodecData(); - - // FullRevisionUncompressed (C L T) - part = new DiffPart(DiffAction.FULL_REVISION_UNCOMPRESSED); - - // L T - text = revision.getRevisionText(); - revCurrent = text.toCharArray(); - - part.setText(text); - codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); - - diff.add(part); - - diff.setCodecData(codecData); - return diff; - } - - /** - * Transmits a partial DiffTask. - * - * @param result - * Reference to the DiffTask - * - * @throws TimeoutException - * if a timeout occurred - */ - protected void transmitPartialTask(final Task result) - throws TimeoutException - { - - if (this.partCounter == 1) { - - this.result.setTaskType(TaskTypes.TASK_PARTIAL_FIRST); - this.taskTransmitter.transmitDiff(result); - - } - else { - - this.result.setTaskType(TaskTypes.TASK_PARTIAL); - this.taskTransmitter.transmitPartialDiff(result); - } - } - - /** - * Transmits the DiffTask at the end of the RevisionTask processing. 
- * - * @param task - * Reference to the RevisionTask - * @param result - * Reference to the DiffTask - * - * @throws TimeoutException - * if a timeout occurred - */ - protected void transmitAtEndOfTask(final Task task, - final Task result) - throws TimeoutException - { - - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { - - if (this.partCounter > 1) { - this.result.setTaskType(TaskTypes.TASK_PARTIAL_LAST); - this.taskTransmitter.transmitPartialDiff(result); - } - else { - this.result.setTaskType(TaskTypes.TASK_FULL); - this.taskTransmitter.transmitDiff(result); - } - - this.result = null; - } - } - - /** - * Calculates the diff for the given revision. - * - * @param revision - * Reference to a revision - * @return Diff - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - */ - protected Diff processRevision(final Revision revision) - throws UnsupportedEncodingException - { - - // ----------------------------------------------------// - // ** HERE IS THE POINT TO INCLUDE ADDITIONAL FILTERS // - // TO REMOVE FAULTY REVISIONS FROM FURTHER PROCESSING // - // ----------------------------------------------------// - - try{ - if(revision.getRevisionText()==null){ - return null; - } - }catch(NullPointerException e){ - return null; - } - - revTemp = revision.getRevisionText().toCharArray(); - - if (MODE_SURROGATES == SurrogateModes.DISCARD_REVISION) { - - // Ignore Revision with surrogate characters - if (Surrogates.scan(revTemp)) { - return null; - } - } - - Diff diff; - - // Full revision - if (revisionCounter % COUNTER_FULL_REVISION == 0) { - - diff = generateFullRevision(revision); - - // Diffed revision - } - else { - - diff = generateDiff(revPrevious, revTemp); - - // if the current revision is identical to the last valid revision - if (diff.size() == 0) { - return null; - } - } - - return diff; - } - - /* - * (non-Javadoc) - * - * @see - * 
de.tud.ukp.kulessa.delta.consumers.diff.calculation.DiffCalculatorInterface - * #process(de.tud.ukp.kulessa.delta.data.Task) - */ - public void process(final Task task) - throws DiffException, TimeoutException, UnsupportedEncodingException - { - // this.startTime = System.currentTimeMillis(); - Revision revision; - - // check if a new task was received - if (articleID != task.getHeader().getArticleId()) { - - // init settings - initNewTask(task.getHeader().getArticleId()); - init(task); - - // check if old task was complete - } - else if (result == null) { - - init(task); - } - - Diff diff; - - // TODO: Chronological order hotfix - - // does not work for articles that are split across multiple tasks - ArrayList list = task.getContainer(); - Collections.sort(list); - - int i, rSize = list.size(); - - for (i = 0; i < rSize; i++) { - - if (result.byteSize() > LIMIT_TASK_SIZE_DIFFS) { - - transmitPartialTask(result); - init(task); - } - - // Store previous revision - revPrevious = revCurrent; - - // Process next revision - revision = list.get(i); - - diff = processRevision(revision); - - if (diff != null) { - - revCurrent = revTemp; - - // Add to result - revisionCounter++; - - diff.setRevisionCoutner(revisionCounter); - diff.setRevisionID(revision.getRevisionID()); - diff.setTimeStamp(revision.getTimeStamp()); - diff.setComment(revision.getComment()); - diff.setContributorName(revision.getContributorName()); - diff.setContributorId(revision.getContributorId()); - diff.setContributorIsRegistered(revision.contributorIsRegistered()); - diff.setMinor(revision.isMinor()); - - result.add(diff); - - // Verification - if (VERIFICATION_DIFF) { - String revC, revP; - try { - revC = String.valueOf(revCurrent); - revP = diff.buildRevision(revPrevious); - - /* - * WRONG LOCATION if (notEqual && MODE_SURROGATES == - * SurrogateModes.REPLACE) { - * - * // TODO: TEST: if (Surrogates.scan(revCurrent)) { - * - * char[] repCurrent = Surrogates.replace(revCurrent); - * char[] repPrevious 
= Surrogates.replace(revPrevious); - * - * revC = String.valueOf(repCurrent); revP = - * diff.buildRevision(repPrevious); - * - * notEqual = !revC.equals(revP); } } - */ - - if (!revC.equals(revP)) { - - if (MODE_DEBUG_OUTPUT_ACTIVATED) { - WikipediaXMLWriter writer = new WikipediaXMLWriter( - LOGGING_PATH_DIFFTOOL - + LOGGING_PATH_DEBUG - + task.getHeader() - .getArticleName() - + ".dbg"); - - writer.writeRevision(task); - writer.close(); - } - - throw ErrorFactory - .createDiffException( - ErrorKeys.DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, - "Reconstruction of " - + task - + " failed at revision " - + revisionCounter + "."); - } - - // Throw again - } - catch (DiffException e) { - throw e; - - // Catch unexpected exceptions - } - catch (Exception e) { - throw ErrorFactory - .createDiffException( - ErrorKeys.DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, - "Reconstruction of " + task - + " failed at revision " - + revisionCounter + ".", e); - } - } - } - } - - transmitAtEndOfTask(task, result); - } - - /** - * Generates a Diff by using the CommonLongestSubstring search. 
- * - * @param revA - * previous revision - * @param revB - * current revision - * @return Diff - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - */ - private Diff generateDiff(final char[] revA, final char[] revB) - throws UnsupportedEncodingException - { - - blockCount = 0; - queueA = new ArrayList<>(); - queueB = new ArrayList<>(); - - revABlocked = new boolean[revA.length]; - revBBlocked = new boolean[revB.length]; - - int revAStartIndex = 0, revAEndIndex = revA.length - 1; - int revBStartIndex = 0, revBEndIndex = revB.length - 1; - - while (revAStartIndex <= revAEndIndex && revBStartIndex <= revBEndIndex - && revA[revAStartIndex] == revB[revBStartIndex]) { - - revABlocked[revAStartIndex] = true; - revBBlocked[revBStartIndex] = true; - revAStartIndex++; - revBStartIndex++; - } - - // First Block - if (revAStartIndex != 0) { - queueA.add(new DiffBlock(this.blockCount, 0, revAStartIndex, 0, - revBStartIndex, true)); - queueB.add(new DiffBlock(this.blockCount, 0, revAStartIndex, 0, - revBStartIndex, false)); - this.blockCount++; - } - - while (revAStartIndex < revAEndIndex && revBStartIndex < revBEndIndex - && revA[revAEndIndex] == revB[revBEndIndex]) { - - revABlocked[revAEndIndex] = true; - revBBlocked[revBEndIndex] = true; - revAEndIndex--; - revBEndIndex--; - } - - // Last Block - if (revAEndIndex + 1 != revA.length) { - queueA.add(new DiffBlock(this.blockCount, revAEndIndex + 1, - revA.length, revBEndIndex + 1, revB.length, true)); - queueB.add(new DiffBlock(this.blockCount, revAEndIndex + 1, - revA.length, revBEndIndex + 1, revB.length, false)); - this.blockCount++; - } - - scan(revA, revAStartIndex, revAEndIndex); - - ArrayList list; - char c; - - int i = revBStartIndex; - while (i < revBEndIndex) { - - c = revB[i]; - list = positions.get(c); - - if (list != null && findLongestMatch(revA, list, revB, i)) { - - i += longestMatch_size; - } - else { - i++; - } - } - - int j; - for (i = revAStartIndex; i <= 
revAEndIndex; i++) { - if (!revABlocked[i]) { - j = i; - while (i + 1 <= revAEndIndex && !revABlocked[++i]) { + try { + if (revision.getRevisionText() == null) { + return null; + } + } catch (NullPointerException e) { + return null; + } + + revTemp = revision.getRevisionText().toCharArray(); + + if (MODE_SURROGATES == SurrogateModes.DISCARD_REVISION) { + + // Ignore Revision with surrogate characters + if (Surrogates.scan(revTemp)) { + return null; + } + } + + Diff diff; + + // Full revision + if (revisionCounter % COUNTER_FULL_REVISION == 0) { + + diff = generateFullRevision(revision); + + // Diffed revision + } else { + + diff = generateDiff(revPrevious, revTemp); + + // if the current revision is identical to the last valid revision + if (diff.size() == 0) { + return null; + } + } + + return diff; + } + + /* + * (non-Javadoc) + * + * @see + * de.tud.ukp.kulessa.delta.consumers.diff.calculation.DiffCalculatorInterface + * #process(de.tud.ukp.kulessa.delta.data.Task) + */ + public void process(final Task task) + throws DiffException, TimeoutException, UnsupportedEncodingException { + // this.startTime = System.currentTimeMillis(); + Revision revision; + + // check if a new task was received + if (articleID != task.getHeader().getArticleId()) { + + // init settings + initNewTask(task.getHeader().getArticleId()); + init(task); + + // check if old task was complete + } else if (result == null) { + + init(task); + } + + Diff diff; + + // TODO: Chronological order hotfix - + // does not work for articles that are split across multiple tasks + ArrayList list = task.getContainer(); + Collections.sort(list); + + int i, rSize = list.size(); + + for (i = 0; i < rSize; i++) { + + if (result.byteSize() > LIMIT_TASK_SIZE_DIFFS) { + + transmitPartialTask(result); + init(task); + } + + // Store previous revision + revPrevious = revCurrent; + + // Process next revision + revision = list.get(i); + + diff = processRevision(revision); + + if (diff != null) { + + revCurrent = 
revTemp; + + // Add to result + revisionCounter++; + + diff.setRevisionCoutner(revisionCounter); + diff.setRevisionID(revision.getRevisionID()); + diff.setTimeStamp(revision.getTimeStamp()); + diff.setComment(revision.getComment()); + diff.setContributorName(revision.getContributorName()); + diff.setContributorId(revision.getContributorId()); + diff.setContributorIsRegistered(revision.contributorIsRegistered()); + diff.setMinor(revision.isMinor()); + + result.add(diff); + + // Verification + if (VERIFICATION_DIFF) { + String revC, revP; + try { + revC = String.valueOf(revCurrent); + revP = diff.buildRevision(revPrevious); + + /* + * WRONG LOCATION if (notEqual && MODE_SURROGATES == + * SurrogateModes.REPLACE) { + * + * // TODO: TEST: if (Surrogates.scan(revCurrent)) { + * + * char[] repCurrent = Surrogates.replace(revCurrent); + * char[] repPrevious = Surrogates.replace(revPrevious); + * + * revC = String.valueOf(repCurrent); revP = + * diff.buildRevision(repPrevious); + * + * notEqual = !revC.equals(revP); } } + */ + + if (!revC.equals(revP)) { + + if (MODE_DEBUG_OUTPUT_ACTIVATED) { + WikipediaXMLWriter writer = new WikipediaXMLWriter( + LOGGING_PATH_DIFFTOOL + + LOGGING_PATH_DEBUG + + task.getHeader() + .getArticleName() + + ".dbg"); + + writer.writeRevision(task); + writer.close(); + } + + throw ErrorFactory + .createDiffException( + ErrorKeys.DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, + "Reconstruction of " + + task + + " failed at revision " + + revisionCounter + "."); + } + + // Throw again + } catch (DiffException e) { + throw e; + + // Catch unexpected exceptions + } catch (Exception e) { + throw ErrorFactory + .createDiffException( + ErrorKeys.DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, + "Reconstruction of " + task + + " failed at revision " + + revisionCounter + ".", e); + } + } + } + } + + transmitAtEndOfTask(task, result); + } + + /** + * Generates a Diff by using the CommonLongestSubstring search. 
+ * + * @param revA previous revision + * @param revB current revision + * @return Diff + * @throws UnsupportedEncodingException if the character encoding is unsupported + */ + private Diff generateDiff(final char[] revA, final char[] revB) + throws UnsupportedEncodingException { + + blockCount = 0; + queueA = new ArrayList<>(); + queueB = new ArrayList<>(); + + revABlocked = new boolean[revA.length]; + revBBlocked = new boolean[revB.length]; + + int revAStartIndex = 0, revAEndIndex = revA.length - 1; + int revBStartIndex = 0, revBEndIndex = revB.length - 1; + + while (revAStartIndex <= revAEndIndex && revBStartIndex <= revBEndIndex + && revA[revAStartIndex] == revB[revBStartIndex]) { + + revABlocked[revAStartIndex] = true; + revBBlocked[revBStartIndex] = true; + revAStartIndex++; + revBStartIndex++; + } + + // First Block + if (revAStartIndex != 0) { + queueA.add(new DiffBlock(this.blockCount, 0, revAStartIndex, 0, + revBStartIndex, true)); + queueB.add(new DiffBlock(this.blockCount, 0, revAStartIndex, 0, + revBStartIndex, false)); + this.blockCount++; + } + + while (revAStartIndex < revAEndIndex && revBStartIndex < revBEndIndex + && revA[revAEndIndex] == revB[revBEndIndex]) { + + revABlocked[revAEndIndex] = true; + revBBlocked[revBEndIndex] = true; + revAEndIndex--; + revBEndIndex--; + } + + // Last Block + if (revAEndIndex + 1 != revA.length) { + queueA.add(new DiffBlock(this.blockCount, revAEndIndex + 1, + revA.length, revBEndIndex + 1, revB.length, true)); + queueB.add(new DiffBlock(this.blockCount, revAEndIndex + 1, + revA.length, revBEndIndex + 1, revB.length, false)); + this.blockCount++; + } + + scan(revA, revAStartIndex, revAEndIndex); + + ArrayList list; + char c; + + int i = revBStartIndex; + while (i < revBEndIndex) { + + c = revB[i]; + list = positions.get(c); + + if (list != null && findLongestMatch(revA, list, revB, i)) { + + i += longestMatch_size; + } else { + i++; + } + } + + int j; + for (i = revAStartIndex; i <= revAEndIndex; i++) { + if 
(!revABlocked[i]) { + j = i; + while (i + 1 <= revAEndIndex && !revABlocked[++i]) { } - if (i + 1 > revAEndIndex) { - i++; - } + if (i + 1 > revAEndIndex) { + i++; + } - queueA.add(new DiffBlock(-1, j, i, -1, -1, true)); - } - } + queueA.add(new DiffBlock(-1, j, i, -1, -1, true)); + } + } - for (i = revBStartIndex; i <= revBEndIndex; i++) { - if (!revBBlocked[i]) { - j = i; - while (i + 1 <= revBEndIndex && !revBBlocked[++i]) { + for (i = revBStartIndex; i <= revBEndIndex; i++) { + if (!revBBlocked[i]) { + j = i; + while (i + 1 <= revBEndIndex && !revBBlocked[++i]) { } - if (i + 1 > revBEndIndex) { - i++; - } + if (i + 1 > revBEndIndex) { + i++; + } - queueB.add(new DiffBlock(-1, -1, -1, j, i, false)); - } - } + queueB.add(new DiffBlock(-1, -1, -1, j, i, false)); + } + } - Collections.sort(queueA); - Collections.sort(queueB); + Collections.sort(queueA); + Collections.sort(queueB); - return blocks.manage(revA, revB, queueA, queueB); - } + return blocks.manage(revA, revB, queueA, queueB); + } - /** - * Scans the input and creates the character -> position mapping. - * - * @param input - * character array - * @param start - * start position - * @param end - * end position - */ - private void scan(final char[] input, final int start, final int end) - { + /** + * Scans the input and creates the character -> position mapping. 
+ * + * @param input character array + * @param start start position + * @param end end position + */ + private void scan(final char[] input, final int start, final int end) { - this.positions = new HashMap<>(); - ArrayList list; + this.positions = new HashMap<>(); + ArrayList list; - char c; - for (int i = start; i < end; i++) { - c = input[i]; + char c; + for (int i = start; i < end; i++) { + c = input[i]; list = positions.computeIfAbsent(c, k -> new ArrayList<>()); list.add(i); - } - } - - /** - * Searches the longest common substring - * - * @param revA - * current revision - * @param list - * list of start positions for this substring search - * @param revB - * previous revision - * @param index - * start index previous revision - * - * @return TRUE if a legal substring was found FALSE otherwise - */ - private boolean findLongestMatch(final char[] revA, - final ArrayList list, final char[] revB, final int index) - { - - int match; - longestMatch_size = -1; - - int size = list.size(); - int revAsize = revA.length; - int revBsize = revB.length; - - int start, end, count; - for (int i = 0; i < size; i++) { - - start = list.get(i); - if (!revABlocked[start] && !revBBlocked[index + 1]) { - - count = index + 1; - end = start + 1; - - while (end < revAsize && count < revBsize - && revA[end] == revB[count] && !revABlocked[end] - && !revBBlocked[count]) { - end++; - count++; - } - - match = end - start; - if (match > longestMatch_size) { - longestMatch_size = match; - longestMatch_start = start; - } - } - } - - if (longestMatch_size <= VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING) { - return false; - } - - queueA.add(new DiffBlock(this.blockCount, longestMatch_start, - longestMatch_start + longestMatch_size, index, index - + longestMatch_size, true)); - queueB.add(new DiffBlock(this.blockCount, longestMatch_start, - longestMatch_start + longestMatch_size, index, index - + longestMatch_size, false)); - - blockCount++; - - for (int i = 0, j = longestMatch_start, k = index; i < 
longestMatch_size; i++, j++, k++) { - revABlocked[j] = true; - revBBlocked[k] = true; - } - - return true; - } - - /* - * (non-Javadoc) - * - * @see - * de.tud.ukp.kulessa.delta.consumers.diff.calculation.DiffCalculatorInterface - * #reset() - */ - public void reset() - { - this.result = null; - } + } + } + + /** + * Searches the longest common substring + * + * @param revA current revision + * @param list list of start positions for this substring search + * @param revB previous revision + * @param index start index previous revision + * @return TRUE if a legal substring was found FALSE otherwise + */ + private boolean findLongestMatch(final char[] revA, + final ArrayList list, final char[] revB, final int index) { + + int match; + longestMatch_size = -1; + + int size = list.size(); + int revAsize = revA.length; + int revBsize = revB.length; + + int start, end, count; + for (int i = 0; i < size; i++) { + + start = list.get(i); + if (!revABlocked[start] && !revBBlocked[index + 1]) { + + count = index + 1; + end = start + 1; + + while (end < revAsize && count < revBsize + && revA[end] == revB[count] && !revABlocked[end] + && !revBBlocked[count]) { + end++; + count++; + } + + match = end - start; + if (match > longestMatch_size) { + longestMatch_size = match; + longestMatch_start = start; + } + } + } + + if (longestMatch_size <= VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING) { + return false; + } + + queueA.add(new DiffBlock(this.blockCount, longestMatch_start, + longestMatch_start + longestMatch_size, index, index + + longestMatch_size, true)); + queueB.add(new DiffBlock(this.blockCount, longestMatch_start, + longestMatch_start + longestMatch_size, index, index + + longestMatch_size, false)); + + blockCount++; + + for (int i = 0, j = longestMatch_start, k = index; i < longestMatch_size; i++, j++, k++) { + revABlocked[j] = true; + revBBlocked[k] = true; + } + + return true; + } + + /* + * (non-Javadoc) + * + * @see + * 
de.tud.ukp.kulessa.delta.consumers.diff.calculation.DiffCalculatorInterface + * #reset() + */ + public void reset() { + this.result = null; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/TimedDiffCalculator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/TimedDiffCalculator.java index 872e90a5..c5b45bf0 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/TimedDiffCalculator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/TimedDiffCalculator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,165 +31,152 @@ /** * Calculates the Diff while collecting statistical data. 
- * - * - * */ public class TimedDiffCalculator - extends DiffCalculator -{ - - /** Temporary variable - revision counter */ - private int revisionCounter; - - /** Temporary variable - diff part counter */ - private int diffPartCounter; - - /** Temporary variable - size of the diff */ - private long diffedSize; - - /** Temporary variable - start time of the diff processing */ - private long startTime; - - /** Temporary variable - time used for the diff processing */ - private long processingTimeDiff; - - /** Temporary variable - number of ignored revisions */ - private int ignoredRevisionsCounter; - - /** - * (Constructor) Creates a new DiffCalculator object. - * - * @param taskTransmitter - * Reference to the TaskTransmitter - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - public TimedDiffCalculator(final TaskTransmitterInterface taskTransmitter) - throws ConfigurationException - { - super(taskTransmitter); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Initializes the processing of a new RevisionTask. - * - * @param taskID - * Article ID - */ - protected void initNewTask(final int taskID) - { - - super.initNewTask(taskID); - - this.processingTimeDiff = 0; - - this.revisionCounter = 0; - this.ignoredRevisionsCounter = 0; - - this.diffPartCounter = 0; - this.diffedSize = 0; - } - - /** - * Transmits a partial DiffTask. - * - * @param result - * Reference to the DiffTask - * - * @throws TimeoutException - * if a timeout occurred - */ - protected void transmitPartialTask(final Task result) - throws TimeoutException - { - - this.diffedSize += result.byteSize(); - this.processingTimeDiff += System.currentTimeMillis() - startTime; - - super.transmitPartialTask(result); - - startTime = System.currentTimeMillis(); - } - - /** - * Transmits the DiffTask at the end of the RevisionTask processing. 
- * - * @param task - * Reference to the RevisionTask - * @param result - * Reference to the DiffTask - * - * @throws TimeoutException - * if a timeout occurred - */ - protected void transmitAtEndOfTask(final Task task, - final Task result) - throws TimeoutException - { - - this.processingTimeDiff += System.currentTimeMillis() - startTime; - - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { - - diffedSize += result.byteSize(); - - ArticleInformation info = result.getHeader(); - info.setRevisionCounter(revisionCounter); - info.setIgnoredRevisionsCounter(ignoredRevisionsCounter); - info.setDiffedSize(diffedSize); - info.setDiffPartCounter(diffPartCounter); - info.setProcessingTimeRead(task.getHeader().getProcessingTimeRead()); - info.setProcessingTimeDiff(processingTimeDiff); - } - - super.transmitAtEndOfTask(task, result); - } - - /** - * Calculates the diff for the given revision. - * - * @param revision - * Reference to a revision - * @return Diff - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - */ - protected Diff processRevision(final Revision revision) - throws UnsupportedEncodingException - { - - Diff diff = super.processRevision(revision); - if (diff == null) { - this.ignoredRevisionsCounter++; - } - else { - this.revisionCounter++; - this.diffPartCounter += diff.size(); - } - - return diff; - } - - /*--------------------------------------------------------------------------*/ - - /* - * (non-Javadoc) - * - * @see - * de.tud.ukp.kulessa.delta.consumers.diff.calculation.DiffCalculatorInterface - * #process(de.tud.ukp.kulessa.delta.data.Task) - */ - public void process(final Task task) - throws DiffException, TimeoutException, UnsupportedEncodingException - { - - this.startTime = System.currentTimeMillis(); - super.process(task); - } + extends DiffCalculator { + + /** + * Temporary variable - revision counter + */ + private int revisionCounter; + + /** + * 
Temporary variable - diff part counter + */ + private int diffPartCounter; + + /** + * Temporary variable - size of the diff + */ + private long diffedSize; + + /** + * Temporary variable - start time of the diff processing + */ + private long startTime; + + /** + * Temporary variable - time used for the diff processing + */ + private long processingTimeDiff; + + /** + * Temporary variable - number of ignored revisions + */ + private int ignoredRevisionsCounter; + + /** + * (Constructor) Creates a new DiffCalculator object. + * + * @param taskTransmitter Reference to the TaskTransmitter + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + public TimedDiffCalculator(final TaskTransmitterInterface taskTransmitter) + throws ConfigurationException { + super(taskTransmitter); + } + + /*--------------------------------------------------------------------------*/ + + /** + * Initializes the processing of a new RevisionTask. + * + * @param taskID Article ID + */ + protected void initNewTask(final int taskID) { + + super.initNewTask(taskID); + + this.processingTimeDiff = 0; + + this.revisionCounter = 0; + this.ignoredRevisionsCounter = 0; + + this.diffPartCounter = 0; + this.diffedSize = 0; + } + + /** + * Transmits a partial DiffTask. + * + * @param result Reference to the DiffTask + * @throws TimeoutException if a timeout occurred + */ + protected void transmitPartialTask(final Task result) + throws TimeoutException { + + this.diffedSize += result.byteSize(); + this.processingTimeDiff += System.currentTimeMillis() - startTime; + + super.transmitPartialTask(result); + + startTime = System.currentTimeMillis(); + } + + /** + * Transmits the DiffTask at the end of the RevisionTask processing. 
+ * + * @param task Reference to the RevisionTask + * @param result Reference to the DiffTask + * @throws TimeoutException if a timeout occurred + */ + protected void transmitAtEndOfTask(final Task task, + final Task result) + throws TimeoutException { + + this.processingTimeDiff += System.currentTimeMillis() - startTime; + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + diffedSize += result.byteSize(); + + ArticleInformation info = result.getHeader(); + info.setRevisionCounter(revisionCounter); + info.setIgnoredRevisionsCounter(ignoredRevisionsCounter); + info.setDiffedSize(diffedSize); + info.setDiffPartCounter(diffPartCounter); + info.setProcessingTimeRead(task.getHeader().getProcessingTimeRead()); + info.setProcessingTimeDiff(processingTimeDiff); + } + + super.transmitAtEndOfTask(task, result); + } + + /** + * Calculates the diff for the given revision. + * + * @param revision Reference to a revision + * @return Diff + * @throws UnsupportedEncodingException if the character encoding is unsupported + */ + protected Diff processRevision(final Revision revision) + throws UnsupportedEncodingException { + + Diff diff = super.processRevision(revision); + if (diff == null) { + this.ignoredRevisionsCounter++; + } else { + this.revisionCounter++; + this.diffPartCounter += diff.size(); + } + + return diff; + } + + /*--------------------------------------------------------------------------*/ + + /* + * (non-Javadoc) + * + * @see + * de.tud.ukp.kulessa.delta.consumers.diff.calculation.DiffCalculatorInterface + * #process(de.tud.ukp.kulessa.delta.data.Task) + */ + public void process(final Task task) + throws DiffException, TimeoutException, UnsupportedEncodingException { + + this.startTime = System.currentTimeMillis(); + super.process(task); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/SQLEscape.java 
b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/SQLEscape.java index ea3d8495..69f860da 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/SQLEscape.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/SQLEscape.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,63 +24,63 @@ * Copied from the WikiMachine to avoid having to add dependency. */ public class SQLEscape { - private SQLEscape() { + private SQLEscape() { - } + } - /** - * @see SQLEscape - * @param str unescaped String - * @return String with with escape characters - */ - public static String escape(String str) { - final int len = str.length(); + /** + * @param str unescaped String + * @return String with with escape characters + * @see SQLEscape + */ + public static String escape(String str) { + final int len = str.length(); - // maybe the StringBuffer would be safer? - StringBuilder sql = new StringBuilder(len * 2); + // maybe the StringBuffer would be safer? 
+ StringBuilder sql = new StringBuilder(len * 2); - for (int i = 0; i < len; i++) { - char c = str.charAt(i); - switch (c) { - case '\u0000': - sql.append('\\').append('0'); - break; - case '\n': - sql.append('\\').append('n'); - break; - case '\t': - sql.append('\\').append('t'); - break; - case '\r': - sql.append('\\').append('r'); - break; - case '\u001a': - sql.append('\\').append('Z'); - break; - case '\'': - sql.append('\\').append('\''); - break; - case '\"': - sql.append('\\').append('"'); - break; - case '\b': - sql.append('\\').append('b'); - break; - case '\\': - sql.append('\\').append('\\'); - break; + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + switch (c) { + case '\u0000': + sql.append('\\').append('0'); + break; + case '\n': + sql.append('\\').append('n'); + break; + case '\t': + sql.append('\\').append('t'); + break; + case '\r': + sql.append('\\').append('r'); + break; + case '\u001a': + sql.append('\\').append('Z'); + break; + case '\'': + sql.append('\\').append('\''); + break; + case '\"': + sql.append('\\').append('"'); + break; + case '\b': + sql.append('\\').append('b'); + break; + case '\\': + sql.append('\\').append('\\'); + break; // case '%': // sql.append('[').append('%').append(']'); // break; // case '_': // sql.append('[').append('_').append(']'); // break; - default: - sql.append(c); - break; - } - } - return sql.toString(); - } + default: + sql.append(c); + break; + } + } + return sql.toString(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/WriterInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/WriterInterface.java index 3ec82caa..19ceea2a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/WriterInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/WriterInterface.java 
@@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,43 +27,30 @@ /** * The WriterInterface symbolizes the link to the output writer. - * - * */ -public interface WriterInterface -{ +public interface WriterInterface { - /** - * This method will process the given DiffTask and send him to the specified - * output. - * - * @param task - * DiffTask - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws IOException - * if problems occurred while writing the output (to file or - * archive) - * - * @throws SQLConsumerException - * if problems occurred while writing the output (to the sql - * producer database) - */ - void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException; + /** + * This method will process the given DiffTask and send him to the specified + * output. 
+ * + * @param task DiffTask + * @throws ConfigurationException if problems occurred while initializing the components + * @throws IOException if problems occurred while writing the output (to file or + * archive) + * @throws SQLConsumerException if problems occurred while writing the output (to the sql + * producer database) + */ + void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException; - /** - * This method will close the connection to the output. - * - * @throws IOException - * if problems occurred while closing the file or process. - * - * @throws SQLException - * if problems occurred while closing the connection to the - * database. - */ - void close() - throws IOException, SQLException; + /** + * This method will close the connection to the output. + * + * @throws IOException if problems occurred while closing the file or process. + * @throws SQLException if problems occurred while closing the connection to the + * database. + */ + void close() + throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java index 3aad6540..90ab03c4 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,27 +34,27 @@ /** * Alternative to the SQLEncoder - writes data files instead of UNCOMPRESSED dumps - * */ -public class DataFileEncoder -{ - - /** Reference to the RevisionApi */ - private final RevisionEncoderInterface encoder; - - /** Last used ID of a full revision */ - private int lastFullRevID = -1; - - /** - * (Constructor) Creates a new SQLEncoder object. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - public DataFileEncoder() - throws ConfigurationException - { - this.encoder = new RevisionEncoder(); +public class DataFileEncoder { + + /** + * Reference to the RevisionApi + */ + private final RevisionEncoderInterface encoder; + + /** + * Last used ID of a full revision + */ + private int lastFullRevID = -1; + + /** + * (Constructor) Creates a new SQLEncoder object. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + public DataFileEncoder() + throws ConfigurationException { + this.encoder = new RevisionEncoder(); // tableRevision = "CREATE TABLE IF NOT EXISTS revisions (" // + "PrimaryKey INT UNSIGNED NOT NULL AUTO_INCREMENT, " @@ -72,112 +72,102 @@ public DataFileEncoder() // + "PRIMARY KEY(PrimaryKey)" // + ") TYPE = MyISAM DEFAULT CHARSET utf8 COLLATE utf8_general_ci;"; - } - - /** - * Encodes the diff. 
- * - * @param task - * Reference to the DiffTask - * @param diff - * Diff to encode - * @return Base 64 encoded Diff - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws DecodingException - * if the decoding failed - * @throws EncodingException - * if the encoding failed - * @throws SQLConsumerException - * if an error occurred while encoding the diff - */ - protected String encodeDiff(final Task task, final Diff diff) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException - { - - RevisionCodecData codecData = diff.getCodecData(); - String encoding = encoder.encodeDiff(codecData, diff); - - return encoding; - } - - /* (non-Javadoc) - * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#encodeTask(org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task) - */ - public List encodeTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException - { - - // this.task = task; - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { - - this.lastFullRevID = -1; - } - - int articleId = task.getHeader().getArticleId(); - Diff diff; - - ArrayList list = new ArrayList<>(); - - String tempData; - - int size = task.size(); - for (int i = 0; i < size; i++) { - - diff = task.get(i); - - if (diff.isFullRevision()) { - this.lastFullRevID = diff.getRevisionID(); - } - - /* - * prepare values that might be null - * because we don't want quotes if they are null - * - * Furthermore, escape quote-characters. 
Quotes are used as the "ENCLOSED BY" character - * in MySQL to mark begin and end of Strings - */ - - //prepare values that might be null - //because we don't want quotes if they are null - String comm = diff.getComment(); - String comment = comm==null?"\\N":"\""+escape(comm)+"\""; - - Integer cId = diff.getContributorId(); - String contributorId = cId==null?"\\N":cId.toString(); - - String cName = diff.getContributorName(); - String contributorName = cName==null?"\\N":"\""+escape(cName)+"\""; - - //Prepare the actual data item - tempData = "\\N," - + this.lastFullRevID + "," - + diff.getRevisionCounter() + "," - + diff.getRevisionID()+ "," - + articleId + "," - + diff.getTimeStamp().getTime()+ ",\"" - + encodeDiff(task, diff) + "\"," - + comment+"," - + (diff.isMinor()?"1":"0")+"," - + contributorName+"," - + contributorId+ "," - + (diff.getContributorIsRegistered()?"1":"0"); - - //add item to the list - list.add(tempData); - } - - return list; - } - - private String escape(String str){ - return str.replaceAll("\\\\", "\\\\\\\\").replaceAll("\"", "\\\\\""); - } + } + + /** + * Encodes the diff. 
+ * + * @param task Reference to the DiffTask + * @param diff Diff to encode + * @return Base 64 encoded Diff + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws DecodingException if the decoding failed + * @throws EncodingException if the encoding failed + * @throws SQLConsumerException if an error occurred while encoding the diff + */ + protected String encodeDiff(final Task task, final Diff diff) + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException { + + RevisionCodecData codecData = diff.getCodecData(); + String encoding = encoder.encodeDiff(codecData, diff); + + return encoding; + } + + /* (non-Javadoc) + * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#encodeTask(org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task) + */ + public List encodeTask(final Task task) + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException { + + // this.task = task; + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { + + this.lastFullRevID = -1; + } + + int articleId = task.getHeader().getArticleId(); + Diff diff; + + ArrayList list = new ArrayList<>(); + + String tempData; + + int size = task.size(); + for (int i = 0; i < size; i++) { + + diff = task.get(i); + + if (diff.isFullRevision()) { + this.lastFullRevID = diff.getRevisionID(); + } + + /* + * prepare values that might be null + * because we don't want quotes if they are null + * + * Furthermore, escape quote-characters. 
Quotes are used as the "ENCLOSED BY" character + * in MySQL to mark begin and end of Strings + */ + + //prepare values that might be null + //because we don't want quotes if they are null + String comm = diff.getComment(); + String comment = comm == null ? "\\N" : "\"" + escape(comm) + "\""; + + Integer cId = diff.getContributorId(); + String contributorId = cId == null ? "\\N" : cId.toString(); + + String cName = diff.getContributorName(); + String contributorName = cName == null ? "\\N" : "\"" + escape(cName) + "\""; + + //Prepare the actual data item + tempData = "\\N," + + this.lastFullRevID + "," + + diff.getRevisionCounter() + "," + + diff.getRevisionID() + "," + + articleId + "," + + diff.getTimeStamp().getTime() + ",\"" + + encodeDiff(task, diff) + "\"," + + comment + "," + + (diff.isMinor() ? "1" : "0") + "," + + contributorName + "," + + contributorId + "," + + (diff.getContributorIsRegistered() ? "1" : "0"); + + //add item to the list + list.add(tempData); + } + + return list; + } + + private String escape(String str) { + return str.replaceAll("\\\\", "\\\\\\\\").replaceAll("\"", "\\\\\""); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoder.java index 226e2ece..e4892ab1 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoder.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -45,479 +45,470 @@ /** * This creates the SQL statements - * */ public class SQLEncoder - implements SQLEncoderInterface -{ - - /** UNCOMPRESSED Statement for tables containing binary encoded diff information */ - private final String binaryTableRevision; - - /** Reference to the RevisionApi */ - private final RevisionEncoderInterface encoder; - - /** Last used ID of a full revision */ - private int lastFullRevID = -1; - - /** Configuration parameter - Maximum size of a sql statement */ - private final long LIMIT_SQL_STATEMENT_SIZE; - - /** Reference to the logger */ - private final Logger logger; - - /** Configuration parameter - Path for the debug logger */ - private final String LOGGING_PATH_DEBUG; - - /** Configuration parameter - Path for the DiffTool logger */ - private final String LOGGING_PATH_DIFFTOOL; - - /** - * Configuration parameter - Flag, which indicates whether debug output is - * enabled or not - */ - private final boolean MODE_DEBUG_OUTPUT_ACTIVATED; - - /** Configuration parameter - Surrogate Mode */ - private final SurrogateModes MODE_SURROGATES; - - /** UNCOMPRESSED Statement for tables containing base 64 encoded diff information */ - private final String tableRevision; - - /** - * Configuration parameter - Flag, which indicates whether the verification - * of the encoding is enabled or not - */ - private final boolean VERIFICATION_ENCODING; - - /** Configuration Parameter - Wikipedia Encoding */ - private final String WIKIPEDIA_ENCODING; - - /** - * (Constructor) Creates a new 
SQLEncoder object. - * - * @param logger - * Reference to the logger - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public SQLEncoder(final Logger logger) - throws ConfigurationException, LoggingException - { - - this.logger = logger; - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - MODE_DEBUG_OUTPUT_ACTIVATED = (Boolean) config - .getConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT); - - VERIFICATION_ENCODING = (Boolean) config - .getConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING); - - LOGGING_PATH_DIFFTOOL = (String) config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); - - LOGGING_PATH_DEBUG = (String) config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); - - LIMIT_SQL_STATEMENT_SIZE = (Long) config - .getConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET); - - MODE_SURROGATES = (SurrogateModes) config - .getConfigParameter(ConfigurationKeys.MODE_SURROGATES); - - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - this.encoder = new RevisionEncoder(); - - tableRevision = "CREATE TABLE IF NOT EXISTS revisions (" - + "PrimaryKey INT UNSIGNED NOT NULL AUTO_INCREMENT, " - + "FullRevisionID INTEGER UNSIGNED NOT NULL, " - + "RevisionCounter INTEGER UNSIGNED NOT NULL, " - + "RevisionID INTEGER UNSIGNED NOT NULL, " - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "Timestamp BIGINT NOT NULL, " - + "Revision MEDIUMTEXT NOT NULL, " - + "Comment MEDIUMTEXT, " - + "Minor TINYINT NOT NULL, " - + "ContributorName TEXT NOT NULL, " - + "ContributorId INTEGER UNSIGNED, " - + "ContributorIsRegistered TINYINT NOT NULL, " - + "PRIMARY KEY(PrimaryKey)" - + ") TYPE = MyISAM DEFAULT CHARSET utf8 COLLATE utf8_general_ci;"; - - binaryTableRevision = "CREATE TABLE IF NOT EXISTS revisions (" - 
+ "PrimaryKey INT UNSIGNED NOT NULL AUTO_INCREMENT, " - + "FullRevisionID INTEGER UNSIGNED NOT NULL, " - + "RevisionCounter INTEGER UNSIGNED NOT NULL, " - + "RevisionID INTEGER UNSIGNED NOT NULL, " - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "Timestamp BIGINT NOT NULL, " - + "Revision MEDIUMBLOB NOT NULL," - + "Comment MEDIUMTEXT, " - + "Minor TINYINT NOT NULL, " - + "ContributorName TEXT NOT NULL, " - + "ContributorId INTEGER UNSIGNED, " - + "ContributorIsRegistered TINYINT NOT NULL, " - + "PRIMARY KEY(PrimaryKey)" - + ") TYPE = MyISAM DEFAULT CHARSET utf8 COLLATE utf8_general_ci;"; - - } - - /** - * @param task - * @param diff - * @return - * @throws ConfigurationException - * @throws UnsupportedEncodingException - * @throws DecodingException - * @throws EncodingException - * @throws SQLConsumerException - */ - protected byte[] binaryDiff(final Task task, final Diff diff) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException - { - - RevisionCodecData codecData = diff.getCodecData(); - byte[] encoding = encoder.binaryDiff(codecData, diff); - - if (VERIFICATION_ENCODING) { - RevisionDecoder decoder = new RevisionDecoder(WIKIPEDIA_ENCODING); - decoder.setInput(encoding); - Diff decDiff = decoder.decode(); - - verify(task, decDiff, diff); - } - - return encoding; - } - - /* (non-Javadoc) - * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#binaryTask(org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task) - */ - @Override - public SQLEncoding[] binaryTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException - { - - // this.task = task; - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { - - this.lastFullRevID = -1; - } - - int articleId = task.getHeader().getArticleId(); - Diff diff; - - ArrayList list = new 
ArrayList<>(); - - SQLEncoding revisionsEncoding = new SQLEncoding(); - SQLEncoding usersEncoding = new SQLEncoding(); - revisionsEncoding.append("INSERT INTO revisions VALUES"); - usersEncoding.append("INSERT INTO users VALUES"); - - byte[] tempBinaryData; - String tempData; - - int size = task.size(); - for (int i = 0; i < size; i++) { - diff = task.get(i); - - /* - * Process revision table - */ - if (diff.isFullRevision()) { - this.lastFullRevID = diff.getRevisionID(); - } - - //prepare values that might be null - //because we don't want quotes if they are null - String comm = diff.getComment(); - String comment = comm==null?null:"'"+comm+"'"; - - Integer cId = diff.getContributorId(); - String contributorId = cId==null?null:cId.toString(); - - // save the query and binary data temporary - tempData = "(null, " + this.lastFullRevID + "," - + diff.getRevisionCounter() + "," + diff.getRevisionID() - + "," + articleId + "," + diff.getTimeStamp().getTime() - + ",?,"+comment+","+(diff.isMinor()?"1":"0")+","+contributorId+ ","+(diff.getContributorIsRegistered()?"1":"0")+")"; - tempBinaryData = binaryDiff(task, diff); - - // if the limit would be reached start a new encoding - if ((revisionsEncoding.byteSize() + tempBinaryData.length + tempData.length() >= LIMIT_SQL_STATEMENT_SIZE) && (i!=0)) { - revisionsEncoding.append(";"); - list.add(revisionsEncoding); - - revisionsEncoding = new SQLEncoding(); - revisionsEncoding.append("INSERT INTO revisions VALUES"); - } - - if (revisionsEncoding.size() > 0) { - revisionsEncoding.append(","); - } - revisionsEncoding.append(tempData); - revisionsEncoding.addBinaryData(tempBinaryData); - - } - - // Add the pending encoding - if (revisionsEncoding.size() > 0) { - revisionsEncoding.append(";"); - list.add(revisionsEncoding); - } - - - // Transform the list into an array - SQLEncoding[] queries = new SQLEncoding[list.size()]; - return list.toArray(queries); - } - - /** - * Encodes the diff. 
- * - * @param task - * Reference to the DiffTask - * @param diff - * Diff to encode - * @return Base 64 encoded Diff - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws DecodingException - * if the decoding failed - * @throws EncodingException - * if the encoding failed - * @throws SQLConsumerException - * if an error occurred while encoding the diff - */ - protected String encodeDiff(final Task task, final Diff diff) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException - { - - RevisionCodecData codecData = diff.getCodecData(); - String encoding = encoder.encodeDiff(codecData, diff); - - if (VERIFICATION_ENCODING) { - RevisionDecoder decoder = new RevisionDecoder(WIKIPEDIA_ENCODING); - decoder.setInput(encoding); - Diff decDiff = decoder.decode(); - - verify(task, decDiff, diff); - } - - return encoding; - } - - /* (non-Javadoc) - * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#encodeTask(org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task) - */ - @Override - public SQLEncoding[] encodeTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException - { - - // this.task = task; - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { - - this.lastFullRevID = -1; - } - - int articleId = task.getHeader().getArticleId(); - Diff diff; - - ArrayList list = new ArrayList<>(); - - SQLEncoding revisionEncoding = new SQLEncoding(); - revisionEncoding.append("INSERT INTO revisions VALUES"); - - String tempData; - - int size = task.size(); - for (int i = 0; i < size; i++) { - - diff = task.get(i); - - /* - * Process revision table - */ - if (diff.isFullRevision()) { - this.lastFullRevID = 
diff.getRevisionID(); - } - - //prepare values that might be null - //because we don't want quotes if they are null - String comm = diff.getComment(); - String comment = comm==null?null:"'"+comm+"'"; - - Integer cId = diff.getContributorId(); - String contributorId = cId==null?null:cId.toString(); - - // save the query temporary - tempData = "(null," + this.lastFullRevID + "," - + diff.getRevisionCounter() + "," + diff.getRevisionID() - + "," + articleId + "," + diff.getTimeStamp().getTime() - + ",'" + encodeDiff(task, diff) + "',"+comment+","+(diff.isMinor()?"1":"0")+",'"+diff.getContributorName() +"',"+contributorId+ ","+(diff.getContributorIsRegistered()?"1":"0")+")"; - - // if the limit would be reached start a new encoding - if ((revisionEncoding.byteSize() + tempData.length() >= LIMIT_SQL_STATEMENT_SIZE) && (i!=0)) { - revisionEncoding.append(";"); - list.add(revisionEncoding); - - revisionEncoding = new SQLEncoding(); - revisionEncoding.append("INSERT INTO revisions VALUES"); - } - - if (revisionEncoding.byteSize() > 30) { - revisionEncoding.append(","); - } - revisionEncoding.append(tempData); - - } - - // Add the pending encodings - if (revisionEncoding.byteSize() > 30) { - revisionEncoding.append(";"); - list.add(revisionEncoding); - } - - // Transform the list into an array - SQLEncoding[] queries = new SQLEncoding[list.size()]; - return list.toArray(queries); - } - - /* (non-Javadoc) - * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#getBinaryTable() - */ - @Override - public String[] getBinaryTable() - { - return new String[] { binaryTableRevision }; - } - - /* (non-Javadoc) - * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#getTable() - */ - @Override - public String[] getTable() - { - return new String[] { tableRevision }; - } - - - - /** - * Verifies that the decoded diff is identical to the original diff. 
- * - * @param task - * DiffTask - * @param decodedDiff - * diff created from encoding the decoded diff information - * @param originalDiff - * original diff - * - * @throws SQLConsumerException - * if an error occurs - */ - private void verify(final Task task, final Diff decodedDiff, - final Diff originalDiff) - throws SQLConsumerException - { - - String orig = originalDiff.toString(); - String deco = decodedDiff.toString(); - - boolean notEqual = !orig.equals(deco); - - if (notEqual && MODE_SURROGATES == SurrogateModes.REPLACE) { - - char[] origDiff = orig.toCharArray(); - - // TODO: test - if (Surrogates.scan(origDiff)) { - - String repDiff = new String(Surrogates.replace(origDiff)); - notEqual = !repDiff.equals(deco); - } - } - - if (notEqual) { - - if (MODE_DEBUG_OUTPUT_ACTIVATED) { - - try { - // System.out.println("DEBUG\t" + task.toString()); - - WikipediaXMLWriter writer = new WikipediaXMLWriter( - LOGGING_PATH_DIFFTOOL + LOGGING_PATH_DEBUG - + task.getHeader().getArticleName() - + ".dbg"); - - switch (task.getTaskType()) { - case TASK_FULL: - case TASK_PARTIAL_FIRST: - writer.writeDiff(task); - break; - - case TASK_PARTIAL: - case TASK_PARTIAL_LAST: { - - int revCount = originalDiff.getRevisionCounter(); - Diff d; - boolean fullRev = false; - - for (int diffCount = 0; !fullRev - && diffCount < originalDiff.size(); diffCount++) { - - d = task.get(diffCount); - if (d.getRevisionCounter() <= revCount - && d.isFullRevision()) { - fullRev = true; - writer.writeDiff(task, diffCount); - } - } - - if (!fullRev) { - writer.writeDiffFile(task); - } - - } - break; - default: - throw new IOException("Unknown TaskType"); - // TODO: Debug output - } - - writer.close(); - } - catch (IOException e) { - ConsumerLogMessages.logException(logger, e); - } - } - - throw ErrorFactory - .createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_ENCODING_VERIFICATION_FAILED, - "Redecoding of " - + task.getHeader().getArticleName() - + " failed at revision " - + 
originalDiff.getRevisionCounter() + "."); - } - } + implements SQLEncoderInterface { + + /** + * UNCOMPRESSED Statement for tables containing binary encoded diff information + */ + private final String binaryTableRevision; + + /** + * Reference to the RevisionApi + */ + private final RevisionEncoderInterface encoder; + + /** + * Last used ID of a full revision + */ + private int lastFullRevID = -1; + + /** + * Configuration parameter - Maximum size of a sql statement + */ + private final long LIMIT_SQL_STATEMENT_SIZE; + + /** + * Reference to the logger + */ + private final Logger logger; + + /** + * Configuration parameter - Path for the debug logger + */ + private final String LOGGING_PATH_DEBUG; + + /** + * Configuration parameter - Path for the DiffTool logger + */ + private final String LOGGING_PATH_DIFFTOOL; + + /** + * Configuration parameter - Flag, which indicates whether debug output is + * enabled or not + */ + private final boolean MODE_DEBUG_OUTPUT_ACTIVATED; + + /** + * Configuration parameter - Surrogate Mode + */ + private final SurrogateModes MODE_SURROGATES; + + /** + * UNCOMPRESSED Statement for tables containing base 64 encoded diff information + */ + private final String tableRevision; + + /** + * Configuration parameter - Flag, which indicates whether the verification + * of the encoding is enabled or not + */ + private final boolean VERIFICATION_ENCODING; + + /** + * Configuration Parameter - Wikipedia Encoding + */ + private final String WIKIPEDIA_ENCODING; + + /** + * (Constructor) Creates a new SQLEncoder object. 
+ * + * @param logger Reference to the logger + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public SQLEncoder(final Logger logger) + throws ConfigurationException, LoggingException { + + this.logger = logger; + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + MODE_DEBUG_OUTPUT_ACTIVATED = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT); + + VERIFICATION_ENCODING = (Boolean) config + .getConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING); + + LOGGING_PATH_DIFFTOOL = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); + + LOGGING_PATH_DEBUG = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); + + LIMIT_SQL_STATEMENT_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET); + + MODE_SURROGATES = (SurrogateModes) config + .getConfigParameter(ConfigurationKeys.MODE_SURROGATES); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + this.encoder = new RevisionEncoder(); + + tableRevision = "CREATE TABLE IF NOT EXISTS revisions (" + + "PrimaryKey INT UNSIGNED NOT NULL AUTO_INCREMENT, " + + "FullRevisionID INTEGER UNSIGNED NOT NULL, " + + "RevisionCounter INTEGER UNSIGNED NOT NULL, " + + "RevisionID INTEGER UNSIGNED NOT NULL, " + + "ArticleID INTEGER UNSIGNED NOT NULL, " + + "Timestamp BIGINT NOT NULL, " + + "Revision MEDIUMTEXT NOT NULL, " + + "Comment MEDIUMTEXT, " + + "Minor TINYINT NOT NULL, " + + "ContributorName TEXT NOT NULL, " + + "ContributorId INTEGER UNSIGNED, " + + "ContributorIsRegistered TINYINT NOT NULL, " + + "PRIMARY KEY(PrimaryKey)" + + ") TYPE = MyISAM DEFAULT CHARSET utf8 COLLATE utf8_general_ci;"; + + binaryTableRevision = "CREATE TABLE IF NOT EXISTS revisions (" + + "PrimaryKey INT UNSIGNED NOT NULL 
AUTO_INCREMENT, " + + "FullRevisionID INTEGER UNSIGNED NOT NULL, " + + "RevisionCounter INTEGER UNSIGNED NOT NULL, " + + "RevisionID INTEGER UNSIGNED NOT NULL, " + + "ArticleID INTEGER UNSIGNED NOT NULL, " + + "Timestamp BIGINT NOT NULL, " + + "Revision MEDIUMBLOB NOT NULL," + + "Comment MEDIUMTEXT, " + + "Minor TINYINT NOT NULL, " + + "ContributorName TEXT NOT NULL, " + + "ContributorId INTEGER UNSIGNED, " + + "ContributorIsRegistered TINYINT NOT NULL, " + + "PRIMARY KEY(PrimaryKey)" + + ") TYPE = MyISAM DEFAULT CHARSET utf8 COLLATE utf8_general_ci;"; + + } + + /** + * @param task + * @param diff + * @return + * @throws ConfigurationException + * @throws UnsupportedEncodingException + * @throws DecodingException + * @throws EncodingException + * @throws SQLConsumerException + */ + protected byte[] binaryDiff(final Task task, final Diff diff) + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException { + + RevisionCodecData codecData = diff.getCodecData(); + byte[] encoding = encoder.binaryDiff(codecData, diff); + + if (VERIFICATION_ENCODING) { + RevisionDecoder decoder = new RevisionDecoder(WIKIPEDIA_ENCODING); + decoder.setInput(encoding); + Diff decDiff = decoder.decode(); + + verify(task, decDiff, diff); + } + + return encoding; + } + + /* (non-Javadoc) + * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#binaryTask(org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task) + */ + @Override + public SQLEncoding[] binaryTask(final Task task) + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException { + + // this.task = task; + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { + + this.lastFullRevID = -1; + } + + int articleId = task.getHeader().getArticleId(); + Diff diff; + + ArrayList list = new ArrayList<>(); + + SQLEncoding revisionsEncoding 
= new SQLEncoding(); + SQLEncoding usersEncoding = new SQLEncoding(); + revisionsEncoding.append("INSERT INTO revisions VALUES"); + usersEncoding.append("INSERT INTO users VALUES"); + + byte[] tempBinaryData; + String tempData; + + int size = task.size(); + for (int i = 0; i < size; i++) { + diff = task.get(i); + + /* + * Process revision table + */ + if (diff.isFullRevision()) { + this.lastFullRevID = diff.getRevisionID(); + } + + //prepare values that might be null + //because we don't want quotes if they are null + String comm = diff.getComment(); + String comment = comm == null ? null : "'" + comm + "'"; + + Integer cId = diff.getContributorId(); + String contributorId = cId == null ? null : cId.toString(); + + // save the query and binary data temporary + tempData = "(null, " + this.lastFullRevID + "," + + diff.getRevisionCounter() + "," + diff.getRevisionID() + + "," + articleId + "," + diff.getTimeStamp().getTime() + + ",?," + comment + "," + (diff.isMinor() ? "1" : "0") + "," + contributorId + "," + (diff.getContributorIsRegistered() ? "1" : "0") + ")"; + tempBinaryData = binaryDiff(task, diff); + + // if the limit would be reached start a new encoding + if ((revisionsEncoding.byteSize() + tempBinaryData.length + tempData.length() >= LIMIT_SQL_STATEMENT_SIZE) && (i != 0)) { + revisionsEncoding.append(";"); + list.add(revisionsEncoding); + + revisionsEncoding = new SQLEncoding(); + revisionsEncoding.append("INSERT INTO revisions VALUES"); + } + + if (revisionsEncoding.size() > 0) { + revisionsEncoding.append(","); + } + revisionsEncoding.append(tempData); + revisionsEncoding.addBinaryData(tempBinaryData); + + } + + // Add the pending encoding + if (revisionsEncoding.size() > 0) { + revisionsEncoding.append(";"); + list.add(revisionsEncoding); + } + + + // Transform the list into an array + SQLEncoding[] queries = new SQLEncoding[list.size()]; + return list.toArray(queries); + } + + /** + * Encodes the diff. 
+ * + * @param task Reference to the DiffTask + * @param diff Diff to encode + * @return Base 64 encoded Diff + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws DecodingException if the decoding failed + * @throws EncodingException if the encoding failed + * @throws SQLConsumerException if an error occurred while encoding the diff + */ + protected String encodeDiff(final Task task, final Diff diff) + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException { + + RevisionCodecData codecData = diff.getCodecData(); + String encoding = encoder.encodeDiff(codecData, diff); + + if (VERIFICATION_ENCODING) { + RevisionDecoder decoder = new RevisionDecoder(WIKIPEDIA_ENCODING); + decoder.setInput(encoding); + Diff decDiff = decoder.decode(); + + verify(task, decDiff, diff); + } + + return encoding; + } + + /* (non-Javadoc) + * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#encodeTask(org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task) + */ + @Override + public SQLEncoding[] encodeTask(final Task task) + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException { + + // this.task = task; + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { + + this.lastFullRevID = -1; + } + + int articleId = task.getHeader().getArticleId(); + Diff diff; + + ArrayList list = new ArrayList<>(); + + SQLEncoding revisionEncoding = new SQLEncoding(); + revisionEncoding.append("INSERT INTO revisions VALUES"); + + String tempData; + + int size = task.size(); + for (int i = 0; i < size; i++) { + + diff = task.get(i); + + /* + * Process revision table + */ + if (diff.isFullRevision()) { + this.lastFullRevID = diff.getRevisionID(); + } + + //prepare 
values that might be null + //because we don't want quotes if they are null + String comm = diff.getComment(); + String comment = comm == null ? null : "'" + comm + "'"; + + Integer cId = diff.getContributorId(); + String contributorId = cId == null ? null : cId.toString(); + + // save the query temporary + tempData = "(null," + this.lastFullRevID + "," + + diff.getRevisionCounter() + "," + diff.getRevisionID() + + "," + articleId + "," + diff.getTimeStamp().getTime() + + ",'" + encodeDiff(task, diff) + "'," + comment + "," + (diff.isMinor() ? "1" : "0") + ",'" + diff.getContributorName() + "'," + contributorId + "," + (diff.getContributorIsRegistered() ? "1" : "0") + ")"; + + // if the limit would be reached start a new encoding + if ((revisionEncoding.byteSize() + tempData.length() >= LIMIT_SQL_STATEMENT_SIZE) && (i != 0)) { + revisionEncoding.append(";"); + list.add(revisionEncoding); + + revisionEncoding = new SQLEncoding(); + revisionEncoding.append("INSERT INTO revisions VALUES"); + } + + if (revisionEncoding.byteSize() > 30) { + revisionEncoding.append(","); + } + revisionEncoding.append(tempData); + + } + + // Add the pending encodings + if (revisionEncoding.byteSize() > 30) { + revisionEncoding.append(";"); + list.add(revisionEncoding); + } + + // Transform the list into an array + SQLEncoding[] queries = new SQLEncoding[list.size()]; + return list.toArray(queries); + } + + /* (non-Javadoc) + * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#getBinaryTable() + */ + @Override + public String[] getBinaryTable() { + return new String[]{binaryTableRevision}; + } + + /* (non-Javadoc) + * @see org.dkpro.jwpl.revisionmachine.difftool.consumer.dump.codec.SQLEncoderInterface#getTable() + */ + @Override + public String[] getTable() { + return new String[]{tableRevision}; + } + + + /** + * Verifies that the decoded diff is identical to the original diff. 
+ * + * @param task DiffTask + * @param decodedDiff diff created from encoding the decoded diff information + * @param originalDiff original diff + * @throws SQLConsumerException if an error occurs + */ + private void verify(final Task task, final Diff decodedDiff, + final Diff originalDiff) + throws SQLConsumerException { + + String orig = originalDiff.toString(); + String deco = decodedDiff.toString(); + + boolean notEqual = !orig.equals(deco); + + if (notEqual && MODE_SURROGATES == SurrogateModes.REPLACE) { + + char[] origDiff = orig.toCharArray(); + + // TODO: test + if (Surrogates.scan(origDiff)) { + + String repDiff = new String(Surrogates.replace(origDiff)); + notEqual = !repDiff.equals(deco); + } + } + + if (notEqual) { + + if (MODE_DEBUG_OUTPUT_ACTIVATED) { + + try { + // System.out.println("DEBUG\t" + task.toString()); + + WikipediaXMLWriter writer = new WikipediaXMLWriter( + LOGGING_PATH_DIFFTOOL + LOGGING_PATH_DEBUG + + task.getHeader().getArticleName() + + ".dbg"); + + switch (task.getTaskType()) { + case TASK_FULL: + case TASK_PARTIAL_FIRST: + writer.writeDiff(task); + break; + + case TASK_PARTIAL: + case TASK_PARTIAL_LAST: { + + int revCount = originalDiff.getRevisionCounter(); + Diff d; + boolean fullRev = false; + + for (int diffCount = 0; !fullRev + && diffCount < originalDiff.size(); diffCount++) { + + d = task.get(diffCount); + if (d.getRevisionCounter() <= revCount + && d.isFullRevision()) { + fullRev = true; + writer.writeDiff(task, diffCount); + } + } + + if (!fullRev) { + writer.writeDiffFile(task); + } + + } + break; + default: + throw new IOException("Unknown TaskType"); + // TODO: Debug output + } + + writer.close(); + } catch (IOException e) { + ConsumerLogMessages.logException(logger, e); + } + } + + throw ErrorFactory + .createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_ENCODING_VERIFICATION_FAILED, + "Redecoding of " + + task.getHeader().getArticleName() + + " failed at revision " + + originalDiff.getRevisionCounter() + 
"."); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoderInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoderInterface.java index 4be4a63f..c215e430 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoderInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoderInterface.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,91 +29,64 @@ /** * The SQLEncoderInterface provides the link to the SQLEncoder who will define * the formatting of the output. - * - * */ -public interface SQLEncoderInterface -{ +public interface SQLEncoderInterface { - /** - * Returns the tables for textual output. - *

- * Each Array entry will contain a single sql command. - * - * @return sql command to create the tables - */ + /** + * Returns the tables for textual output. + *

+ * Each Array entry will contain a single sql command. + * + * @return sql command to create the tables + */ String[] getTable(); - /** - * Returns the tables for binary output. - *

- * Each Array entry will contain a single sql command. - * - * @return sql command to create the tables - */ + /** + * Returns the tables for binary output. + *

+ * Each Array entry will contain a single sql command. + * + * @return sql command to create the tables + */ String[] getBinaryTable(); - /** - * Returns the binary encoding of the given DiffTask. - *

- * Each Array entry will contain a single sql command. - * - * @param task - * DiffTask - * @return binary encoding of the task. - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws UnsupportedEncodingException - * if the CharacterSet defined in the configuration is not - * supported by JAVA. - * - * @throws DecodingException - * if the decoding process fails (during the verification - * process) - * - * @throws EncodingException - * if the encoding process fails - * - * @throws SQLConsumerException - * if the verification process fails - * - */ + /** + * Returns the binary encoding of the given DiffTask. + *

+ * Each Array entry will contain a single sql command. + * + * @param task DiffTask + * @return binary encoding of the task. + * @throws ConfigurationException if problems occurred while initializing the components + * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not + * supported by JAVA. + * @throws DecodingException if the decoding process fails (during the verification + * process) + * @throws EncodingException if the encoding process fails + * @throws SQLConsumerException if the verification process fails + */ SQLEncoding[] binaryTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException; + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException; - /** - * Returns the textual encoding of the given DiffTask. - *

- * Each Array entry will contain a single sql command. - * - * @param task - * DiffTask - * @return binary encoding of the task. - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws UnsupportedEncodingException - * if the CharacterSet defined in the configuration is not - * supported by JAVA. - * - * @throws DecodingException - * if the decoding process fails (during the verification - * process) - * - * @throws EncodingException - * if the encoding process fails - * - * @throws SQLConsumerException - * if the verification process fails - * - */ + /** + * Returns the textual encoding of the given DiffTask. + *

+ * Each Array entry will contain a single sql command. + * + * @param task DiffTask + * @return binary encoding of the task. + * @throws ConfigurationException if problems occurred while initializing the components + * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not + * supported by JAVA. + * @throws DecodingException if the decoding process fails (during the verification + * process) + * @throws EncodingException if the encoding process fails + * @throws SQLConsumerException if the verification process fails + */ SQLEncoding[] encodeTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException; + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoding.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoding.java index 6efdb54b..0afff380 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoding.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoding.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,121 +22,111 @@ /** * This class is used to stored the sql statements. - * - * - * */ -public class SQLEncoding -{ - - /** UNCOMPRESSED Query */ - private final StringBuilder query; - - /** List of binary data */ - private final List list; - - /** Size of binary data */ - private int binaryDataSize; - - /** - * (Constructor) Creates a new SQLEncoding object. - */ - public SQLEncoding() - { - this.query = new StringBuilder(); - this.list = new ArrayList<>(); - this.binaryDataSize = 0; - } - - /** - * Appends textual content to the query. - * - * @param seq - * textual content - */ - public void append(final CharSequence seq) - { - this.query.append(seq); - } - - /** - * Appends binary data to storage. - * - * @param bData - * binary data - */ - public void addBinaryData(final byte[] bData) - { - this.binaryDataSize += bData.length; - this.list.add(bData); - } - - /** - * Returns the size of the query. - * - * @return size of the query - */ - public int byteSize() - { - return this.binaryDataSize + this.query.length(); - } - - /** - * Returns the number of contained binary data parts. - * - * @return number of binary data parts - */ - public int size() - { - return this.list.size(); - } - - /** - * Returns the specified binary data. - * - * @param index - * index of the binary data - * @return binary data - */ - public byte[] getBinaryData(final int index) - { - return list.get(index); - } - - /** - * Returns the query. - * - * @return query - */ - public String getQuery() - { - return query.toString(); - } - - /** - * Returns the string representation of this object. 
- * - * @return string representation - */ - public String toString() - { - - try { - StringBuilder buffer = new StringBuilder(); - - buffer.append(query + "\r\n\r\n"); - - for (int i = 0; i < list.size(); i++) { - buffer.append(i + "\t" + list.get(i).length + "\r\n"); - } - - return buffer.toString(); - - } - catch (Exception e) { - - } - - return "<" + list.size() + ">\r\n" + query; - } +public class SQLEncoding { + + /** + * UNCOMPRESSED Query + */ + private final StringBuilder query; + + /** + * List of binary data + */ + private final List list; + + /** + * Size of binary data + */ + private int binaryDataSize; + + /** + * (Constructor) Creates a new SQLEncoding object. + */ + public SQLEncoding() { + this.query = new StringBuilder(); + this.list = new ArrayList<>(); + this.binaryDataSize = 0; + } + + /** + * Appends textual content to the query. + * + * @param seq textual content + */ + public void append(final CharSequence seq) { + this.query.append(seq); + } + + /** + * Appends binary data to storage. + * + * @param bData binary data + */ + public void addBinaryData(final byte[] bData) { + this.binaryDataSize += bData.length; + this.list.add(bData); + } + + /** + * Returns the size of the query. + * + * @return size of the query + */ + public int byteSize() { + return this.binaryDataSize + this.query.length(); + } + + /** + * Returns the number of contained binary data parts. + * + * @return number of binary data parts + */ + public int size() { + return this.list.size(); + } + + /** + * Returns the specified binary data. + * + * @param index index of the binary data + * @return binary data + */ + public byte[] getBinaryData(final int index) { + return list.get(index); + } + + /** + * Returns the query. + * + * @return query + */ + public String getQuery() { + return query.toString(); + } + + /** + * Returns the string representation of this object. 
+ * + * @return string representation + */ + public String toString() { + + try { + StringBuilder buffer = new StringBuilder(); + + buffer.append(query + "\r\n\r\n"); + + for (int i = 0; i < list.size(); i++) { + buffer.append(i + "\t" + list.get(i).length + "\r\n"); + } + + return buffer.toString(); + + } catch (Exception e) { + + } + + return "<" + list.size() + ">\r\n" + query; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java index 5ee8df51..5d4c582c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,143 +30,120 @@ /** * This class encodes the diffs while collecting statistical information. 
- * - * - * */ public class TimedSQLEncoder - extends SQLEncoder -{ - - /** - * Temporary variable - used for storing the encoded size - */ - private long encodedSize; - - /** - * Temporary variable - used for storing the encoded sql size - */ - private long encodedSQLSize; - - /** - * (Constructor) Creates a new TimedSQLEncoder object. - * - * @param logger - * Reference to the logger - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public TimedSQLEncoder(final Logger logger) - throws ConfigurationException, LoggingException - { - super(logger); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Initializes the encoding information. - */ - public void init() - { - this.encodedSize = 0; - this.encodedSQLSize = 0; - } - - /** - * Returns the encoded size. - * - * @return encoded size - */ - public long getEncodedSize() - { - return encodedSize; - } - - /** - * Returns the encoded sql size. - * - * @return encoded sql size - */ - public long getEncodedSQLSize() - { - return encodedSQLSize; - } - - /*--------------------------------------------------------------------------*/ - - /** - * Encodes the diff. 
- * - * @param task - * Reference to the DiffTask - * @param diff - * Diff to encode - * @return Base 64 encoded Diff - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws DecodingException - * if the decoding failed - * @throws EncodingException - * if the encoding failed - * @throws SQLConsumerException - * if an error occurred while encoding the diff - */ - protected String encodeDiff(final Task task, final Diff diff) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException - { - - String encoding = super.encodeDiff(task, diff); - - this.encodedSize += encoding.length(); - - return encoding; - } - - /* - * (non-Javadoc) - * - * @see - * de.tud.ukp.kulessa.delta.consumers.sql.codec.SQLEncodrInterface#binaryTask - * (de.tudarmstadt.ukp.kulessa.delta.data.tasks.Task) - */ - protected byte[] binaryDiff(final Task task, final Diff diff) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException - { - - byte[] encoding = super.binaryDiff(task, diff); - - this.encodedSize += encoding.length; - - return encoding; - } - - /* - * (non-Javadoc) - * - * @see - * de.tudarmstadt.ukp.kulessa.delta.consumers.sql.codec.SQLEncodrInterface - * #encodeTask(de.tudarmstadt.ukp.kulessa.delta.data.tasks.Task) - */ - public SQLEncoding[] encodeTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException - { - - SQLEncoding[] encoding = super.encodeTask(task); - - for (SQLEncoding sql : encoding) { - this.encodedSQLSize += sql.byteSize(); - } - - return encoding; - } + extends SQLEncoder { + + /** + * Temporary variable - used for storing the encoded size + */ + private long encodedSize; + + /** + * Temporary variable - used for 
storing the encoded sql size + */ + private long encodedSQLSize; + + /** + * (Constructor) Creates a new TimedSQLEncoder object. + * + * @param logger Reference to the logger + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public TimedSQLEncoder(final Logger logger) + throws ConfigurationException, LoggingException { + super(logger); + } + + /*--------------------------------------------------------------------------*/ + + /** + * Initializes the encoding information. + */ + public void init() { + this.encodedSize = 0; + this.encodedSQLSize = 0; + } + + /** + * Returns the encoded size. + * + * @return encoded size + */ + public long getEncodedSize() { + return encodedSize; + } + + /** + * Returns the encoded sql size. + * + * @return encoded sql size + */ + public long getEncodedSQLSize() { + return encodedSQLSize; + } + + /*--------------------------------------------------------------------------*/ + + /** + * Encodes the diff. 
+ * + * @param task Reference to the DiffTask + * @param diff Diff to encode + * @return Base 64 encoded Diff + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws DecodingException if the decoding failed + * @throws EncodingException if the encoding failed + * @throws SQLConsumerException if an error occurred while encoding the diff + */ + protected String encodeDiff(final Task task, final Diff diff) + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException { + + String encoding = super.encodeDiff(task, diff); + + this.encodedSize += encoding.length(); + + return encoding; + } + + /* + * (non-Javadoc) + * + * @see + * de.tud.ukp.kulessa.delta.consumers.sql.codec.SQLEncodrInterface#binaryTask + * (de.tudarmstadt.ukp.kulessa.delta.data.tasks.Task) + */ + protected byte[] binaryDiff(final Task task, final Diff diff) + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException { + + byte[] encoding = super.binaryDiff(task, diff); + + this.encodedSize += encoding.length; + + return encoding; + } + + /* + * (non-Javadoc) + * + * @see + * de.tudarmstadt.ukp.kulessa.delta.consumers.sql.codec.SQLEncodrInterface + * #encodeTask(de.tudarmstadt.ukp.kulessa.delta.data.tasks.Task) + */ + public SQLEncoding[] encodeTask(final Task task) + throws ConfigurationException, UnsupportedEncodingException, + DecodingException, EncodingException, SQLConsumerException { + + SQLEncoding[] encoding = super.encodeTask(task); + + for (SQLEncoding sql : encoding) { + this.encodedSQLSize += sql.byteSize(); + } + + return encoding; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileArchiveWriter.java 
b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileArchiveWriter.java index c9704722..5f109d2f 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileArchiveWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileArchiveWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -39,203 +39,185 @@ /** * This class writes the output to an archive. 
- * - * - * */ public class DataFileArchiveWriter - implements WriterInterface -{ - - /** File counter */ - private int counter; - - /** Configuration parameter - maximum size of an output archive */ - private final long LIMIT_SQL_ARCHIVE_SIZE; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private final boolean MODE_STATISTICAL_OUTPUT; - - /** Reference to the output stream */ - private OutputStream output; - - /** - * Name of the related sql consumer - used as prefix for the output - * filenames - */ - private String outputName; - - /** Configuration parameter - output path */ - private final String PATH_OUTPUT_SQL_FILES; - - /** Reference to the output archive */ - private File dataArchive; - - /** Reference to the SQLEncoder */ - protected DataFileEncoder dataFileEncoder; - - /** - * (Constructor) Creates a new SQLArchiveWriter object. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - private DataFileArchiveWriter() - throws ConfigurationException - { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_SQL_ARCHIVE_SIZE = (Long) config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); - - PATH_OUTPUT_SQL_FILES = (String) config - .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - - MODE_STATISTICAL_OUTPUT = (Boolean) config - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - - // Create sql file - counter = 0; - } - - - /** - * (Constructor) Creates a new SQLArchiveWriter object. 
- * - * @param outputName - * Name of the sql consumer - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public DataFileArchiveWriter(final String outputName) - throws IOException, ConfigurationException, LoggingException - { - - this(); - - this.outputName = outputName; - - init(); - writeHeader(); - } - - /** - * This method will close the connection to the output. - * - * @throws IOException - * if problems occurred while closing the file or process. - * - */ - @Override - public void close() - throws IOException - { - this.output.close(); - this.output = null; - } - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - protected void init() - throws ConfigurationException, LoggingException - { - - this.dataFileEncoder = new DataFileEncoder(); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. 
- * - * @param task - * DiffTask - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws IOException - * if problems occurred while writing the output (to file or - * archive) - * - * @throws SQLConsumerException - * if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException - { - - // this.startTime = System.currentTimeMillis(); - try { - List data = dataFileEncoder.encodeTask(task); - - for (String d : data) { - this.output.write((d + ";").getBytes()); - this.output.flush(); - } - - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { - - if (this.dataArchive.length() > LIMIT_SQL_ARCHIVE_SIZE) { - writeHeader(); - } - - if (!MODE_STATISTICAL_OUTPUT) { - System.out.println(task); - } - - } - else { - System.out.println(task); - } - - } - catch (EncodingException | DecodingException e) { - throw ErrorFactory.createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); - } + implements WriterInterface { + + /** + * File counter + */ + private int counter; + + /** + * Configuration parameter - maximum size of an output archive + */ + private final long LIMIT_SQL_ARCHIVE_SIZE; + + /** + * Configuration parameter - Flag, that indicates whether the statistical + * output is enabled or not + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Reference to the output stream + */ + private OutputStream output; + + /** + * Name of the related sql consumer - used as prefix for the output + * filenames + */ + private String outputName; + + /** + * Configuration parameter - output path + */ + private final String PATH_OUTPUT_SQL_FILES; + + /** + * Reference to the output archive + */ + private File dataArchive; + + /** + * Reference to the SQLEncoder + */ + protected DataFileEncoder 
dataFileEncoder; + + /** + * (Constructor) Creates a new SQLArchiveWriter object. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + private DataFileArchiveWriter() + throws ConfigurationException { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + LIMIT_SQL_ARCHIVE_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); + + PATH_OUTPUT_SQL_FILES = (String) config + .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + + // Create sql file + counter = 0; } - /** - * Creates a new output file and writes the header information. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws IOException - * if an error occurred while writing a file - */ - protected void writeHeader() - throws ConfigurationException, IOException - { - if (this.output != null) { - close(); - } + /** + * (Constructor) Creates a new SQLArchiveWriter object. + * + * @param outputName Name of the sql consumer + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public DataFileArchiveWriter(final String outputName) + throws IOException, ConfigurationException, LoggingException { - this.counter++; + this(); - String filePath = PATH_OUTPUT_SQL_FILES + this.outputName +"_"+counter; + this.outputName = outputName; - this.output = OutputFactory.getOutputStream(filePath); + init(); + writeHeader(); + } + + /** + * This method will close the connection to the output. + * + * @throws IOException if problems occurred while closing the file or process. 
+ */ + @Override + public void close() + throws IOException { + this.output.close(); + this.output = null; + } - this.dataArchive = new File(filePath); + /** + * Creates the sql encoder. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + protected void init() + throws ConfigurationException, LoggingException { + + this.dataFileEncoder = new DataFileEncoder(); + } - this.output.flush(); - } + /** + * This method will process the given DiffTask and send it to the specified + * output. + * + * @param task DiffTask + * @throws ConfigurationException if problems occurred while initializing the components + * @throws IOException if problems occurred while writing the output (to file or + * archive) + * @throws SQLConsumerException if problems occurred while writing the output (to the sql + * producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException { + + // this.startTime = System.currentTimeMillis(); + try { + List data = dataFileEncoder.encodeTask(task); + + for (String d : data) { + this.output.write((d + ";").getBytes()); + this.output.flush(); + } + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.dataArchive.length() > LIMIT_SQL_ARCHIVE_SIZE) { + writeHeader(); + } + + if (!MODE_STATISTICAL_OUTPUT) { + System.out.println(task); + } + + } else { + System.out.println(task); + } + + } catch (EncodingException | DecodingException e) { + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + } + } + + /** + * Creates a new output file and writes the header information. 
+ * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws IOException if an error occurred while writing a file + */ + protected void writeHeader() + throws ConfigurationException, IOException { + + if (this.output != null) { + close(); + } + + this.counter++; + + String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" + counter; + + this.output = OutputFactory.getOutputStream(filePath); + + this.dataArchive = new File(filePath); + + this.output.flush(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileWriter.java index 46e67484..4cf5af80 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -45,205 +45,190 @@ * This class writes the output to a data file (not an sql file) */ public class DataFileWriter - implements WriterInterface -{ - - /** File counter */ - private int fileCounter; - - /** Configuration parameter - maximum size of an output file */ - private final long LIMIT_SQL_FILE_SIZE; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private final boolean MODE_STATISTICAL_OUTPUT; - - /** - * Name of the related sql consumer - used as prefix for the output - * filenames - */ - private String outputName; - - /** Configuration parameter - output path */ - private final String PATH_OUTPUT_DATA_FILES; - - /** Reference to the DataFileEncoder */ - protected DataFileEncoder dataFileEncoder; - - /** Reference to the output file */ - private File dataFile; - - /** Reference to the file writer */ - private Writer writer; - - private final String WIKIPEDIA_ENCODING; - - /** - * (Constructor) Creates a new SQLFileWriter object. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - private DataFileWriter() - throws ConfigurationException - { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_SQL_FILE_SIZE = (Long) config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); - - PATH_OUTPUT_DATA_FILES = (String) config - .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - - MODE_STATISTICAL_OUTPUT = (Boolean) config - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - // Create sql file - fileCounter = 0; - } - - - /** - * (Constructor) Creates a new SQLFileWriter object. 
- * - * @param outputName - * Name of the sql consumer - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public DataFileWriter(final String outputName) - throws IOException, ConfigurationException, LoggingException - { - - this(); - - this.outputName = outputName; - - init(); - writeHeader(); - } - - /** - * This method will close the connection to the output. - * - * @throws IOException - * if problems occurred while closing the file or process. - * - */ - @Override - public void close() - throws IOException - { - this.writer.close(); - } - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - protected void init() - throws ConfigurationException, LoggingException - { - - this.dataFileEncoder = new DataFileEncoder(); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. 
- * - * @param task - * DiffTask - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws IOException - * if problems occurred while writing the output (to file or - * archive) - * - * @throws SQLConsumerException - * if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException - { - - try { - List data = dataFileEncoder.encodeTask(task); - - for (String d : data) { - this.writer.write(d + ";"); - this.writer.flush(); - } - - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { - - if (this.dataFile.length() > LIMIT_SQL_FILE_SIZE) { - writeHeader(); - } - - if (!MODE_STATISTICAL_OUTPUT) { - System.out.println(task); - } - - } - else { - System.out.println(task); - } - - } - catch (DecodingException | EncodingException e) { - - throw ErrorFactory.createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); - - } + implements WriterInterface { + + /** + * File counter + */ + private int fileCounter; + + /** + * Configuration parameter - maximum size of an output file + */ + private final long LIMIT_SQL_FILE_SIZE; + + /** + * Configuration parameter - Flag, that indicates whether the statistical + * output is enabled or not + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Name of the related sql consumer - used as prefix for the output + * filenames + */ + private String outputName; + + /** + * Configuration parameter - output path + */ + private final String PATH_OUTPUT_DATA_FILES; + + /** + * Reference to the DataFileEncoder + */ + protected DataFileEncoder dataFileEncoder; + + /** + * Reference to the output file + */ + private File dataFile; + + /** + * Reference to the file writer + */ + private Writer writer; + + private final String WIKIPEDIA_ENCODING; + + /** + * 
(Constructor) Creates a new SQLFileWriter object. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + private DataFileWriter() + throws ConfigurationException { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + LIMIT_SQL_FILE_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); + + PATH_OUTPUT_DATA_FILES = (String) config + .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + // Create sql file + fileCounter = 0; } - /** - * Creates a new output file and writes the header information. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws IOException - * if an error occurred while writing a file - */ - protected void writeHeader() - throws ConfigurationException, IOException - { - if (writer != null) { - writer.close(); - } + /** + * (Constructor) Creates a new SQLFileWriter object. + * + * @param outputName Name of the sql consumer + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public DataFileWriter(final String outputName) + throws IOException, ConfigurationException, LoggingException { - this.fileCounter++; - String filePath = PATH_OUTPUT_DATA_FILES + this.outputName + "_" - + fileCounter+".csv"; + this(); - this.dataFile = new File(filePath); + this.outputName = outputName; - this.writer = new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream( - new FileOutputStream(filePath)), WIKIPEDIA_ENCODING)); + init(); + writeHeader(); + } + + /** + * This method will close the connection to the output. 
+ * + * @throws IOException if problems occurred while closing the file or process. + */ + @Override + public void close() + throws IOException { + this.writer.close(); + } + + /** + * Creates the sql encoder. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + protected void init() + throws ConfigurationException, LoggingException { + + this.dataFileEncoder = new DataFileEncoder(); + } + + /** + * This method will process the given DiffTask and send it to the specified + * output. + * + * @param task DiffTask + * @throws ConfigurationException if problems occurred while initializing the components + * @throws IOException if problems occurred while writing the output (to file or + * archive) + * @throws SQLConsumerException if problems occurred while writing the output (to the sql + * producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException { + + try { + List data = dataFileEncoder.encodeTask(task); + + for (String d : data) { + this.writer.write(d + ";"); + this.writer.flush(); + } + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.dataFile.length() > LIMIT_SQL_FILE_SIZE) { + writeHeader(); + } + + if (!MODE_STATISTICAL_OUTPUT) { + System.out.println(task); + } + + } else { + System.out.println(task); + } + + } catch (DecodingException | EncodingException e) { + + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + + } + } + + /** + * Creates a new output file and writes the header information. 
+ * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws IOException if an error occurred while writing a file + */ + protected void writeHeader() + throws ConfigurationException, IOException { + + if (writer != null) { + writer.close(); + } + + this.fileCounter++; + String filePath = PATH_OUTPUT_DATA_FILES + this.outputName + "_" + + fileCounter + ".csv"; + this.dataFile = new File(filePath); - this.writer.flush(); - } + this.writer = new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream( + new FileOutputStream(filePath)), WIKIPEDIA_ENCODING)); + + + this.writer.flush(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/OutputFactory.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/OutputFactory.java index 6856977d..fceb0438 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/OutputFactory.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/OutputFactory.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -28,80 +28,73 @@ import org.dkpro.jwpl.revisionmachine.difftool.config.ConfigurationManager; import org.dkpro.jwpl.revisionmachine.difftool.data.OutputType; -public class OutputFactory -{ +public class OutputFactory { - private static String PATH_PROGRAM_7ZIP = null; - private static OutputType MODE_OUTPUT = null; - private static ConfigurationManager config = null; + private static String PATH_PROGRAM_7ZIP = null; + private static OutputType MODE_OUTPUT = null; + private static ConfigurationManager config = null; - static { - try { - config = ConfigurationManager.getInstance(); - MODE_OUTPUT = (OutputType) config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); - } - catch (ConfigurationException e) { - e.printStackTrace(); - System.exit(-1); - } - } + static { + try { + config = ConfigurationManager.getInstance(); + MODE_OUTPUT = (OutputType) config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); + } catch (ConfigurationException e) { + e.printStackTrace(); + System.exit(-1); + } + } - private static OutputStream compressWith7Zip(final String archivePath) - throws ConfigurationException - { + private static OutputStream compressWith7Zip(final String archivePath) + throws ConfigurationException { - PATH_PROGRAM_7ZIP = (String) config.getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); + PATH_PROGRAM_7ZIP = (String) config.getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); - if (PATH_PROGRAM_7ZIP == null) { - throw ErrorFactory - .createConfigurationException(ErrorKeys.CONFIGURATION_PARAMETER_UNDEFINED); - } + if (PATH_PROGRAM_7ZIP == null) { + throw ErrorFactory + .createConfigurationException(ErrorKeys.CONFIGURATION_PARAMETER_UNDEFINED); + } - try { - Runtime runtime = Runtime.getRuntime(); - Process p = runtime.exec(PATH_PROGRAM_7ZIP + " a -t7z -si " + archivePath); - return p.getOutputStream(); + try { + Runtime runtime = Runtime.getRuntime(); + Process p = runtime.exec(PATH_PROGRAM_7ZIP + " a -t7z -si " + archivePath); + return 
p.getOutputStream(); - } - catch (Exception e) { - throw new RuntimeException(e); - } - } + } catch (Exception e) { + throw new RuntimeException(e); + } + } - private static OutputStream compressWithBZip2(final String archivePath) - throws ConfigurationException - { + private static OutputStream compressWithBZip2(final String archivePath) + throws ConfigurationException { - OutputStream output = null; - try { - output = new Bzip2Archiver().getCompressionStream(archivePath); - } - catch (IOException e) { - e.printStackTrace(); - } - return output; - } + OutputStream output = null; + try { + output = new Bzip2Archiver().getCompressionStream(archivePath); + } catch (IOException e) { + e.printStackTrace(); + } + return output; + } - public static OutputStream getOutputStream(final String archivePath) - throws ConfigurationException - { + public static OutputStream getOutputStream(final String archivePath) + throws ConfigurationException { - switch (MODE_OUTPUT) { - case SEVENZIP: - if((Boolean)config.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT)){ - return compressWith7Zip(archivePath+ ".csv.7z"); - }else{ - return compressWith7Zip(archivePath+ ".sql.7z"); - } - case BZIP2: - if((Boolean)config.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT)){ - return compressWithBZip2(archivePath+ ".csv.bz2"); - }else{ - return compressWithBZip2(archivePath+ ".sql.bz2"); - } - default: - throw ErrorFactory - .createConfigurationException(ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); - } - } + switch (MODE_OUTPUT) { + case SEVENZIP: + if ((Boolean) config.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT)) { + return compressWith7Zip(archivePath + ".csv.7z"); + } else { + return compressWith7Zip(archivePath + ".sql.7z"); + } + case BZIP2: + if ((Boolean) config.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT)) { + return compressWithBZip2(archivePath + ".csv.bz2"); + } else { + return 
compressWithBZip2(archivePath + ".sql.bz2"); + } + default: + throw ErrorFactory + .createConfigurationException(ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLArchiveWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLArchiveWriter.java index 8a3cd675..29f175cb 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLArchiveWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLArchiveWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -42,229 +42,214 @@ /** * This class writes the output to an archive. 
- * - * - * */ public class SQLArchiveWriter - implements WriterInterface -{ - - /** File counter */ - private int counter; - - /** Configuration parameter - maximum size of an output archive */ - private final long LIMIT_SQL_ARCHIVE_SIZE; - - /** Reference to the logger */ - protected Logger logger; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private final boolean MODE_STATISTICAL_OUTPUT; - - /** Reference to the output stream */ - private OutputStream output; - - /** - * Name of the related sql consumer - used as prefix for the output - * filenames - */ - private String outputName; - - /** Configuration parameter - output path */ - private final String PATH_OUTPUT_SQL_FILES; - - /** Reference to the output archive */ - private File sqlArchive; - - /** Reference to the SQLEncoder */ - protected SQLEncoderInterface sqlEncoder; - - /** Configuration parameter - Charset name of the input data */ - private final String WIKIPEDIA_ENCODING; - - /** - * (Constructor) Creates a new SQLArchiveWriter object. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - private SQLArchiveWriter() - throws ConfigurationException - { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_SQL_ARCHIVE_SIZE = (Long) config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); - - PATH_OUTPUT_SQL_FILES = (String) config - .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - - MODE_STATISTICAL_OUTPUT = (Boolean) config - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - // Create sql file - counter = 0; - } - - - /** - * (Constructor) Creates a new SQLArchiveWriter object. 
- * - * @param outputName - * Name of the sql consumer - * @param logger - * Reference to a logger - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public SQLArchiveWriter(final String outputName, final Logger logger) - throws IOException, ConfigurationException, LoggingException - { - - this(); - - this.outputName = outputName; - this.logger = logger; - - init(); - writeHeader(); - } - - /** - * This method will close the connection to the output. - * - * @throws IOException - * if problems occurred while closing the file or process. - * - */ - @Override - public void close() - throws IOException - { - this.output.close(); - this.output = null; - } - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - protected void init() - throws ConfigurationException, LoggingException - { - - this.sqlEncoder = new SQLEncoder(logger); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. 
- * - * @param task - * DiffTask - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws IOException - * if problems occurred while writing the output (to file or - * archive) - * - * @throws SQLConsumerException - * if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException - { - - // this.startTime = System.currentTimeMillis(); - try { - SQLEncoding[] encoding = this.sqlEncoder.encodeTask(task); - - String s; - for (SQLEncoding sql : encoding) { - s = sql.getQuery() + "\r\n"; - this.output.write(s.getBytes(WIKIPEDIA_ENCODING)); - this.output.flush(); - } - - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { - - if (this.sqlArchive.length() > LIMIT_SQL_ARCHIVE_SIZE) { - writeHeader(); - } - - if (!MODE_STATISTICAL_OUTPUT) { - System.out.println(task); - } - - } - else { - System.out.println(task); - } - - } - catch (DecodingException | EncodingException e) { - - throw ErrorFactory.createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); - - } + implements WriterInterface { + + /** + * File counter + */ + private int counter; + + /** + * Configuration parameter - maximum size of an output archive + */ + private final long LIMIT_SQL_ARCHIVE_SIZE; + + /** + * Reference to the logger + */ + protected Logger logger; + + /** + * Configuration parameter - Flag, that indicates whether the statistical + * output is enabled or not + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Reference to the output stream + */ + private OutputStream output; + + /** + * Name of the related sql consumer - used as prefix for the output + * filenames + */ + private String outputName; + + /** + * Configuration parameter - output path + */ + private final String PATH_OUTPUT_SQL_FILES; + + /** + 
* Reference to the output archive + */ + private File sqlArchive; + + /** + * Reference to the SQLEncoder + */ + protected SQLEncoderInterface sqlEncoder; + + /** + * Configuration parameter - Charset name of the input data + */ + private final String WIKIPEDIA_ENCODING; + + /** + * (Constructor) Creates a new SQLArchiveWriter object. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + private SQLArchiveWriter() + throws ConfigurationException { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + LIMIT_SQL_ARCHIVE_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); + + PATH_OUTPUT_SQL_FILES = (String) config + .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + // Create sql file + counter = 0; } - /** - * Creates a new output file and writes the header information. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws IOException - * if an error occurred while writing a file - */ - protected void writeHeader() - throws ConfigurationException, IOException - { - if (this.output != null) { - close(); - } + /** + * (Constructor) Creates a new SQLArchiveWriter object. 
+ * + * @param outputName Name of the sql consumer + * @param logger Reference to a logger + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public SQLArchiveWriter(final String outputName, final Logger logger) + throws IOException, ConfigurationException, LoggingException { - this.counter++; + this(); - String filePath = PATH_OUTPUT_SQL_FILES + this.outputName +"_"+counter; + this.outputName = outputName; + this.logger = logger; - this.output = OutputFactory.getOutputStream(filePath); + init(); + writeHeader(); + } - // System.out.println(filePath); - SQLConsumerLogMessages.logFileCreation(logger, filePath); + /** + * This method will close the connection to the output. + * + * @throws IOException if problems occurred while closing the file or process. + */ + @Override + public void close() + throws IOException { + this.output.close(); + this.output = null; + } - this.sqlArchive = new File(filePath); + /** + * Creates the sql encoder. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + protected void init() + throws ConfigurationException, LoggingException { - String[] revTable = this.sqlEncoder.getTable(); - for (String sTable : revTable) { - String curLine = sTable + "\r\n"; - byte[] bytes = curLine.getBytes(WIKIPEDIA_ENCODING); - this.output.write(bytes); - } + this.sqlEncoder = new SQLEncoder(logger); + } + + /** + * This method will process the given DiffTask and send it to the specified + * output. 
+ * + * @param task DiffTask + * @throws ConfigurationException if problems occurred while initializing the components + * @throws IOException if problems occurred while writing the output (to file or + * archive) + * @throws SQLConsumerException if problems occurred while writing the output (to the sql + * producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException { + + // this.startTime = System.currentTimeMillis(); + try { + SQLEncoding[] encoding = this.sqlEncoder.encodeTask(task); + + String s; + for (SQLEncoding sql : encoding) { + s = sql.getQuery() + "\r\n"; + this.output.write(s.getBytes(WIKIPEDIA_ENCODING)); + this.output.flush(); + } + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.sqlArchive.length() > LIMIT_SQL_ARCHIVE_SIZE) { + writeHeader(); + } + + if (!MODE_STATISTICAL_OUTPUT) { + System.out.println(task); + } + + } else { + System.out.println(task); + } + + } catch (DecodingException | EncodingException e) { + + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + + } + } - this.output.flush(); - } + /** + * Creates a new output file and writes the header information. 
+ * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws IOException if an error occurred while writing a file + */ + protected void writeHeader() + throws ConfigurationException, IOException { + + if (this.output != null) { + close(); + } + + this.counter++; + + String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" + counter; + + this.output = OutputFactory.getOutputStream(filePath); + + // System.out.println(filePath); + SQLConsumerLogMessages.logFileCreation(logger, filePath); + + this.sqlArchive = new File(filePath); + + String[] revTable = this.sqlEncoder.getTable(); + for (String sTable : revTable) { + String curLine = sTable + "\r\n"; + byte[] bytes = curLine.getBytes(WIKIPEDIA_ENCODING); + this.output.write(bytes); + } + + this.output.flush(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLDatabaseWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLDatabaseWriter.java index 901b1713..57e1f515 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLDatabaseWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLDatabaseWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -42,187 +42,164 @@ /** * This class writes the output to a database. - * - * - * */ public class SQLDatabaseWriter - implements WriterInterface -{ - - /** Reference to the database connection */ - private Connection connection; - - /** Reference to the logger */ - protected final Logger logger; - - /** Reference to the sql encoder */ - protected SQLEncoderInterface sqlEncoder; - - /** - * (Constructor) Creates a new SQLDatabaseWriter object. - * - * @param logger - * Reference to the logger - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public SQLDatabaseWriter(final Logger logger) - throws ConfigurationException, LoggingException - { - - this.logger = logger; - - ConfigurationManager config = ConfigurationManager.getInstance(); - - String host = (String) config - .getConfigParameter(ConfigurationKeys.SQL_HOST); - String user = (String) config - .getConfigParameter(ConfigurationKeys.SQL_USERNAME); - String password = (String) config - .getConfigParameter(ConfigurationKeys.SQL_PASSWORD); - String sTable = (String) config - .getConfigParameter(ConfigurationKeys.SQL_DATABASE); - - try { - String driverDB = "com.mysql.jdbc.Driver"; - Class.forName(driverDB); - - this.connection = DriverManager.getConnection("jdbc:mysql://" - + host + "/" + sTable, user, password); - - init(); - writeHeader(); - - } - catch (ClassNotFoundException | SQLException e) { - throw new ConfigurationException(e); - } + implements WriterInterface { + + /** + * Reference to the database connection + */ + private Connection connection; + + /** + * Reference to the logger + */ + protected 
final Logger logger; + + /** + * Reference to the sql encoder + */ + protected SQLEncoderInterface sqlEncoder; + + /** + * (Constructor) Creates a new SQLDatabaseWriter object. + * + * @param logger Reference to the logger + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public SQLDatabaseWriter(final Logger logger) + throws ConfigurationException, LoggingException { + + this.logger = logger; + + ConfigurationManager config = ConfigurationManager.getInstance(); + + String host = (String) config + .getConfigParameter(ConfigurationKeys.SQL_HOST); + String user = (String) config + .getConfigParameter(ConfigurationKeys.SQL_USERNAME); + String password = (String) config + .getConfigParameter(ConfigurationKeys.SQL_PASSWORD); + String sTable = (String) config + .getConfigParameter(ConfigurationKeys.SQL_DATABASE); + + try { + String driverDB = "com.mysql.jdbc.Driver"; + Class.forName(driverDB); + + this.connection = DriverManager.getConnection("jdbc:mysql://" + + host + "/" + sTable, user, password); + + init(); + writeHeader(); + + } catch (ClassNotFoundException | SQLException e) { + throw new ConfigurationException(e); + } + } + + /** + * This method will close the connection to the output. + * + * @throws SQLException if problems occurred while closing the connection to the + * database. + */ + @Override + public void close() + throws SQLException { + this.connection.close(); + this.connection = null; + } + + /** + * Creates the sql encoder. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + protected void init() + throws ConfigurationException, LoggingException { + + this.sqlEncoder = new SQLEncoder(logger); + } + + /** + * This method will process the given DiffTask and send it to the specified + * output. 
+ * + * @param task DiffTask + * @throws ConfigurationException if problems occurred while initializing the components + * @throws IOException if problems occurred while writing the output (to file or + * archive) + * @throws SQLConsumerException if problems occurred while writing the output (to the sql + * producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException { + + int i = -1; + SQLEncoding[] queries = null; + + try { + queries = sqlEncoder.encodeTask(task); + + Statement query; + int size = queries.length; + for (i = 0; i < size; i++) { + + query = connection.createStatement(); + query.executeUpdate(queries[i].getQuery()); + query.close(); + } + // System.out.println(task.toString()); + + } catch (SQLException e) { + + String q; + if (queries == null || queries.length <= i || queries[i] == null) { + q = ""; + } else { + q = queries[i].toString(); + } + + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, q, + e); + + } catch (DecodingException e) { + + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, e); + + } catch (EncodingException e) { + + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + } } - /** - * This method will close the connection to the output. - * - * @throws SQLException - * if problems occurred while closing the connection to the - * database. - */ - @Override - public void close() - throws SQLException - { - this.connection.close(); - this.connection = null; - } - - /** - * Creates the sql encoder. 
- * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - protected void init() - throws ConfigurationException, LoggingException - { - - this.sqlEncoder = new SQLEncoder(logger); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. - * - * @param task - * DiffTask - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws IOException - * if problems occurred while writing the output (to file or - * archive) - * - * @throws SQLConsumerException - * if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException - { - - int i = -1; - SQLEncoding[] queries = null; - - try { - queries = sqlEncoder.encodeTask(task); - - Statement query; - int size = queries.length; - for (i = 0; i < size; i++) { - - query = connection.createStatement(); - query.executeUpdate(queries[i].getQuery()); - query.close(); - } - // System.out.println(task.toString()); - - } - catch (SQLException e) { - - String q; - if (queries == null || queries.length <= i || queries[i] == null) { - q = ""; - } - else { - q = queries[i].toString(); - } - - throw ErrorFactory.createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, q, - e); - - } - catch (DecodingException e) { - - throw ErrorFactory.createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, e); - - } - catch (EncodingException e) { - - throw ErrorFactory.createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); - } - } - - /** - * Retrieves the encoded sql orders and executes them. 
- * - * @throws SQLException - * if an error occurred while accessing the database - */ - private void writeHeader() - throws SQLException - { - - Statement query; - String[] revTableHeaderQueries; - - revTableHeaderQueries = sqlEncoder.getTable(); - - //commit revision table header - for (String revTableHeaderQuery : revTableHeaderQueries) { - query = connection.createStatement(); - - query.executeUpdate(revTableHeaderQuery); - query.close(); - } - - } + /** + * Retrieves the encoded sql orders and executes them. + * + * @throws SQLException if an error occurred while accessing the database + */ + private void writeHeader() + throws SQLException { + + Statement query; + String[] revTableHeaderQueries; + + revTableHeaderQueries = sqlEncoder.getTable(); + + //commit revision table header + for (String revTableHeaderQuery : revTableHeaderQueries) { + query = connection.createStatement(); + + query.executeUpdate(revTableHeaderQuery); + query.close(); + } + + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLFileWriter.java index 34dab2e4..10f64306 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLFileWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -48,219 +48,205 @@ * This class writes the output to a file. */ public class SQLFileWriter - implements WriterInterface -{ - - /** File counter */ - private int fileCounter; - - /** Configuration parameter - maximum size of an output file */ - private final long LIMIT_SQL_FILE_SIZE; - - /** Reference to the logger */ - protected Logger logger; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private final boolean MODE_STATISTICAL_OUTPUT; - - /** - * Name of the related sql consumer - used as prefix for the output - * filenames - */ - private String outputName; - - /** Configuration parameter - output path */ - private final String PATH_OUTPUT_SQL_FILES; - - /** Reference to the SQLEncoder */ - protected SQLEncoderInterface sqlEncoder; - - /** Reference to the output file */ - private File sqlFile; - - /** Reference to the file writer */ - private Writer writer; - - private final String WIKIPEDIA_ENCODING; - - /** - * (Constructor) Creates a new SQLFileWriter object. 
- * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - */ - private SQLFileWriter() - throws ConfigurationException - { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_SQL_FILE_SIZE = (Long) config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); - - PATH_OUTPUT_SQL_FILES = (String) config - .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - - MODE_STATISTICAL_OUTPUT = (Boolean) config - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - // Create sql file - fileCounter = 0; - } - - - /** - * (Constructor) Creates a new SQLFileWriter object. - * - * @param outputName - * Name of the sql consumer - * @param logger - * Reference to a logger - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public SQLFileWriter(final String outputName, final Logger logger) - throws IOException, ConfigurationException, LoggingException - { - - this(); - - this.outputName = outputName; - this.logger = logger; - - init(); - writeHeader(); - } - - /** - * This method will close the connection to the output. - * - * @throws IOException - * if problems occurred while closing the file or process. - * - */ - @Override - public void close() - throws IOException - { - this.writer.close(); - } - - /** - * Creates the sql encoder. 
- * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - protected void init() - throws ConfigurationException, LoggingException - { - - this.sqlEncoder = new SQLEncoder(logger); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. - * - * @param task - * DiffTask - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws IOException - * if problems occurred while writing the output (to file or - * archive) - * - * @throws SQLConsumerException - * if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException - { - - try { - SQLEncoding[] encoding = sqlEncoder.encodeTask(task); - - for (SQLEncoding sql : encoding) { - this.writer.write(sql.getQuery() + "\r\n"); - this.writer.flush(); - } - - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { - - if (this.sqlFile.length() > LIMIT_SQL_FILE_SIZE) { - writeHeader(); - } - - if (!MODE_STATISTICAL_OUTPUT) { - System.out.println(task); - } - - } - else { - System.out.println(task); - } - - } - catch (DecodingException | EncodingException e) { - - throw ErrorFactory.createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); - - } + implements WriterInterface { + + /** + * File counter + */ + private int fileCounter; + + /** + * Configuration parameter - maximum size of an output file + */ + private final long LIMIT_SQL_FILE_SIZE; + + /** + * Reference to the logger + */ + protected Logger logger; + + /** + * Configuration parameter - Flag, that indicates whether the statistical + * output is enabled or not + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Name 
of the related sql consumer - used as prefix for the output + * filenames + */ + private String outputName; + + /** + * Configuration parameter - output path + */ + private final String PATH_OUTPUT_SQL_FILES; + + /** + * Reference to the SQLEncoder + */ + protected SQLEncoderInterface sqlEncoder; + + /** + * Reference to the output file + */ + private File sqlFile; + + /** + * Reference to the file writer + */ + private Writer writer; + + private final String WIKIPEDIA_ENCODING; + + /** + * (Constructor) Creates a new SQLFileWriter object. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + */ + private SQLFileWriter() + throws ConfigurationException { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + LIMIT_SQL_FILE_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); + + PATH_OUTPUT_SQL_FILES = (String) config + .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + // Create sql file + fileCounter = 0; } - /** - * Creates a new output file and writes the header information. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws IOException - * if an error occurred while writing a file - */ - protected void writeHeader() - throws ConfigurationException, IOException - { - if (writer != null) { - writer.close(); - } + /** + * (Constructor) Creates a new SQLFileWriter object. 
+ * + * @param outputName Name of the sql consumer + * @param logger Reference to a logger + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public SQLFileWriter(final String outputName, final Logger logger) + throws IOException, ConfigurationException, LoggingException { - this.fileCounter++; - String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" - + fileCounter + ".sql"; + this(); - SQLConsumerLogMessages.logFileCreation(logger, filePath); + this.outputName = outputName; + this.logger = logger; - this.sqlFile = new File(filePath); + init(); + writeHeader(); + } + + /** + * This method will close the connection to the output. + * + * @throws IOException if problems occurred while closing the file or process. + */ + @Override + public void close() + throws IOException { + this.writer.close(); + } + + /** + * Creates the sql encoder. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + protected void init() + throws ConfigurationException, LoggingException { + + this.sqlEncoder = new SQLEncoder(logger); + } + + /** + * This method will process the given DiffTask and send it to the specified + * output. 
+ * + * @param task DiffTask + * @throws ConfigurationException if problems occurred while initializing the components + * @throws IOException if problems occurred while writing the output (to file or + * archive) + * @throws SQLConsumerException if problems occurred while writing the output (to the sql + * producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException { + + try { + SQLEncoding[] encoding = sqlEncoder.encodeTask(task); + + for (SQLEncoding sql : encoding) { + this.writer.write(sql.getQuery() + "\r\n"); + this.writer.flush(); + } + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.sqlFile.length() > LIMIT_SQL_FILE_SIZE) { + writeHeader(); + } + + if (!MODE_STATISTICAL_OUTPUT) { + System.out.println(task); + } + + } else { + System.out.println(task); + } + + } catch (DecodingException | EncodingException e) { + + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + + } + } + + /** + * Creates a new output file and writes the header information. 
+ * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws IOException if an error occurred while writing a file + */ + protected void writeHeader() + throws ConfigurationException, IOException { + + if (writer != null) { + writer.close(); + } + + this.fileCounter++; + String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" + + fileCounter + ".sql"; - this.writer = new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream( - new FileOutputStream(filePath)), WIKIPEDIA_ENCODING)); + SQLConsumerLogMessages.logFileCreation(logger, filePath); + + this.sqlFile = new File(filePath); + + this.writer = new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream( + new FileOutputStream(filePath)), WIKIPEDIA_ENCODING)); String[] revTable = this.sqlEncoder.getTable(); - for (String sTable : revTable) { - this.writer.write(sTable + "\r\n"); - } + for (String sTable : revTable) { + this.writer.write(sTable + "\r\n"); + } - this.writer.flush(); - } + this.writer.flush(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLArchiveWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLArchiveWriter.java index 595a1a29..4c966184 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLArchiveWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLArchiveWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,134 +34,115 @@ /** * This class writes the output to an archive while collecting statistical * information. - * */ public class TimedSQLArchiveWriter - extends SQLArchiveWriter -{ - - /** Reference to the logger */ - private final Logger outputLogger; - - /** - * Temporary variable - used for storing the time needed to encode a task - */ - private long processingTimeSQL; - - /** Reference to the sql encoder */ - private TimedSQLEncoder sqlEncoder; - - - /** - * (Constructor) Creates a new TimedSQLFileWriter object. - * - * @param outputName - * Name of the sql consumer - * @param logger - * Reference to a logger - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public TimedSQLArchiveWriter(final String outputName, final Logger logger) - throws IOException, ConfigurationException, LoggingException - { - - super(outputName, logger); - this.outputLogger = LoggingFactory - .getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Creates the sql encoder. 
- * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - @Override - protected void init() - throws ConfigurationException, LoggingException - { - - this.sqlEncoder = new TimedSQLEncoder(logger); - super.sqlEncoder = this.sqlEncoder; - } - - /*--------------------------------------------------------------------------*/ - - /** - * This method will process the given DiffTask and send him to the specified - * output. - * - * @param task - * DiffTask - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws IOException - * if problems occurred while writing the output (to file or - * archive) - * - * @throws SQLConsumerException - * if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException - { - - long startTime = System.currentTimeMillis(); - - TaskTypes type = task.getTaskType(); - - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { - - this.sqlEncoder.init(); - this.processingTimeSQL = 0; - } - - super.process(task); - - this.processingTimeSQL += System.currentTimeMillis() - startTime; - - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { - - ArticleInformation info = task.getHeader(); - info.setEncodedSize(this.sqlEncoder.getEncodedSize()); - info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); - info.setExitingTime(System.currentTimeMillis()); - info.setProcessingTimeSQL(processingTimeSQL); - - String succesReport = info.toString(); - this.outputLogger.logMessage(Level.INFO, "\r\n" + succesReport); - } - } - - /** - * This method will close the connection to the output. - * - * @throws IOException - * if problems occurred while closing the file or process. 
- * - */ - @Override - public void close() - throws IOException - { - try { - super.close(); - } - finally { - this.outputLogger.flush(); - //this.outputLogger.close(); - } - } + extends SQLArchiveWriter { + + /** + * Reference to the logger + */ + private final Logger outputLogger; + + /** + * Temporary variable - used for storing the time needed to encode a task + */ + private long processingTimeSQL; + + /** + * Reference to the sql encoder + */ + private TimedSQLEncoder sqlEncoder; + + + /** + * (Constructor) Creates a new TimedSQLFileWriter object. + * + * @param outputName Name of the sql consumer + * @param logger Reference to a logger + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public TimedSQLArchiveWriter(final String outputName, final Logger logger) + throws IOException, ConfigurationException, LoggingException { + + super(outputName, logger); + this.outputLogger = LoggingFactory + .getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); + } + + /*--------------------------------------------------------------------------*/ + + /** + * Creates the sql encoder. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + @Override + protected void init() + throws ConfigurationException, LoggingException { + + this.sqlEncoder = new TimedSQLEncoder(logger); + super.sqlEncoder = this.sqlEncoder; + } + + /*--------------------------------------------------------------------------*/ + + /** + * This method will process the given DiffTask and send him to the specified + * output. 
+ * + * @param task DiffTask + * @throws ConfigurationException if problems occurred while initializing the components + * @throws IOException if problems occurred while writing the output (to file or + * archive) + * @throws SQLConsumerException if problems occurred while writing the output (to the sql + * producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException { + + long startTime = System.currentTimeMillis(); + + TaskTypes type = task.getTaskType(); + + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { + + this.sqlEncoder.init(); + this.processingTimeSQL = 0; + } + + super.process(task); + + this.processingTimeSQL += System.currentTimeMillis() - startTime; + + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { + + ArticleInformation info = task.getHeader(); + info.setEncodedSize(this.sqlEncoder.getEncodedSize()); + info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); + info.setExitingTime(System.currentTimeMillis()); + info.setProcessingTimeSQL(processingTimeSQL); + + String succesReport = info.toString(); + this.outputLogger.logMessage(Level.INFO, "\r\n" + succesReport); + } + } + + /** + * This method will close the connection to the output. + * + * @throws IOException if problems occurred while closing the file or process. 
+ */ + @Override + public void close() + throws IOException { + try { + super.close(); + } finally { + this.outputLogger.flush(); + //this.outputLogger.close(); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLDatabaseWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLDatabaseWriter.java index 407a4233..fdb56cd8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLDatabaseWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLDatabaseWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -35,132 +35,115 @@ /** * This class writes the output to a database while collecting statistical * information. 
- * */ public class TimedSQLDatabaseWriter - extends SQLDatabaseWriter -{ - - /** Reference to the logger */ - private final Logger outputLogger; - - /** - * Temporary variable - used for storing the time needed to encode a task - */ - private long processingTimeSQL; - - /** Reference to the sql encoder */ - private TimedSQLEncoder sqlEncoder; - - - /** - * (Constructor) Creates a new TimedSQLDatabaseWriter object. - * - * @param logger - * Reference to the logger - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public TimedSQLDatabaseWriter(final Logger logger) - throws ConfigurationException, LoggingException - { - - super(logger); - this.outputLogger = LoggingFactory - .getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - @Override - protected void init() - throws ConfigurationException, LoggingException - { - - this.sqlEncoder = new TimedSQLEncoder(logger); - super.sqlEncoder = this.sqlEncoder; - } - - /*--------------------------------------------------------------------------*/ - - /** - * This method will process the given DiffTask and send him to the specified - * output. 
- * - * @param task - * DiffTask - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws IOException - * if problems occurred while writing the output (to file or - * archive) - * - * @throws SQLConsumerException - * if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException - { - - long startTime = System.currentTimeMillis(); - - TaskTypes type = task.getTaskType(); - - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { - - this.sqlEncoder.init(); - this.processingTimeSQL = 0; - } - - super.process(task); - - this.processingTimeSQL += System.currentTimeMillis() - startTime; - - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { - - ArticleInformation info = task.getHeader(); - info.setEncodedSize(this.sqlEncoder.getEncodedSize()); - info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); - info.setExitingTime(System.currentTimeMillis()); - info.setProcessingTimeSQL(processingTimeSQL); - - String succesReport = info.toString(); - // System.out.println(succesReport); - this.outputLogger.logMessage(Level.INFO, "\r\n" + succesReport); - } - } - - /** - * This method will close the connection to the output. - * - * @throws SQLException - * if problems occurred while closing the connection to the - * database. 
- */ - @Override - public void close() - throws SQLException - { - try { - super.close(); - } - finally { - this.outputLogger.close(); - } - } + extends SQLDatabaseWriter { + + /** + * Reference to the logger + */ + private final Logger outputLogger; + + /** + * Temporary variable - used for storing the time needed to encode a task + */ + private long processingTimeSQL; + + /** + * Reference to the sql encoder + */ + private TimedSQLEncoder sqlEncoder; + + + /** + * (Constructor) Creates a new TimedSQLDatabaseWriter object. + * + * @param logger Reference to the logger + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public TimedSQLDatabaseWriter(final Logger logger) + throws ConfigurationException, LoggingException { + + super(logger); + this.outputLogger = LoggingFactory + .getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); + } + + /*--------------------------------------------------------------------------*/ + + /** + * Creates the sql encoder. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + @Override + protected void init() + throws ConfigurationException, LoggingException { + + this.sqlEncoder = new TimedSQLEncoder(logger); + super.sqlEncoder = this.sqlEncoder; + } + + /*--------------------------------------------------------------------------*/ + + /** + * This method will process the given DiffTask and send him to the specified + * output. 
+ * + * @param task DiffTask + * @throws ConfigurationException if problems occurred while initializing the components + * @throws IOException if problems occurred while writing the output (to file or + * archive) + * @throws SQLConsumerException if problems occurred while writing the output (to the sql + * producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException { + + long startTime = System.currentTimeMillis(); + + TaskTypes type = task.getTaskType(); + + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { + + this.sqlEncoder.init(); + this.processingTimeSQL = 0; + } + + super.process(task); + + this.processingTimeSQL += System.currentTimeMillis() - startTime; + + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { + + ArticleInformation info = task.getHeader(); + info.setEncodedSize(this.sqlEncoder.getEncodedSize()); + info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); + info.setExitingTime(System.currentTimeMillis()); + info.setProcessingTimeSQL(processingTimeSQL); + + String succesReport = info.toString(); + // System.out.println(succesReport); + this.outputLogger.logMessage(Level.INFO, "\r\n" + succesReport); + } + } + + /** + * This method will close the connection to the output. + * + * @throws SQLException if problems occurred while closing the connection to the + * database. 
+ */ + @Override + public void close() + throws SQLException { + try { + super.close(); + } finally { + this.outputLogger.close(); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLFileWriter.java index 5b61ffd8..67df8b4f 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLFileWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,132 +34,113 @@ /** * This class writes the output to a file while collecting statistical * information. - * */ public class TimedSQLFileWriter - extends SQLFileWriter -{ - - /** Reference to the logger */ - private final Logger outputLogger; - - /** - * Temporary variable - used for storing the time needed to encode a task - */ - private long processingTimeSQL; - - /** Reference to the sql encoder */ - private TimedSQLEncoder sqlEncoder; - - /** - * (Constructor) Creates a new TimedSQLFileWriter object. 
- * - * @param outputName - * Name of the sql consumer - * @param logger - * Reference to a logger - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - public TimedSQLFileWriter(final String outputName, final Logger logger) - throws IOException, ConfigurationException, LoggingException - { - - super(outputName, logger); - this.outputLogger = LoggingFactory - .getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); - } - - - /*--------------------------------------------------------------------------*/ - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException - * if an error occurred while accessing the configuration - * @throws LoggingException - * if an error occurred while accessing the logger - */ - @Override - protected void init() - throws ConfigurationException, LoggingException - { - - this.sqlEncoder = new TimedSQLEncoder(logger); - super.sqlEncoder = this.sqlEncoder; - } - - /** - * This method will process the given DiffTask and send him to the specified - * output. 
- * - * @param task - * DiffTask - * - * @throws ConfigurationException - * if problems occurred while initializing the components - * - * @throws IOException - * if problems occurred while writing the output (to file or - * archive) - * - * @throws SQLConsumerException - * if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException - { - - long startTime = System.currentTimeMillis(); - - TaskTypes type = task.getTaskType(); - - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { - - this.sqlEncoder.init(); - this.processingTimeSQL = 0; - } - - super.process(task); - - this.processingTimeSQL += System.currentTimeMillis() - startTime; - - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { - - ArticleInformation info = task.getHeader(); - info.setEncodedSize(this.sqlEncoder.getEncodedSize()); - info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); - info.setExitingTime(System.currentTimeMillis()); - info.setProcessingTimeSQL(processingTimeSQL); - - String succesReport = info.toString(); - // System.out.println(succesReport); - this.outputLogger.logMessage(Level.INFO, "\r\n" + succesReport); - } - } - - /** - * This method will close the connection to the output. - * - * @throws IOException - * if problems occurred while closing the file or process. - * - */ - @Override - public void close() - throws IOException - { - try { - super.close(); - } - finally { - this.outputLogger.close(); - } - } + extends SQLFileWriter { + + /** + * Reference to the logger + */ + private final Logger outputLogger; + + /** + * Temporary variable - used for storing the time needed to encode a task + */ + private long processingTimeSQL; + + /** + * Reference to the sql encoder + */ + private TimedSQLEncoder sqlEncoder; + + /** + * (Constructor) Creates a new TimedSQLFileWriter object. 
+ * + * @param outputName Name of the sql consumer + * @param logger Reference to a logger + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + public TimedSQLFileWriter(final String outputName, final Logger logger) + throws IOException, ConfigurationException, LoggingException { + + super(outputName, logger); + this.outputLogger = LoggingFactory + .getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); + } + + + /*--------------------------------------------------------------------------*/ + + /** + * Creates the sql encoder. + * + * @throws ConfigurationException if an error occurred while accessing the configuration + * @throws LoggingException if an error occurred while accessing the logger + */ + @Override + protected void init() + throws ConfigurationException, LoggingException { + + this.sqlEncoder = new TimedSQLEncoder(logger); + super.sqlEncoder = this.sqlEncoder; + } + + /** + * This method will process the given DiffTask and send him to the specified + * output. 
+ * + * @param task DiffTask + * @throws ConfigurationException if problems occurred while initializing the components + * @throws IOException if problems occurred while writing the output (to file or + * archive) + * @throws SQLConsumerException if problems occurred while writing the output (to the sql + * producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException { + + long startTime = System.currentTimeMillis(); + + TaskTypes type = task.getTaskType(); + + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { + + this.sqlEncoder.init(); + this.processingTimeSQL = 0; + } + + super.process(task); + + this.processingTimeSQL += System.currentTimeMillis() - startTime; + + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { + + ArticleInformation info = task.getHeader(); + info.setEncodedSize(this.sqlEncoder.getEncodedSize()); + info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); + info.setExitingTime(System.currentTimeMillis()); + info.setProcessingTimeSQL(processingTimeSQL); + + String succesReport = info.toString(); + // System.out.println(succesReport); + this.outputLogger.logMessage(Level.INFO, "\r\n" + succesReport); + } + } + + /** + * This method will close the connection to the output. + * + * @throws IOException if problems occurred while closing the file or process. 
+ */ + @Override + public void close() + throws IOException { + try { + super.close(); + } finally { + this.outputLogger.close(); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/OutputType.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/OutputType.java index 9dbea61e..1783d40f 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/OutputType.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/OutputType.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,56 +19,56 @@ /** * This Enumerator list the possible output values. 
- * - * - * */ -public enum OutputType -{ +public enum OutputType { - /** The output will consist of a single or multiple sql files */ - UNCOMPRESSED, + /** + * The output will consist of a single or multiple sql files + */ + UNCOMPRESSED, - /** The output will consist of a single or multiple 7z archives */ - SEVENZIP, + /** + * The output will consist of a single or multiple 7z archives + */ + SEVENZIP, - /** The output will consist of a single or multiple bzip2 archives */ - BZIP2, + /** + * The output will consist of a single or multiple bzip2 archives + */ + BZIP2, - /** The output will consist of a single or multiple alternate archives */ - ALTERNATE, + /** + * The output will consist of a single or multiple alternate archives + */ + ALTERNATE, - /** The output will be directly written into a database */ - DATABASE; + /** + * The output will be directly written into a database + */ + DATABASE; - /** - * Parses the given string. - * - * @param s - * string - * @return OutputTypes - */ - public static OutputType parse(final String s) - { + /** + * Parses the given string. 
+ * + * @param s string + * @return OutputTypes + */ + public static OutputType parse(final String s) { - String t = s.toUpperCase(); + String t = s.toUpperCase(); - if (t.equals("UNCOMPRESSED")) { - return OutputType.UNCOMPRESSED; - } - else if (t.equals("SEVENZIP")) { - return OutputType.SEVENZIP; - } - else if (t.equals("BZIP2")) { - return OutputType.BZIP2; - } - else if (t.equals("DATABASE")) { - return OutputType.DATABASE; - } - else if (t.equals("ALTERNATE")) { - return OutputType.ALTERNATE; - } + if (t.equals("UNCOMPRESSED")) { + return OutputType.UNCOMPRESSED; + } else if (t.equals("SEVENZIP")) { + return OutputType.SEVENZIP; + } else if (t.equals("BZIP2")) { + return OutputType.BZIP2; + } else if (t.equals("DATABASE")) { + return OutputType.DATABASE; + } else if (t.equals("ALTERNATE")) { + return OutputType.ALTERNATE; + } - throw new IllegalArgumentException("Unknown OutputType : " + s); - } + throw new IllegalArgumentException("Unknown OutputType : " + s); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/SurrogateModes.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/SurrogateModes.java index 7e3d2344..350ffccc 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/SurrogateModes.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/SurrogateModes.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,61 +23,57 @@ * TODO: The surrogate mode implementations need a work over. * TODO Add documentation for surrogates */ -public enum SurrogateModes -{ +public enum SurrogateModes { - /** - * Replace the surrogate - * TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! - */ - REPLACE, + /** + * Replace the surrogate + * TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! + */ + REPLACE, - /** - * Throw an error if a surrogate is detected - * TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! - */ - THROW_ERROR, + /** + * Throw an error if a surrogate is detected + * TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! + */ + THROW_ERROR, - /** - * Discard the rest of the article after a surrogate is detected - * TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! - */ - DISCARD_REST, + /** + * Discard the rest of the article after a surrogate is detected + * TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! + */ + DISCARD_REST, - /** Discard revisions which contain surrogates (java default setting) */ - DISCARD_REVISION; + /** + * Discard revisions which contain surrogates (java default setting) + */ + DISCARD_REVISION; - /** - * Parses the given string. - * - * @param s - * string - * @return SurrogateModes - */ - public static SurrogateModes parse(final String s) - { + /** + * Parses the given string. + * + * @param s string + * @return SurrogateModes + */ + public static SurrogateModes parse(final String s) { - String t = s.toUpperCase(); + String t = s.toUpperCase(); - if (t.equals("REPLACE")) { - // return REPLACE; - throw new UnsupportedOperationException( - "This mode is currently not supported. 
Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); - } - else if (t.equals("THROW_ERROR")) { - // return THROW_ERROR; - throw new UnsupportedOperationException( - "This mode is currently not supported. Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); - } - else if (t.equals("DISCARD_REST")) { - // return DISCARD_REST; - throw new UnsupportedOperationException( - "This mode is currently not supported. Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); - } - else if (t.equals("DISCARD_REVISION")) { - return DISCARD_REVISION; - } + if (t.equals("REPLACE")) { + // return REPLACE; + throw new UnsupportedOperationException( + "This mode is currently not supported. Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); + } else if (t.equals("THROW_ERROR")) { + // return THROW_ERROR; + throw new UnsupportedOperationException( + "This mode is currently not supported. Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); + } else if (t.equals("DISCARD_REST")) { + // return DISCARD_REST; + throw new UnsupportedOperationException( + "This mode is currently not supported. Please check the implementation first. 
For now, you can use the default mode DISCARD_REVISION"); + } else if (t.equals("DISCARD_REVISION")) { + return DISCARD_REVISION; + } - throw new IllegalArgumentException("Unknown SurrogateModes : " + s); - } + throw new IllegalArgumentException("Unknown SurrogateModes : " + s); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveDescription.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveDescription.java index c2071675..39408902 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveDescription.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveDescription.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,86 +19,79 @@ /** * This class represents a description of an input file. - * + *

* TODO: The start position is currently unused. - * - * - * */ -public class ArchiveDescription -{ +public class ArchiveDescription { - /** Path to the archive */ - private final String path; + /** + * Path to the archive + */ + private final String path; - /** Start position */ - private long startPosition; + /** + * Start position + */ + private long startPosition; - /** InputType */ - private final InputType type; + /** + * InputType + */ + private final InputType type; - /** - * (Constructor) Creates a new ArchiveDescription - * - * @param type - * InputType - * @param path - * Path - */ - public ArchiveDescription(final InputType type, final String path) - { - this.type = type; - this.path = path; - } + /** + * (Constructor) Creates a new ArchiveDescription + * + * @param type InputType + * @param path Path + */ + public ArchiveDescription(final InputType type, final String path) { + this.type = type; + this.path = path; + } - /** - * Returns the path. - * - * @return path - */ - public String getPath() - { - return this.path; - } + /** + * Returns the path. + * + * @return path + */ + public String getPath() { + return this.path; + } - /** - * Returns the start position. - * - * @return start position - */ - public long getStartPosition() - { - return startPosition; - } + /** + * Returns the start position. + * + * @return start position + */ + public long getStartPosition() { + return startPosition; + } - /** - * Returns the InputType. - * - * @return InputType - */ - public InputType getType() - { - return this.type; - } + /** + * Returns the InputType. + * + * @return InputType + */ + public InputType getType() { + return this.type; + } - /** - * Sets the start position. - * - * @param startPosition - * start position - */ - public void setStartPosition(final long startPosition) - { - this.startPosition = startPosition; - } + /** + * Sets the start position. 
+ * + * @param startPosition start position + */ + public void setStartPosition(final long startPosition) { + this.startPosition = startPosition; + } - /** - * Returns the string representation of this object. - * - * @return [InputType, path] - */ - public String toString() - { - return "[" + this.getType() + ", " + this.getPath() + "]"; - } + /** + * Returns the string representation of this object. + * + * @return [InputType, path] + */ + public String toString() { + return "[" + this.getType() + ", " + this.getPath() + "]"; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveManager.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveManager.java index b26fa15e..d4d2516a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveManager.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveManager.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -27,69 +27,64 @@ * * @version 0.5.0 */ -public class ArchiveManager -{ +public class ArchiveManager { - /** List of available archives */ - private final List archives; + /** + * List of available archives + */ + private final List archives; - /** - * (Constructor) Creates the ArchiveManager. - * - * @throws ConfigurationException - * if an error occurs while accessing the configuration - */ - public ArchiveManager() - throws ConfigurationException - { + /** + * (Constructor) Creates the ArchiveManager. + * + * @throws ConfigurationException if an error occurs while accessing the configuration + */ + public ArchiveManager() + throws ConfigurationException { - ConfigurationManager config = ConfigurationManager.getInstance(); - this.archives = config.getArchiveList(); - } + ConfigurationManager config = ConfigurationManager.getInstance(); + this.archives = config.getArchiveList(); + } - /** - * Returns whether an archive is available or not. - * - * @return TRUE | FALSE - */ - public boolean hasArchive() - { - return !this.archives.isEmpty(); - } + /** + * Returns whether an archive is available or not. + * + * @return TRUE | FALSE + */ + public boolean hasArchive() { + return !this.archives.isEmpty(); + } - /** - * Returns an archive. - * - * @return ArchiveDescription or NULL if no archive is available - */ - public synchronized ArchiveDescription getArchive() - { + /** + * Returns an archive. + * + * @return ArchiveDescription or NULL if no archive is available + */ + public synchronized ArchiveDescription getArchive() { - if (!this.archives.isEmpty()) { + if (!this.archives.isEmpty()) { - return this.archives.remove(0); - } + return this.archives.remove(0); + } - return null; - } + return null; + } - /** - * Returns the number of remaining archives. - * - * @return number of available archives - */ - public int size() - { - return this.archives.size(); - } + /** + * Returns the number of remaining archives. 
+ * + * @return number of available archives + */ + public int size() { + return this.archives.size(); + } - /** - * Returns the string representation of the ArchiveManager's content. - * - * @return [ number of archives ] - */ - public String toString() - { - return "ArchiveManager:\t[" + this.size() + "]"; - } + /** + * Returns the string representation of the ArchiveManager's content. + * + * @return [ number of archives ] + */ + public String toString() { + return "ArchiveManager:\t[" + this.size() + "]"; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/InputType.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/InputType.java index 369f5df4..caebbb1d 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/InputType.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/InputType.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,48 +19,44 @@ /** * This class represents an enumeration of the input type. 
- * - * - * */ -public enum InputType -{ - - /** Uncompressed XML Input */ - XML, - - /** SevenZip Compressed XML Input */ - SEVENZIP, - - /** BZip2 Compressed XML Input */ - BZIP2; - - /** - * Parses the string representation to the related InputType. - * - * @param s - * String representation of the InputType. - * @return InputType Enumerator - * - * @throws IllegalArgumentException - * if the parsed String does not match with one of the - * enumerators - */ - public static InputType parse(final String s) - { - - String t = s.toUpperCase(); - - if (t.equals("XML")) { - return XML; - } - else if (t.equals("SEVENZIP")) { - return SEVENZIP; - } - else if (t.equals("BZIP2")) { - return BZIP2; - } - - throw new IllegalArgumentException("Unknown InputType : " + s); - } +public enum InputType { + + /** + * Uncompressed XML Input + */ + XML, + + /** + * SevenZip Compressed XML Input + */ + SEVENZIP, + + /** + * BZip2 Compressed XML Input + */ + BZIP2; + + /** + * Parses the string representation to the related InputType. + * + * @param s String representation of the InputType. 
+ * @return InputType Enumerator + * @throws IllegalArgumentException if the parsed String does not match with one of the + * enumerators + */ + public static InputType parse(final String s) { + + String t = s.toUpperCase(); + + if (t.equals("XML")) { + return XML; + } else if (t.equals("SEVENZIP")) { + return SEVENZIP; + } else if (t.equals("BZIP2")) { + return BZIP2; + } + + throw new IllegalArgumentException("Unknown InputType : " + s); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitReader.java index 5f47b0a8..dbbc18e4 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitReader.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,134 +23,125 @@ /** * The BitReader buffers a byte-array. 
- * - * - * */ -public class BitReader -{ - - /** Current index in the byte array */ - private int inputIndex; - - /** Byte input array */ - private final byte[] input; - - /** Buffer used to store a single byte */ - private int buffer; - - /** Length of the bits in the buffer that have not been read yet */ - private int bufferLength; - - /** - * Constructor of the BitReader - * - * @param input - * byte input array - */ - public BitReader(final byte[] input) - { - this.input = input; - - this.buffer = 0; - this.bufferLength = -1; - this.inputIndex = 0; - } - - /** - * Reads the next bit from the input. - * - * @return 0 or 1 - * - * @throws DecodingException - * if the decoding failed - */ - public int readBit() - throws DecodingException - { - - if (bufferLength == -1) { - buffer = readByte(); - if (buffer == -1) { - return -1; - } - - bufferLength = 7; - } - - return (buffer >> bufferLength--) & 1; - } - - /** - * Reads the next length-bits from the input. - *

- * The maximum value of bits that could be read is 31. (Maximum value of a - * positive number that could be stored in an integer without any - * conversion.) - * - * @param length - * number of bits to read - * @return content as integer value or -1 if the end of the stream has been - * reached - * - * @throws DecodingException - * if the decoding failed - */ - public int read(final int length) - throws DecodingException - { - - if (length > 31) { - throw ErrorFactory.createDecodingException( - ErrorKeys.DIFFTOOL_DECODING_VALUE_OUT_OF_RANGE, - "more than maximum length: " + length); - } - - int v, b = 0; - for (int i = length - 1; i >= 0; i--) { - v = readBit(); - if (v == -1) { - if (i != length - 1) { - throw ErrorFactory - .createDecodingException(ErrorKeys.DIFFTOOL_DECODING_UNEXPECTED_END_OF_STREAM); - } - - return -1; - } - b |= v << i; - } - - return b; - } - - /** - * Resets the buffer. - */ - public void skip() - { - this.buffer = 0; - this.bufferLength = -1; - } - - /** - * Reads the next character in the input Note: The current content of the - * buffer will be deleted. This method should only be used for reading the - * textual content of the diff-part. 
- * - * @return the next character in the string - * - * @throws DecodingException - * if the decoding failed - */ - public int readByte() - throws DecodingException - { - - skip(); - if (input == null || inputIndex >= input.length) { - return -1; - } - - return 0xFF & input[inputIndex++]; - } +public class BitReader { + + /** + * Current index in the byte array + */ + private int inputIndex; + + /** + * Byte input array + */ + private final byte[] input; + + /** + * Buffer used to store a single byte + */ + private int buffer; + + /** + * Length of the bits in the buffer that have not been read yet + */ + private int bufferLength; + + /** + * Constructor of the BitReader + * + * @param input byte input array + */ + public BitReader(final byte[] input) { + this.input = input; + + this.buffer = 0; + this.bufferLength = -1; + this.inputIndex = 0; + } + + /** + * Reads the next bit from the input. + * + * @return 0 or 1 + * @throws DecodingException if the decoding failed + */ + public int readBit() + throws DecodingException { + + if (bufferLength == -1) { + buffer = readByte(); + if (buffer == -1) { + return -1; + } + + bufferLength = 7; + } + + return (buffer >> bufferLength--) & 1; + } + + /** + * Reads the next length-bits from the input. + *

+ * The maximum value of bits that could be read is 31. (Maximum value of a + * positive number that could be stored in an integer without any + * conversion.) + * + * @param length number of bits to read + * @return content as integer value or -1 if the end of the stream has been + * reached + * @throws DecodingException if the decoding failed + */ + public int read(final int length) + throws DecodingException { + + if (length > 31) { + throw ErrorFactory.createDecodingException( + ErrorKeys.DIFFTOOL_DECODING_VALUE_OUT_OF_RANGE, + "more than maximum length: " + length); + } + + int v, b = 0; + for (int i = length - 1; i >= 0; i--) { + v = readBit(); + if (v == -1) { + if (i != length - 1) { + throw ErrorFactory + .createDecodingException(ErrorKeys.DIFFTOOL_DECODING_UNEXPECTED_END_OF_STREAM); + } + + return -1; + } + b |= v << i; + } + + return b; + } + + /** + * Resets the buffer. + */ + public void skip() { + this.buffer = 0; + this.bufferLength = -1; + } + + /** + * Reads the next character in the input Note: The current content of the + * buffer will be deleted. This method should only be used for reading the + * textual content of the diff-part. 
+ * + * @return the next character in the string + * @throws DecodingException if the decoding failed + */ + public int readByte() + throws DecodingException { + + skip(); + if (input == null || inputIndex >= input.length) { + return -1; + } + + return 0xFF & input[inputIndex++]; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitWriter.java index 9b79f45f..896987e1 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,165 +26,145 @@ /** * The BitWriter buffers bit that will be written byte-by-byte to an output * stream. - * - * - * */ -public class BitWriter -{ - - /** Output buffer */ - private final ByteArrayOutputStream stream; - - /** Buffer to store the bits */ - private int buffer; - - /** Number of stored bits */ - private byte bufferLength = 0; - - /** - * Constructor Creates a BitWriter with a byte buffer of the given length. 
- * - * @param length - * Length of the byte buffer - */ - public BitWriter(final int length) - { - this.stream = new ByteArrayOutputStream(length); - } - - /** - * Constructor Creates a BitWriter with a standard buffer. - */ - public BitWriter() - { - this.stream = new ByteArrayOutputStream(); - } - - /** - * Writes a byte to the buffer. - * - * @param val - * an integer representing a full byte - * - * @throws EncodingException - * if the value is out range - */ - private void write(final int val) - throws EncodingException - { - - if (val < 0 || val > 255) { - throw ErrorFactory.createEncodingException( - ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, - "byte value out of range: " + val); - } - - this.stream.write(val); - } - - /** - * Writes a single bit to the buffer. - * - * @param bit - * 0 or 1 - * @throws EncodingException - * if the input is neither 0 nor 1. - */ - public void writeBit(final int bit) - throws EncodingException - { - - if (bit != 0 && bit != 1) { - throw ErrorFactory.createEncodingException( - ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, - "bit value out of range: " + bit); - } - - this.buffer |= bit << (7 - this.bufferLength); - this.bufferLength++; - - if (bufferLength == 8) { - - write(buffer); - - this.bufferLength = 0; - this.buffer = 0; - } - } - - /** - * Writes a positive integer to the buffer. - * - * @param length - * the number of bits to write - * @param value - * an integer value - * - * @throws EncodingException - * if the length of the input is more than 31 bits. - */ - public void writeValue(final int length, final int value) - throws EncodingException - { - if (length > 31) { - throw ErrorFactory.createEncodingException( - ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, - "more than maximum length: " + value); - } - - for (int i = length - 1; i >= 0; i--) { - writeBit((value >> i) & 1); - } - } - - /** - * Writes the byte array to the buffer. 
The currently used buffer will be - * filled with zero bits before is is written in front of the byte-array. - * - * @param bText - * byte array - * - * @throws EncodingException - * if the writing fails - */ - public void write(final byte[] bText) - throws EncodingException - { - - writeFillBits(); - - int l = bText.length; - for (int i = 0; i < l; i++) { - write(0xFF & bText[i]); - } - } - - /** - * The currently used buffer will be filled with zero bits before is is - * written in the buffer. - * - * @throws EncodingException - * if the writing fails - */ - public void writeFillBits() - throws EncodingException - { - - while (this.bufferLength != 0) { - writeBit(0); - } - - this.buffer = 0; - } - - /** - * Returns the content of the buffer as byte-array. - * - * @return byte-array - */ - public byte[] toByteArray() - { - return this.stream.toByteArray(); - } +public class BitWriter { + + /** + * Output buffer + */ + private final ByteArrayOutputStream stream; + + /** + * Buffer to store the bits + */ + private int buffer; + + /** + * Number of stored bits + */ + private byte bufferLength = 0; + + /** + * Constructor Creates a BitWriter with a byte buffer of the given length. + * + * @param length Length of the byte buffer + */ + public BitWriter(final int length) { + this.stream = new ByteArrayOutputStream(length); + } + + /** + * Constructor Creates a BitWriter with a standard buffer. + */ + public BitWriter() { + this.stream = new ByteArrayOutputStream(); + } + + /** + * Writes a byte to the buffer. + * + * @param val an integer representing a full byte + * @throws EncodingException if the value is out range + */ + private void write(final int val) + throws EncodingException { + + if (val < 0 || val > 255) { + throw ErrorFactory.createEncodingException( + ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, + "byte value out of range: " + val); + } + + this.stream.write(val); + } + + /** + * Writes a single bit to the buffer. 
+ * + * @param bit 0 or 1 + * @throws EncodingException if the input is neither 0 nor 1. + */ + public void writeBit(final int bit) + throws EncodingException { + + if (bit != 0 && bit != 1) { + throw ErrorFactory.createEncodingException( + ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, + "bit value out of range: " + bit); + } + + this.buffer |= bit << (7 - this.bufferLength); + this.bufferLength++; + + if (bufferLength == 8) { + + write(buffer); + + this.bufferLength = 0; + this.buffer = 0; + } + } + + /** + * Writes a positive integer to the buffer. + * + * @param length the number of bits to write + * @param value an integer value + * @throws EncodingException if the length of the input is more than 31 bits. + */ + public void writeValue(final int length, final int value) + throws EncodingException { + if (length > 31) { + throw ErrorFactory.createEncodingException( + ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, + "more than maximum length: " + value); + } + + for (int i = length - 1; i >= 0; i--) { + writeBit((value >> i) & 1); + } + } + + /** + * Writes the byte array to the buffer. The currently used buffer will be + * filled with zero bits before is is written in front of the byte-array. + * + * @param bText byte array + * @throws EncodingException if the writing fails + */ + public void write(final byte[] bText) + throws EncodingException { + + writeFillBits(); + + int l = bText.length; + for (int i = 0; i < l; i++) { + write(0xFF & bText[i]); + } + } + + /** + * The currently used buffer will be filled with zero bits before is is + * written in the buffer. + * + * @throws EncodingException if the writing fails + */ + public void writeFillBits() + throws EncodingException { + + while (this.bufferLength != 0) { + writeBit(0); + } + + this.buffer = 0; + } + + /** + * Returns the content of the buffer as byte-array. 
+ * + * @return byte-array + */ + public byte[] toByteArray() { + return this.stream.toByteArray(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionCodecData.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionCodecData.java index 8fa68378..873f8fd7 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionCodecData.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionCodecData.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -24,235 +24,233 @@ * Block C 3bit operation value Block S start position Block E length (end * position = start position + length) Block B block id Block L length of the t * block Block T block containing L bytes data - * - * - * */ -public class RevisionCodecData -{ - - /** Maximum size of a S block */ - private int blocksize_S; - - /** Maximum size of a E block */ - private int blocksize_E; - - /** Maximum size of an B block */ - private int blocksize_B; - - /** Maximum size of an L block */ - private int blocksize_L; - - /** Number of C blocks */ - private int countC; - - /** Number of S blocks */ - private int countS; - - /** Number of E blocks */ - private int countE; - - /** Number of B blocks */ - private int countB; - - /** Number of L blocks */ - private int countL; - - /** Number of T blocks */ - private int countT; - - /** Whether the information has already been converted or not */ - private boolean converted; - - /** - * Constructor Creates a new RevisionCodecData object. - */ - public RevisionCodecData() - { - this.converted = false; - this.blocksize_S = 0; - this.blocksize_E = 0; - this.blocksize_B = 0; - this.blocksize_L = 0; - } - - /** - * Gathers the information about an s block. - * - * @param value - * start position - */ - public void checkBlocksizeS(final int value) - { - if (value > blocksize_S) { - this.blocksize_S = value; - } - this.countS++; - this.countC++; - } - - /** - * Gathers the information about an e block. - * - * @param value - * length of the diff-block - */ - public void checkBlocksizeE(final int value) - { - if (value > blocksize_E) { - this.blocksize_E = value; - } - this.countE++; - } - - /** - * Gathers the information about an b block. - * - * @param value - * block id - */ - public void checkBlocksizeB(final int value) - { - if (value > blocksize_B) { - this.blocksize_B = value; - } - this.countB++; - } - - /** - * Gathers the information about an l block. 
- * - * @param value - * length of the text block - */ - public void checkBlocksizeL(final int value) - { - if (value > blocksize_L) { - this.blocksize_L = value; - } - this.countL++; - this.countT += value; - } - - /** - * Converts the input information into their log2 values. If an operation is - * contained in the diff, the minimum number of bits used to encode this - * block is 1 byte. - * - * @return number of bytes needed to encode the associated diff - */ - public int totalSizeInBits() - { - - if (converted) { - - return 24 + this.countC * 3 + this.countS * blocksize_S - + this.countE * blocksize_E + this.countB * blocksize_B - + this.countL * blocksize_L + this.countT * 8; - } - - converted = true; - // System.out.println(this.toString()); - - if (this.blocksize_B > 0) { - this.blocksize_B = (int) Math.ceil(Math.log(blocksize_B + 1) - / Math.log(2.)); - } - else if (this.countB > 0) { - this.blocksize_B = 1; - } - - if (this.blocksize_E > 0) { - this.blocksize_E = (int) Math.ceil(Math.log(blocksize_E + 1) - / Math.log(2.)); - } - else if (this.countE > 0) { - this.blocksize_E = 1; - } - - if (this.blocksize_L > 0) { - this.blocksize_L = (int) Math.ceil(Math.log(blocksize_L + 1) - / Math.log(2.)); - } - else if (this.countL > 0) { - this.blocksize_L = 1; - } - - if (this.blocksize_S > 0) { - this.blocksize_S = (int) Math.ceil(Math.log(blocksize_S + 1) - / Math.log(2.)); - } - else if (this.countS > 0) { - this.blocksize_S = 1; - } - - return 24 + this.countC * 3 + this.countS * blocksize_S + this.countE - * blocksize_E + this.countB * blocksize_B + this.countL - * blocksize_L + this.countT * 8; - } - - /** - * Returns the number of bits used to encode a B block. This method is - * intended to used after the conversion. - * - * @return block bit-length - */ - public int getBlocksizeB() - { - return this.blocksize_B; - } - - /** - * Returns the number of bits used to encode a E block. This method is - * intended to used after the conversion. 
- * - * @return block bit-length - */ - public int getBlocksizeE() - { - return this.blocksize_E; - } - - /** - * Returns the number of bits used to encode a L block. This method is - * intended to used after the conversion. - * - * @return block bit-length - */ - public int getBlocksizeL() - { - return this.blocksize_L; - } - - /** - * Returns the number of bits used to encode a S block. This method is - * intended to used after the conversion. - * - * @return block bit-length - */ - public int getBlocksizeS() - { - return this.blocksize_S; - } - - /** - * String representation of the revision codec data. - * - * @return string representation - */ - public String toString() - { - return this.blocksize_S + " " + this.blocksize_E + " " - + this.blocksize_B + " " + this.blocksize_L; - } - - /** - * Whether the information has already converted to the log2 basis or not. - * - * @return conversion information - */ - public boolean isConverted() - { - return this.converted; - } +public class RevisionCodecData { + + /** + * Maximum size of a S block + */ + private int blocksize_S; + + /** + * Maximum size of a E block + */ + private int blocksize_E; + + /** + * Maximum size of an B block + */ + private int blocksize_B; + + /** + * Maximum size of an L block + */ + private int blocksize_L; + + /** + * Number of C blocks + */ + private int countC; + + /** + * Number of S blocks + */ + private int countS; + + /** + * Number of E blocks + */ + private int countE; + + /** + * Number of B blocks + */ + private int countB; + + /** + * Number of L blocks + */ + private int countL; + + /** + * Number of T blocks + */ + private int countT; + + /** + * Whether the information has already been converted or not + */ + private boolean converted; + + /** + * Constructor Creates a new RevisionCodecData object. 
+ */ + public RevisionCodecData() { + this.converted = false; + this.blocksize_S = 0; + this.blocksize_E = 0; + this.blocksize_B = 0; + this.blocksize_L = 0; + } + + /** + * Gathers the information about an s block. + * + * @param value start position + */ + public void checkBlocksizeS(final int value) { + if (value > blocksize_S) { + this.blocksize_S = value; + } + this.countS++; + this.countC++; + } + + /** + * Gathers the information about an e block. + * + * @param value length of the diff-block + */ + public void checkBlocksizeE(final int value) { + if (value > blocksize_E) { + this.blocksize_E = value; + } + this.countE++; + } + + /** + * Gathers the information about an b block. + * + * @param value block id + */ + public void checkBlocksizeB(final int value) { + if (value > blocksize_B) { + this.blocksize_B = value; + } + this.countB++; + } + + /** + * Gathers the information about an l block. + * + * @param value length of the text block + */ + public void checkBlocksizeL(final int value) { + if (value > blocksize_L) { + this.blocksize_L = value; + } + this.countL++; + this.countT += value; + } + + /** + * Converts the input information into their log2 values. If an operation is + * contained in the diff, the minimum number of bits used to encode this + * block is 1 byte. 
+ * + * @return number of bytes needed to encode the associated diff + */ + public int totalSizeInBits() { + + if (converted) { + + return 24 + this.countC * 3 + this.countS * blocksize_S + + this.countE * blocksize_E + this.countB * blocksize_B + + this.countL * blocksize_L + this.countT * 8; + } + + converted = true; + // System.out.println(this.toString()); + + if (this.blocksize_B > 0) { + this.blocksize_B = (int) Math.ceil(Math.log(blocksize_B + 1) + / Math.log(2.)); + } else if (this.countB > 0) { + this.blocksize_B = 1; + } + + if (this.blocksize_E > 0) { + this.blocksize_E = (int) Math.ceil(Math.log(blocksize_E + 1) + / Math.log(2.)); + } else if (this.countE > 0) { + this.blocksize_E = 1; + } + + if (this.blocksize_L > 0) { + this.blocksize_L = (int) Math.ceil(Math.log(blocksize_L + 1) + / Math.log(2.)); + } else if (this.countL > 0) { + this.blocksize_L = 1; + } + + if (this.blocksize_S > 0) { + this.blocksize_S = (int) Math.ceil(Math.log(blocksize_S + 1) + / Math.log(2.)); + } else if (this.countS > 0) { + this.blocksize_S = 1; + } + + return 24 + this.countC * 3 + this.countS * blocksize_S + this.countE + * blocksize_E + this.countB * blocksize_B + this.countL + * blocksize_L + this.countT * 8; + } + + /** + * Returns the number of bits used to encode a B block. This method is + * intended to used after the conversion. + * + * @return block bit-length + */ + public int getBlocksizeB() { + return this.blocksize_B; + } + + /** + * Returns the number of bits used to encode a E block. This method is + * intended to used after the conversion. + * + * @return block bit-length + */ + public int getBlocksizeE() { + return this.blocksize_E; + } + + /** + * Returns the number of bits used to encode a L block. This method is + * intended to used after the conversion. + * + * @return block bit-length + */ + public int getBlocksizeL() { + return this.blocksize_L; + } + + /** + * Returns the number of bits used to encode a S block. 
This method is + * intended to used after the conversion. + * + * @return block bit-length + */ + public int getBlocksizeS() { + return this.blocksize_S; + } + + /** + * String representation of the revision codec data. + * + * @return string representation + */ + public String toString() { + return this.blocksize_S + " " + this.blocksize_E + " " + + this.blocksize_B + " " + this.blocksize_L; + } + + /** + * Whether the information has already converted to the log2 basis or not. + * + * @return conversion information + */ + public boolean isConverted() { + return this.converted; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionDecoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionDecoder.java index 0b1031b3..8b6fd9f5 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionDecoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionDecoder.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -36,560 +36,486 @@ /** * The RevisionDecoder class contains methods to decode an encoded diff * information. 
- * - * - * */ -public class RevisionDecoder -{ - - /** Reference to the BitReader */ - private BitReader r; - - /** Configuration Parameter - Wikipedia Encoding */ - private final String WIKIPEDIA_ENCODING; - - /** - * (Constructor) Creates a new RevisionDecoder object. - * - * @throws ConfigurationException - * if an error occurs while accessing the configuration - * parameters - */ - private RevisionDecoder() - throws ConfigurationException - { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - } - - /** - * (Constructor) Creates a new RevisionDecoder object. - * - * @param wikipediaEncoding - * Character encoding - */ - public RevisionDecoder(final String wikipediaEncoding) - { - - WIKIPEDIA_ENCODING = wikipediaEncoding; - } - - /** - * (Constructor) Creates a new RevisionDecoder object. - * - * @param input - * binary encoded diff - * - * @throws ConfigurationException - * if an error occurs while accessing the configuration - * parameters - */ - public RevisionDecoder(final byte[] input) - throws ConfigurationException - { - - this(); - if (input[0] == -128) { - r = new BitReader(inflateInput(input, 1)); - } - else { - r = new BitReader(input); - } - } - - /** - * Decodes the information and returns the Diff. 
- * - * @return Diff - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws DecodingException - * if the decoding failed - */ - public Diff decode() - throws UnsupportedEncodingException, DecodingException - { - - int header = r.read(3); - if (DiffAction.parse(header) != DiffAction.DECODER_DATA) { - - throw new DecodingException("Invalid codecData code: " + header); - } - - int blockSize_C = 3; - int blockSize_S = r.read(5); - int blockSize_E = r.read(5); - int blockSize_B = r.read(5); - int blockSize_L = r.read(5); - r.read(1); - - if (blockSize_S < 0 || blockSize_S > 31) { - throw new DecodingException("blockSize_S out of range: " - + blockSize_S); - } - if (blockSize_E < 0 || blockSize_E > 31) { - throw new DecodingException("blockSize_E out of range: " - + blockSize_E); - } - if (blockSize_B < 0 || blockSize_B > 31) { - throw new DecodingException("blockSize_B out of range: " - + blockSize_B); - } - if (blockSize_L < 0 || blockSize_L > 31) { - throw new DecodingException("blockSize_L out of range: " - + blockSize_L); - } - - return decode(blockSize_C, blockSize_S, blockSize_E, blockSize_B, - blockSize_L); - } - - /** - * Decodes the information, after the codec was successfully decoded, and - * returns the Diff. 
- * - * @param blockSize_C - * length of a C block - * @param blockSize_S - * length of a S block - * @param blockSize_E - * length of a E block - * @param blockSize_B - * length of a B block - * @param blockSize_L - * length of a L block - * @return Diff - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws DecodingException - * if the decoding failed - */ - private Diff decode(final int blockSize_C, final int blockSize_S, - final int blockSize_E, final int blockSize_B, final int blockSize_L) - throws UnsupportedEncodingException, DecodingException - { - - int code = r.read(blockSize_C); - Diff diff = new Diff(); - - while (code != -1) { - // System.out.print(code + "\t"); - - switch (DiffAction.parse(code)) { - case FULL_REVISION_UNCOMPRESSED: - diff.add(decodeFullRevision(blockSize_L)); - break; - case INSERT: - diff.add(decodeAdd(blockSize_S, blockSize_L)); - break; - case DELETE: - diff.add(decodeDelete(blockSize_S, blockSize_E)); - break; - case REPLACE: - diff.add(decodeReplace(blockSize_S, blockSize_E, blockSize_L)); - break; - case CUT: - diff.add(decodeCut(blockSize_S, blockSize_E, blockSize_B)); - break; - case PASTE: - diff.add(decodePaste(blockSize_S, blockSize_B, r)); - break; - default: - throw new DecodingException("Invalid block_c code: " + code); - } - - // System.out.println(); - code = r.read(blockSize_C); - } - - return diff; - } - - /** - * Decodes an Add operation. 
- * - * @param blockSize_S - * length of a S block - * @param blockSize_L - * length of a L block - * @return DiffPart, Add operation - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws DecodingException - * if the decoding failed - */ - private DiffPart decodeAdd(final int blockSize_S, final int blockSize_L) - throws UnsupportedEncodingException, DecodingException - { - - if (blockSize_S < 1 || blockSize_L < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + " or blockSize_L: " + blockSize_L); - } - - int s = r.read(blockSize_S); - int l = r.read(blockSize_L); - - ByteArrayOutputStream output = new ByteArrayOutputStream(); - for (int i = 0; i < l; i++) { - output.write(r.readByte()); - } - - DiffPart part = new DiffPart(DiffAction.INSERT); - part.setStart(s); - part.setText(output.toString(WIKIPEDIA_ENCODING)); - - return part; - } - - /** - * Decodes a Cut operation. - * - * @param blockSize_S - * length of a S block - * @param blockSize_E - * length of a E block - * @param blockSize_B - * length of a B block - * @return DiffPart, Cut operation - * - * @throws DecodingException - * if the decoding failed - */ - private DiffPart decodeCut(final int blockSize_S, final int blockSize_E, - final int blockSize_B) - throws DecodingException - { - - if (blockSize_S < 1 || blockSize_E < 1 || blockSize_B < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + ", blockSize_E: " + blockSize_E - + " or blockSize_B: " + blockSize_B); - } - - int s = r.read(blockSize_S); - int e = r.read(blockSize_E); - int b = r.read(blockSize_B); - - DiffPart part = new DiffPart(DiffAction.CUT); - part.setStart(s); - part.setLength(e); - part.setText(Integer.toString(b)); - - r.skip(); - - return part; - } - - /** - * Decodes a Delete operation. 
- * - * @param blockSize_S - * length of a S block - * @param blockSize_E - * length of a E block - * @return DiffPart, Delete operation - * - * @throws DecodingException - * if the decoding failed - */ - private DiffPart decodeDelete(final int blockSize_S, final int blockSize_E) - throws DecodingException - { - - if (blockSize_S < 1 || blockSize_E < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + " or blockSize_E: " + blockSize_E); - } - - int s = r.read(blockSize_S); - int e = r.read(blockSize_E); - - DiffPart part = new DiffPart(DiffAction.DELETE); - part.setStart(s); - part.setLength(e); - - r.skip(); - - return part; - } - - /** - * Decodes a FullRevision operation. - * - * @param blockSize_L - * length of a L block - * @return DiffPart, FullRevision - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws DecodingException - * if the decoding failed - */ - private DiffPart decodeFullRevision(final int blockSize_L) - throws UnsupportedEncodingException, DecodingException - { - - if (blockSize_L < 1) { - throw new DecodingException("Invalid value for blockSize_L: " - + blockSize_L); - } - - int l = r.read(blockSize_L); - - ByteArrayOutputStream output = new ByteArrayOutputStream(); - for (int i = 0; i < l; i++) { - output.write(r.readByte()); - } - DiffPart part = new DiffPart(DiffAction.FULL_REVISION_UNCOMPRESSED); - part.setText(output.toString(WIKIPEDIA_ENCODING)); - - return part; - } - - /** - * Decodes a Paste operation. 
- * - * @param blockSize_S - * length of a S block - * @param blockSize_B - * length of a B block - * @return DiffPart, Paste operation - * - * @throws DecodingException - * if the decoding failed - */ - private DiffPart decodePaste(final int blockSize_S, final int blockSize_B, - final BitReader r) - throws DecodingException - { - - if (blockSize_S < 1 || blockSize_B < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + " or blockSize_B: " + blockSize_B); - } - - int s = r.read(blockSize_S); - int b = r.read(blockSize_B); - - DiffPart part = new DiffPart(DiffAction.PASTE); - part.setStart(s); - part.setText(Integer.toString(b)); - - r.skip(); - - return part; - } - - /** - * Decodes a Replace operation. - * - * @param blockSize_S - * length of a S block - * @param blockSize_E - * length of a E block - * @param blockSize_L - * length of a L block - * @return DiffPart, Replace operation - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws DecodingException - * if the decoding failed - */ - private DiffPart decodeReplace(final int blockSize_S, - final int blockSize_E, final int blockSize_L) - throws UnsupportedEncodingException, DecodingException - { - - if (blockSize_S < 1 || blockSize_E < 1 || blockSize_L < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + ", blockSize_E: " + blockSize_E - + " or blockSize_L: " + blockSize_L); - } - - int s = r.read(blockSize_S); - int e = r.read(blockSize_E); - int l = r.read(blockSize_L); - - ByteArrayOutputStream output = new ByteArrayOutputStream(); - for (int i = 0; i < l; i++) { - output.write(r.readByte()); - } - - DiffPart part = new DiffPart(DiffAction.REPLACE); - part.setStart(s); - part.setLength(e); - part.setText(output.toString(WIKIPEDIA_ENCODING)); - - return part; - } - - /** - * Inflates the zipped input. 
- * - * @param zipinput - * zipped input - * @param start - * start position - * @return inflated input - */ - private byte[] inflateInput(final byte[] zipinput, final int start) - { - ByteArrayOutputStream stream; - try { - byte[] compressedInput = zipinput; - Inflater decompresser = new Inflater(); - decompresser.setInput(compressedInput, start, - compressedInput.length - start); - - byte[] output = new byte[1000]; - stream = new ByteArrayOutputStream(); - - int cLength; - do { - cLength = decompresser.inflate(output); - stream.write(output, 0, cLength); - } - while (cLength == 1000); - - } - catch (DataFormatException e) { - throw new RuntimeException(e); - } - - return stream.toByteArray(); - } - - /** - * Assigns the binary input. - * - * @param input - * binary encoded diff - */ - public void setInput(final byte[] input) - { - - if (input[0] == -128) { - r = new BitReader(inflateInput(input, 1)); - } - else { - r = new BitReader(input); - } - } - - /** - * Assigns an input stream. 
- * - * @param input - * Reference to an input stream - * @param binary - * flag, whether the data is binary or not - * - * @throws IOException - * if an error occurs while reading the stream - */ - public void setInput(final InputStream input, final boolean binary) - throws IOException { - - if (!binary) { - - int v = input.read(); - StringBuilder buffer = new StringBuilder(); - - // Check for the no-zip flag - boolean zipFlag = (char) v == '_'; - if (zipFlag) { - v = input.read(); - } - - while (v != -1) { - buffer.append((char) v); - v = input.read(); - } - - Base64.Decoder decoder = Base64.getDecoder(); - - if (zipFlag) { - r = new BitReader(inflateInput( - decoder.decode(buffer.toString()), 0)); - } - else { - r = new BitReader(decoder.decode(buffer.toString())); - } - } - else { - - ByteArrayOutputStream stream = new ByteArrayOutputStream(); - - byte[] bData; - int l = input.available(); - while (l != 0) { - - bData = new byte[l]; - - if (input.read(bData) != l) { - throw new RuntimeException("ILLEGAL NUMBER OF BYTES READ"); - } - stream.write(bData); - - l = input.available(); - } - - if (input.read() != -1) { - throw new RuntimeException("END OF STREAM NOT REACHED"); - } - - bData = stream.toByteArray(); - - boolean zipFlag = bData[0] == -128; - - if (zipFlag) { - r = new BitReader(inflateInput(bData, 1)); - } - else { - r = new BitReader(bData); - } - } - } - - /** - * Assigns base 64 encoded input. 
- * - * @param input - * base 64 encoded diff - * - * @throws DecodingException - * if the decoding fails - */ - public void setInput(final String input) throws DecodingException { - - boolean zipFlag = input.charAt(0) == '_'; - Base64.Decoder decoder = Base64.getDecoder(); - if (zipFlag) { - r = new BitReader(inflateInput( - decoder.decode(input.substring(1)), 0)); - } - else { - byte[] data = decoder.decode(input); - if (data == null) { - - for (int i = 0; i < input.length(); i++) { - System.err.println(i + ": " + (int) input.charAt(i) - + " <> " + input.charAt(i)); - } - - throw new DecodingException("BASE 64 DECODING FAILED: " + input); - } - r = new BitReader(data); - } - } +public class RevisionDecoder { + + /** + * Reference to the BitReader + */ + private BitReader r; + + /** + * Configuration Parameter - Wikipedia Encoding + */ + private final String WIKIPEDIA_ENCODING; + + /** + * (Constructor) Creates a new RevisionDecoder object. + * + * @throws ConfigurationException if an error occurs while accessing the configuration + * parameters + */ + private RevisionDecoder() + throws ConfigurationException { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + } + + /** + * (Constructor) Creates a new RevisionDecoder object. + * + * @param wikipediaEncoding Character encoding + */ + public RevisionDecoder(final String wikipediaEncoding) { + + WIKIPEDIA_ENCODING = wikipediaEncoding; + } + + /** + * (Constructor) Creates a new RevisionDecoder object. 
+ * + * @param input binary encoded diff + * @throws ConfigurationException if an error occurs while accessing the configuration + * parameters + */ + public RevisionDecoder(final byte[] input) + throws ConfigurationException { + + this(); + if (input[0] == -128) { + r = new BitReader(inflateInput(input, 1)); + } else { + r = new BitReader(input); + } + } + + /** + * Decodes the information and returns the Diff. + * + * @return Diff + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws DecodingException if the decoding failed + */ + public Diff decode() + throws UnsupportedEncodingException, DecodingException { + + int header = r.read(3); + if (DiffAction.parse(header) != DiffAction.DECODER_DATA) { + + throw new DecodingException("Invalid codecData code: " + header); + } + + int blockSize_C = 3; + int blockSize_S = r.read(5); + int blockSize_E = r.read(5); + int blockSize_B = r.read(5); + int blockSize_L = r.read(5); + r.read(1); + + if (blockSize_S < 0 || blockSize_S > 31) { + throw new DecodingException("blockSize_S out of range: " + + blockSize_S); + } + if (blockSize_E < 0 || blockSize_E > 31) { + throw new DecodingException("blockSize_E out of range: " + + blockSize_E); + } + if (blockSize_B < 0 || blockSize_B > 31) { + throw new DecodingException("blockSize_B out of range: " + + blockSize_B); + } + if (blockSize_L < 0 || blockSize_L > 31) { + throw new DecodingException("blockSize_L out of range: " + + blockSize_L); + } + + return decode(blockSize_C, blockSize_S, blockSize_E, blockSize_B, + blockSize_L); + } + + /** + * Decodes the information, after the codec was successfully decoded, and + * returns the Diff. 
+ * + * @param blockSize_C length of a C block + * @param blockSize_S length of a S block + * @param blockSize_E length of a E block + * @param blockSize_B length of a B block + * @param blockSize_L length of a L block + * @return Diff + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws DecodingException if the decoding failed + */ + private Diff decode(final int blockSize_C, final int blockSize_S, + final int blockSize_E, final int blockSize_B, final int blockSize_L) + throws UnsupportedEncodingException, DecodingException { + + int code = r.read(blockSize_C); + Diff diff = new Diff(); + + while (code != -1) { + // System.out.print(code + "\t"); + + switch (DiffAction.parse(code)) { + case FULL_REVISION_UNCOMPRESSED: + diff.add(decodeFullRevision(blockSize_L)); + break; + case INSERT: + diff.add(decodeAdd(blockSize_S, blockSize_L)); + break; + case DELETE: + diff.add(decodeDelete(blockSize_S, blockSize_E)); + break; + case REPLACE: + diff.add(decodeReplace(blockSize_S, blockSize_E, blockSize_L)); + break; + case CUT: + diff.add(decodeCut(blockSize_S, blockSize_E, blockSize_B)); + break; + case PASTE: + diff.add(decodePaste(blockSize_S, blockSize_B, r)); + break; + default: + throw new DecodingException("Invalid block_c code: " + code); + } + + // System.out.println(); + code = r.read(blockSize_C); + } + + return diff; + } + + /** + * Decodes an Add operation. 
+ * + * @param blockSize_S length of a S block + * @param blockSize_L length of a L block + * @return DiffPart, Add operation + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws DecodingException if the decoding failed + */ + private DiffPart decodeAdd(final int blockSize_S, final int blockSize_L) + throws UnsupportedEncodingException, DecodingException { + + if (blockSize_S < 1 || blockSize_L < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + + blockSize_S + " or blockSize_L: " + blockSize_L); + } + + int s = r.read(blockSize_S); + int l = r.read(blockSize_L); + + ByteArrayOutputStream output = new ByteArrayOutputStream(); + for (int i = 0; i < l; i++) { + output.write(r.readByte()); + } + + DiffPart part = new DiffPart(DiffAction.INSERT); + part.setStart(s); + part.setText(output.toString(WIKIPEDIA_ENCODING)); + + return part; + } + + /** + * Decodes a Cut operation. + * + * @param blockSize_S length of a S block + * @param blockSize_E length of a E block + * @param blockSize_B length of a B block + * @return DiffPart, Cut operation + * @throws DecodingException if the decoding failed + */ + private DiffPart decodeCut(final int blockSize_S, final int blockSize_E, + final int blockSize_B) + throws DecodingException { + + if (blockSize_S < 1 || blockSize_E < 1 || blockSize_B < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + + blockSize_S + ", blockSize_E: " + blockSize_E + + " or blockSize_B: " + blockSize_B); + } + + int s = r.read(blockSize_S); + int e = r.read(blockSize_E); + int b = r.read(blockSize_B); + + DiffPart part = new DiffPart(DiffAction.CUT); + part.setStart(s); + part.setLength(e); + part.setText(Integer.toString(b)); + + r.skip(); + + return part; + } + + /** + * Decodes a Delete operation. 
+ * + * @param blockSize_S length of a S block + * @param blockSize_E length of a E block + * @return DiffPart, Delete operation + * @throws DecodingException if the decoding failed + */ + private DiffPart decodeDelete(final int blockSize_S, final int blockSize_E) + throws DecodingException { + + if (blockSize_S < 1 || blockSize_E < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + + blockSize_S + " or blockSize_E: " + blockSize_E); + } + + int s = r.read(blockSize_S); + int e = r.read(blockSize_E); + + DiffPart part = new DiffPart(DiffAction.DELETE); + part.setStart(s); + part.setLength(e); + + r.skip(); + + return part; + } + + /** + * Decodes a FullRevision operation. + * + * @param blockSize_L length of a L block + * @return DiffPart, FullRevision + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws DecodingException if the decoding failed + */ + private DiffPart decodeFullRevision(final int blockSize_L) + throws UnsupportedEncodingException, DecodingException { + + if (blockSize_L < 1) { + throw new DecodingException("Invalid value for blockSize_L: " + + blockSize_L); + } + + int l = r.read(blockSize_L); + + ByteArrayOutputStream output = new ByteArrayOutputStream(); + for (int i = 0; i < l; i++) { + output.write(r.readByte()); + } + DiffPart part = new DiffPart(DiffAction.FULL_REVISION_UNCOMPRESSED); + part.setText(output.toString(WIKIPEDIA_ENCODING)); + + return part; + } + + /** + * Decodes a Paste operation. 
+ * + * @param blockSize_S length of a S block + * @param blockSize_B length of a B block + * @return DiffPart, Paste operation + * @throws DecodingException if the decoding failed + */ + private DiffPart decodePaste(final int blockSize_S, final int blockSize_B, + final BitReader r) + throws DecodingException { + + if (blockSize_S < 1 || blockSize_B < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + + blockSize_S + " or blockSize_B: " + blockSize_B); + } + + int s = r.read(blockSize_S); + int b = r.read(blockSize_B); + + DiffPart part = new DiffPart(DiffAction.PASTE); + part.setStart(s); + part.setText(Integer.toString(b)); + + r.skip(); + + return part; + } + + /** + * Decodes a Replace operation. + * + * @param blockSize_S length of a S block + * @param blockSize_E length of a E block + * @param blockSize_L length of a L block + * @return DiffPart, Replace operation + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws DecodingException if the decoding failed + */ + private DiffPart decodeReplace(final int blockSize_S, + final int blockSize_E, final int blockSize_L) + throws UnsupportedEncodingException, DecodingException { + + if (blockSize_S < 1 || blockSize_E < 1 || blockSize_L < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + + blockSize_S + ", blockSize_E: " + blockSize_E + + " or blockSize_L: " + blockSize_L); + } + + int s = r.read(blockSize_S); + int e = r.read(blockSize_E); + int l = r.read(blockSize_L); + + ByteArrayOutputStream output = new ByteArrayOutputStream(); + for (int i = 0; i < l; i++) { + output.write(r.readByte()); + } + + DiffPart part = new DiffPart(DiffAction.REPLACE); + part.setStart(s); + part.setLength(e); + part.setText(output.toString(WIKIPEDIA_ENCODING)); + + return part; + } + + /** + * Inflates the zipped input. 
+ * + * @param zipinput zipped input + * @param start start position + * @return inflated input + */ + private byte[] inflateInput(final byte[] zipinput, final int start) { + ByteArrayOutputStream stream; + try { + byte[] compressedInput = zipinput; + Inflater decompresser = new Inflater(); + decompresser.setInput(compressedInput, start, + compressedInput.length - start); + + byte[] output = new byte[1000]; + stream = new ByteArrayOutputStream(); + + int cLength; + do { + cLength = decompresser.inflate(output); + stream.write(output, 0, cLength); + } + while (cLength == 1000); + + } catch (DataFormatException e) { + throw new RuntimeException(e); + } + + return stream.toByteArray(); + } + + /** + * Assigns the binary input. + * + * @param input binary encoded diff + */ + public void setInput(final byte[] input) { + + if (input[0] == -128) { + r = new BitReader(inflateInput(input, 1)); + } else { + r = new BitReader(input); + } + } + + /** + * Assigns an input stream. + * + * @param input Reference to an input stream + * @param binary flag, whether the data is binary or not + * @throws IOException if an error occurs while reading the stream + */ + public void setInput(final InputStream input, final boolean binary) + throws IOException { + + if (!binary) { + + int v = input.read(); + StringBuilder buffer = new StringBuilder(); + + // Check for the no-zip flag + boolean zipFlag = (char) v == '_'; + if (zipFlag) { + v = input.read(); + } + + while (v != -1) { + buffer.append((char) v); + v = input.read(); + } + + Base64.Decoder decoder = Base64.getDecoder(); + + if (zipFlag) { + r = new BitReader(inflateInput( + decoder.decode(buffer.toString()), 0)); + } else { + r = new BitReader(decoder.decode(buffer.toString())); + } + } else { + + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + + byte[] bData; + int l = input.available(); + while (l != 0) { + + bData = new byte[l]; + + if (input.read(bData) != l) { + throw new RuntimeException("ILLEGAL NUMBER OF 
BYTES READ"); + } + stream.write(bData); + + l = input.available(); + } + + if (input.read() != -1) { + throw new RuntimeException("END OF STREAM NOT REACHED"); + } + + bData = stream.toByteArray(); + + boolean zipFlag = bData[0] == -128; + + if (zipFlag) { + r = new BitReader(inflateInput(bData, 1)); + } else { + r = new BitReader(bData); + } + } + } + + /** + * Assigns base 64 encoded input. + * + * @param input base 64 encoded diff + * @throws DecodingException if the decoding fails + */ + public void setInput(final String input) throws DecodingException { + + boolean zipFlag = input.charAt(0) == '_'; + Base64.Decoder decoder = Base64.getDecoder(); + if (zipFlag) { + r = new BitReader(inflateInput( + decoder.decode(input.substring(1)), 0)); + } else { + byte[] data = decoder.decode(input); + if (data == null) { + + for (int i = 0; i < input.length(); i++) { + System.err.println(i + ": " + (int) input.charAt(i) + + " <> " + input.charAt(i)); + } + + throw new DecodingException("BASE 64 DECODING FAILED: " + input); + } + r = new BitReader(data); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoder.java index 292175ed..dc227907 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoder.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,402 +34,366 @@ * The RevisionApi class contains methods to encode the diff information. */ public class RevisionEncoder - implements RevisionEncoderInterface -{ - - /** Reference to the codec */ - private RevisionCodecData codecData; - - /** Reference to the BitWriter */ - private BitWriter data; - - /** Configuration Parameter - Zip Compression */ - private final boolean MODE_ZIP_COMPRESSION; - - /** Configuration Parameter - Wikipedia Encoding */ - private final String WIKIPEDIA_ENCODING; - - /** - * (Constructor) Creates a new RevisionEnocder object. 
- * - * @throws ConfigurationException - * if an error occurs while accessing the configuration - * parameters - */ - public RevisionEncoder() - throws ConfigurationException - { - - ConfigurationManager config = ConfigurationManager.getInstance(); - - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - MODE_ZIP_COMPRESSION = (Boolean) config - .getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); - } - - /* - * (non-Javadoc) - * - * @see - * de.tud.ukp.kulessa.delta.data.codec.RevisionEncoderInterface#binaryDiff - * (de.tud.ukp.kulessa.delta.data.codec.RevisionCodecData, - * de.tud.ukp.kulessa.delta.data.tasks.content.Diff) - */ - @Override - public byte[] binaryDiff(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException - { - - byte[] bData = encode(codecData, diff); - if (MODE_ZIP_COMPRESSION) { - - Deflater compresser = new Deflater(); - compresser.setInput(bData); - compresser.finish(); - - byte[] output = new byte[1000]; - ByteArrayOutputStream stream = new ByteArrayOutputStream(); - - int cLength; - do { - cLength = compresser.deflate(output); - stream.write(output, 0, cLength); - } - while (cLength == 1000); - - output = stream.toByteArray(); - if (bData.length + 1 < output.length) { - return bData; - } - else { - - stream = new ByteArrayOutputStream(); - stream.write(new byte[] { -128 }, 0, 1); - stream.write(output, 0, output.length); - - return stream.toByteArray(); - } - } - - return bData; - } - - /** - * Creates the binary encoding of the diff while using the codec - * information. 
- * - * @param codecData - * codec - * @param diff - * diff - * @return binary data - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws EncodingException - * if the encoding failed - */ - private byte[] encode(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException - { - - this.data = new BitWriter(codecData.totalSizeInBits()); - encodeCodecData(codecData); - - DiffPart part; - - Iterator partIt = diff.iterator(); - while (partIt.hasNext()) { - part = partIt.next(); - - switch (part.getAction()) { - case FULL_REVISION_UNCOMPRESSED: - encodeFullRevisionUncompressed(part); - break; - case INSERT: - encodeInsert(part); - break; - case DELETE: - encodeDelete(part); - break; - case REPLACE: - encodeReplace(part); - break; - case CUT: - encodeCut(part); - break; - case PASTE: - encodePaste(part); - break; - /* - * case FULL_REVISION_COMPRESSED: - * encodeFullRevisionCompressed(part); break; - */ - default: - throw new RuntimeException(); - } - } - - return data.toByteArray(); - } - - /** - * Encodes the codecData. - * - * @param codecData - * Reference to the codec - * - * @throws EncodingException - * if the encoding failed - */ - private void encodeCodecData(final RevisionCodecData codecData) - throws EncodingException - { - - this.codecData = codecData; - - // C - data.writeBit(0); - data.writeBit(0); - data.writeBit(0); - - // BLOCK SIZES - S E B L - this.data.writeValue(5, codecData.getBlocksizeS()); - this.data.writeValue(5, codecData.getBlocksizeE()); - this.data.writeValue(5, codecData.getBlocksizeB()); - this.data.writeValue(5, codecData.getBlocksizeL()); - - // 1 Bit - data.writeFillBits(); - } - - /** - * Encodes a Cut operation. 
- * - * @param part - * Reference to the Cut operation - * - * @throws EncodingException - * if the encoding failed - */ - private void encodeCut(final DiffPart part) - throws EncodingException - { - - // C - data.writeBit(1); - data.writeBit(0); - data.writeBit(1); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // E - data.writeValue(codecData.getBlocksizeE(), part.getLength()); - - // B - data.writeValue(codecData.getBlocksizeB(), - Integer.parseInt(part.getText())); - - data.writeFillBits(); - - } - - /** - * Encodes a Delete operation. - * - * @param part - * Reference to the Delete operation - * - * @throws EncodingException - * if the encoding failed - */ - private void encodeDelete(final DiffPart part) - throws EncodingException - { - - // C - data.writeBit(0); - data.writeBit(1); - data.writeBit(1); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // E - data.writeValue(codecData.getBlocksizeE(), part.getLength()); - - data.writeFillBits(); - } - - /* - * (non-Javadoc) - * - * @see - * de.tud.ukp.kulessa.delta.data.codec.RevisionEncoderInterface#encodeDiff - * (de.tud.ukp.kulessa.delta.data.codec.RevisionCodecData, - * de.tud.ukp.kulessa.delta.data.tasks.content.Diff) - */ - @Override - public String encodeDiff(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException { - - String sEncoding; - byte[] bData = encode(codecData, diff); - Base64.Encoder encoder = Base64.getEncoder(); - if (MODE_ZIP_COMPRESSION) { - - Deflater compresser = new Deflater(); - compresser.setInput(bData); - compresser.finish(); - - byte[] output = new byte[1000]; - ByteArrayOutputStream stream = new ByteArrayOutputStream(); - - int cLength; - do { - cLength = compresser.deflate(output); - stream.write(output, 0, cLength); - } - while (cLength == 1000); - - output = stream.toByteArray(); - - if (bData.length + 1 < output.length) { - sEncoding = encoder.encodeToString(bData); - } 
- else { - sEncoding = "_" + encoder.encodeToString(output); - } - } - else { - sEncoding = encoder.encodeToString(bData); - } - - return sEncoding; - } - - /** - * Encodes a FullRevision operation. - * - * @param part - * Reference to the FullRevision operation - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws EncodingException - * if the encoding failed - */ - private void encodeFullRevisionUncompressed(final DiffPart part) - throws UnsupportedEncodingException, EncodingException - { - - // C - data.writeBit(0); - data.writeBit(0); - data.writeBit(1); - - // L T - String text = part.getText(); - byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); - - data.writeValue(codecData.getBlocksizeL(), bText.length); - data.write(bText); - - } - - /** - * Encodes an Insert operation. - * - * @param part - * Reference to the Insert operation - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws EncodingException - * if the encoding failed - */ - private void encodeInsert(final DiffPart part) - throws UnsupportedEncodingException, EncodingException - { - - // C - data.writeBit(0); - data.writeBit(1); - data.writeBit(0); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // L T - String text = part.getText(); - byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); - - data.writeValue(codecData.getBlocksizeL(), bText.length); - data.write(bText); - } - - /** - * Encodes a Paste operation. 
- * - * @param part - * Reference to the Paste operation - * - * @throws EncodingException - * if the encoding failed - */ - private void encodePaste(final DiffPart part) - throws EncodingException - { - - // C - data.writeBit(1); - data.writeBit(1); - data.writeBit(0); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // B - data.writeValue(codecData.getBlocksizeB(), - Integer.parseInt(part.getText())); - - data.writeFillBits(); - } - - /** - * Encodes a Replace operation. - * - * @param part - * Reference to the replace operation - * - * @throws UnsupportedEncodingException - * if the character encoding is unsupported - * @throws EncodingException - * if the encoding failed - */ - private void encodeReplace(final DiffPart part) - throws UnsupportedEncodingException, EncodingException - { - - // C - data.writeBit(1); - data.writeBit(0); - data.writeBit(0); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // E - data.writeValue(codecData.getBlocksizeE(), part.getLength()); - - // L T - String text = part.getText(); - byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); - - data.writeValue(codecData.getBlocksizeL(), bText.length); - data.write(bText); - } + implements RevisionEncoderInterface { + + /** + * Reference to the codec + */ + private RevisionCodecData codecData; + + /** + * Reference to the BitWriter + */ + private BitWriter data; + + /** + * Configuration Parameter - Zip Compression + */ + private final boolean MODE_ZIP_COMPRESSION; + + /** + * Configuration Parameter - Wikipedia Encoding + */ + private final String WIKIPEDIA_ENCODING; + + /** + * (Constructor) Creates a new RevisionEnocder object. 
+ * + * @throws ConfigurationException if an error occurs while accessing the configuration + * parameters + */ + public RevisionEncoder() + throws ConfigurationException { + + ConfigurationManager config = ConfigurationManager.getInstance(); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + MODE_ZIP_COMPRESSION = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); + } + + /* + * (non-Javadoc) + * + * @see + * de.tud.ukp.kulessa.delta.data.codec.RevisionEncoderInterface#binaryDiff + * (de.tud.ukp.kulessa.delta.data.codec.RevisionCodecData, + * de.tud.ukp.kulessa.delta.data.tasks.content.Diff) + */ + @Override + public byte[] binaryDiff(final RevisionCodecData codecData, final Diff diff) + throws UnsupportedEncodingException, EncodingException { + + byte[] bData = encode(codecData, diff); + if (MODE_ZIP_COMPRESSION) { + + Deflater compresser = new Deflater(); + compresser.setInput(bData); + compresser.finish(); + + byte[] output = new byte[1000]; + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + + int cLength; + do { + cLength = compresser.deflate(output); + stream.write(output, 0, cLength); + } + while (cLength == 1000); + + output = stream.toByteArray(); + if (bData.length + 1 < output.length) { + return bData; + } else { + + stream = new ByteArrayOutputStream(); + stream.write(new byte[]{-128}, 0, 1); + stream.write(output, 0, output.length); + + return stream.toByteArray(); + } + } + + return bData; + } + + /** + * Creates the binary encoding of the diff while using the codec + * information. 
+ * + * @param codecData codec + * @param diff diff + * @return binary data + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws EncodingException if the encoding failed + */ + private byte[] encode(final RevisionCodecData codecData, final Diff diff) + throws UnsupportedEncodingException, EncodingException { + + this.data = new BitWriter(codecData.totalSizeInBits()); + encodeCodecData(codecData); + + DiffPart part; + + Iterator partIt = diff.iterator(); + while (partIt.hasNext()) { + part = partIt.next(); + + switch (part.getAction()) { + case FULL_REVISION_UNCOMPRESSED: + encodeFullRevisionUncompressed(part); + break; + case INSERT: + encodeInsert(part); + break; + case DELETE: + encodeDelete(part); + break; + case REPLACE: + encodeReplace(part); + break; + case CUT: + encodeCut(part); + break; + case PASTE: + encodePaste(part); + break; + /* + * case FULL_REVISION_COMPRESSED: + * encodeFullRevisionCompressed(part); break; + */ + default: + throw new RuntimeException(); + } + } + + return data.toByteArray(); + } + + /** + * Encodes the codecData. + * + * @param codecData Reference to the codec + * @throws EncodingException if the encoding failed + */ + private void encodeCodecData(final RevisionCodecData codecData) + throws EncodingException { + + this.codecData = codecData; + + // C + data.writeBit(0); + data.writeBit(0); + data.writeBit(0); + + // BLOCK SIZES - S E B L + this.data.writeValue(5, codecData.getBlocksizeS()); + this.data.writeValue(5, codecData.getBlocksizeE()); + this.data.writeValue(5, codecData.getBlocksizeB()); + this.data.writeValue(5, codecData.getBlocksizeL()); + + // 1 Bit + data.writeFillBits(); + } + + /** + * Encodes a Cut operation. 
+ * + * @param part Reference to the Cut operation + * @throws EncodingException if the encoding failed + */ + private void encodeCut(final DiffPart part) + throws EncodingException { + + // C + data.writeBit(1); + data.writeBit(0); + data.writeBit(1); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // E + data.writeValue(codecData.getBlocksizeE(), part.getLength()); + + // B + data.writeValue(codecData.getBlocksizeB(), + Integer.parseInt(part.getText())); + + data.writeFillBits(); + + } + + /** + * Encodes a Delete operation. + * + * @param part Reference to the Delete operation + * @throws EncodingException if the encoding failed + */ + private void encodeDelete(final DiffPart part) + throws EncodingException { + + // C + data.writeBit(0); + data.writeBit(1); + data.writeBit(1); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // E + data.writeValue(codecData.getBlocksizeE(), part.getLength()); + + data.writeFillBits(); + } + + /* + * (non-Javadoc) + * + * @see + * de.tud.ukp.kulessa.delta.data.codec.RevisionEncoderInterface#encodeDiff + * (de.tud.ukp.kulessa.delta.data.codec.RevisionCodecData, + * de.tud.ukp.kulessa.delta.data.tasks.content.Diff) + */ + @Override + public String encodeDiff(final RevisionCodecData codecData, final Diff diff) + throws UnsupportedEncodingException, EncodingException { + + String sEncoding; + byte[] bData = encode(codecData, diff); + Base64.Encoder encoder = Base64.getEncoder(); + if (MODE_ZIP_COMPRESSION) { + + Deflater compresser = new Deflater(); + compresser.setInput(bData); + compresser.finish(); + + byte[] output = new byte[1000]; + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + + int cLength; + do { + cLength = compresser.deflate(output); + stream.write(output, 0, cLength); + } + while (cLength == 1000); + + output = stream.toByteArray(); + + if (bData.length + 1 < output.length) { + sEncoding = encoder.encodeToString(bData); + } else { + sEncoding = "_" + 
encoder.encodeToString(output); + } + } else { + sEncoding = encoder.encodeToString(bData); + } + + return sEncoding; + } + + /** + * Encodes a FullRevision operation. + * + * @param part Reference to the FullRevision operation + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws EncodingException if the encoding failed + */ + private void encodeFullRevisionUncompressed(final DiffPart part) + throws UnsupportedEncodingException, EncodingException { + + // C + data.writeBit(0); + data.writeBit(0); + data.writeBit(1); + + // L T + String text = part.getText(); + byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); + + data.writeValue(codecData.getBlocksizeL(), bText.length); + data.write(bText); + + } + + /** + * Encodes an Insert operation. + * + * @param part Reference to the Insert operation + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws EncodingException if the encoding failed + */ + private void encodeInsert(final DiffPart part) + throws UnsupportedEncodingException, EncodingException { + + // C + data.writeBit(0); + data.writeBit(1); + data.writeBit(0); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // L T + String text = part.getText(); + byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); + + data.writeValue(codecData.getBlocksizeL(), bText.length); + data.write(bText); + } + + /** + * Encodes a Paste operation. + * + * @param part Reference to the Paste operation + * @throws EncodingException if the encoding failed + */ + private void encodePaste(final DiffPart part) + throws EncodingException { + + // C + data.writeBit(1); + data.writeBit(1); + data.writeBit(0); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // B + data.writeValue(codecData.getBlocksizeB(), + Integer.parseInt(part.getText())); + + data.writeFillBits(); + } + + /** + * Encodes a Replace operation. 
+ * + * @param part Reference to the replace operation + * @throws UnsupportedEncodingException if the character encoding is unsupported + * @throws EncodingException if the encoding failed + */ + private void encodeReplace(final DiffPart part) + throws UnsupportedEncodingException, EncodingException { + + // C + data.writeBit(1); + data.writeBit(0); + data.writeBit(0); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // E + data.writeValue(codecData.getBlocksizeE(), part.getLength()); + + // L T + String text = part.getText(); + byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); + + data.writeValue(codecData.getBlocksizeL(), bText.length); + data.write(bText); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoderInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoderInterface.java index f2f010dc..96a2e271 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoderInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoderInterface.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,49 +24,33 @@ /** * The RevisionApi Interface describes the link to the diff encoding unit. - * - * - * */ -public interface RevisionEncoderInterface -{ +public interface RevisionEncoderInterface { - /** - * Returns the textual encoding of the given Diff. - * - * @param codecData - * CodecData used to encode the diff-data - * @param diff - * diff-data - * @return base 64 encoded diff - * - * @throws UnsupportedEncodingException - * if the CharacterSet defined in the configuration is not - * supported by JAVA. - * - * @throws EncodingException - * if the encoding process fails - */ + /** + * Returns the textual encoding of the given Diff. + * + * @param codecData CodecData used to encode the diff-data + * @param diff diff-data + * @return base 64 encoded diff + * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not + * supported by JAVA. + * @throws EncodingException if the encoding process fails + */ String encodeDiff(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException; + throws UnsupportedEncodingException, EncodingException; - /** - * Returns the binary encoding of the given Diff. - * - * @param codecData - * CodecData used to encode the diff-data - * @param diff - * diff-data - * @return binary encoded diff - * - * @throws UnsupportedEncodingException - * if the CharacterSet defined in the configuration is not - * supported by JAVA. - * - * @throws EncodingException - * if the encoding process fails - */ + /** + * Returns the binary encoding of the given Diff. 
+ * + * @param codecData CodecData used to encode the diff-data + * @param diff diff-data + * @return binary encoded diff + * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not + * supported by JAVA. + * @throws EncodingException if the encoding process fails + */ byte[] binaryDiff(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException; + throws UnsupportedEncodingException, EncodingException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/ISizeable.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/ISizeable.java index 794b466f..ee87c504 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/ISizeable.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/ISizeable.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,17 +19,13 @@ /** * This interface defines a method for size estimations. - * - * - * */ -public interface ISizeable -{ +public interface ISizeable { - /** - * This method should return a size estimation of the data. 
- * - * @return size estimation in byte - */ + /** + * This method should return a size estimation of the data. + * + * @return size estimation in byte + */ long byteSize(); } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/Task.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/Task.java index 7d288da9..8019bf4b 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/Task.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/Task.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,249 +25,231 @@ /** * The task class contains the information of a task. * - * - * - * - * @param - * Class of data the task contains + * @param Class of data the task contains */ -public class Task -{ - - /* - * +STATICS++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a dummy task without data. - * - * @return dummy task - */ - @SuppressWarnings("rawtypes") - public static Task createDummy() - { - return new Task(TaskTypes.DUMMY); - } - - /** - * Creates an end task. 
- * - * @return end task - */ - @SuppressWarnings("rawtypes") - public static Task createEndTask() - { - return new Task(TaskTypes.ENDTASK); - } - - /** - * Creates a banned task. - * - * @return banned task - */ - @SuppressWarnings("rawtypes") - public static Task createBannedTask() - { - return new Task(TaskTypes.BANNED_TASK); - } - - /* - * +ATTRIBUTES+AND+CONSTRUCTORS++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** Type of the task */ - private TaskTypes taskType; - - /** Additional information concerning the article */ - private ArticleInformation header; - - /** Data of the task */ - private final ArrayList container; - - /** Counter of the task parts (1-based) */ - private final int partCounter; - - /** Size of this task */ - private int byteSize; - - /** - * Constructor - A new task object of the specified type will be created. - * - * @param taskType - * Type of task - */ - protected Task(final TaskTypes taskType) - { - this.taskType = taskType; - this.container = null; - - this.byteSize = 0; - this.partCounter = 0; - } - - /** - * Constructor - A new task object of the type TASK_FULL will be created. - * - * @param header - * reference to the article information - * @param taskPartCounter - * task part counter - */ - public Task(final ArticleInformation header, final int taskPartCounter) - { - this.header = header; - - this.byteSize = 0; - this.partCounter = taskPartCounter; - this.taskType = TaskTypes.TASK_FULL; - - this.container = new ArrayList<>(); - } - - /* - * +METHODS++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Returns the reference to the article header. - */ - public ArticleInformation getHeader() - { - return this.header; - } - - /** - * Returns the type of this task. - * - * @return TaskType - */ - public TaskTypes getTaskType() - { - return this.taskType; - } - - /** - * Adds data to this task. - * - * @param data - * Reference to the data object. 
- */ - public void add(final D data) - { - this.container.add(data); - - // if the size of data is known add the value to the task size - if (data instanceof ISizeable) { - this.byteSize += ((ISizeable) data).byteSize(); - } - } - - /** - * Returns the data of this task. - * - * @return data - */ - public ArrayList getContainer() - { - return this.container; - } - - /** - * Returns the data at the specified index. - *

- * The index will not be check whether it is out of range or not. If you do - * not know the appropriate index call the size() method before calling this - * method. - * - * @param index - * index - * @return data - */ - public D get(final int index) - { - return this.container.get(index); - } - - /** - * Returns the number of data parts the task contains. - * - * @return number of data parts. - */ - public int size() - { - return this.container.size(); - } - - /** - * Returns an iterator over the data. - * - * @return Iterator - */ - public Iterator iterator() - { - return this.container.iterator(); - } - - /** - * Returns the size estimation of this task in bytes. - *

- * The size can only be estimated if the data contains the ISizeable - * interface. - * - * @return size estimation - */ - public int byteSize() - { - return this.byteSize; - } - - /** - * Returns the type of the task. - * - * @param taskType - * TaskType - */ - public void setTaskType(final TaskTypes taskType) - { - this.taskType = taskType; - } - - /** - * Returns the part counter. - * - * @return Part counter - */ - public int getPartCounter() - { - return this.partCounter; - } - - /** - * Returns an unique task identifier consisting of article id and part - * counter. - * - * @return unique task identifier - */ - public String uniqueIdentifier() - { - return this.header.getArticleId() + "-" + this.partCounter; - } - - /* - * +DELEGATERS+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Returns a string representation of the task. - * - * @return string representation - */ - @Override - public String toString() - { - return "[" + this.taskType.toString() + " <" + this.partCounter + ">" - + "\t" + this.byteSize + "\t| " + this.header.getArticleId() - + "\tR" + this.container.size() + "\t" - + this.header.getArticleName() + "]"; - } +public class Task { + + /* + * +STATICS++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a dummy task without data. + * + * @return dummy task + */ + @SuppressWarnings("rawtypes") + public static Task createDummy() { + return new Task(TaskTypes.DUMMY); + } + + /** + * Creates an end task. + * + * @return end task + */ + @SuppressWarnings("rawtypes") + public static Task createEndTask() { + return new Task(TaskTypes.ENDTASK); + } + + /** + * Creates a banned task. 
+ * + * @return banned task + */ + @SuppressWarnings("rawtypes") + public static Task createBannedTask() { + return new Task(TaskTypes.BANNED_TASK); + } + + /* + * +ATTRIBUTES+AND+CONSTRUCTORS++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type of the task + */ + private TaskTypes taskType; + + /** + * Additional information concerning the article + */ + private ArticleInformation header; + + /** + * Data of the task + */ + private final ArrayList container; + + /** + * Counter of the task parts (1-based) + */ + private final int partCounter; + + /** + * Size of this task + */ + private int byteSize; + + /** + * Constructor - A new task object of the specified type will be created. + * + * @param taskType Type of task + */ + protected Task(final TaskTypes taskType) { + this.taskType = taskType; + this.container = null; + + this.byteSize = 0; + this.partCounter = 0; + } + + /** + * Constructor - A new task object of the type TASK_FULL will be created. + * + * @param header reference to the article information + * @param taskPartCounter task part counter + */ + public Task(final ArticleInformation header, final int taskPartCounter) { + this.header = header; + + this.byteSize = 0; + this.partCounter = taskPartCounter; + this.taskType = TaskTypes.TASK_FULL; + + this.container = new ArrayList<>(); + } + + /* + * +METHODS++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Returns the reference to the article header. + */ + public ArticleInformation getHeader() { + return this.header; + } + + /** + * Returns the type of this task. + * + * @return TaskType + */ + public TaskTypes getTaskType() { + return this.taskType; + } + + /** + * Adds data to this task. + * + * @param data Reference to the data object. 
+ */ + public void add(final D data) { + this.container.add(data); + + // if the size of data is known add the value to the task size + if (data instanceof ISizeable) { + this.byteSize += ((ISizeable) data).byteSize(); + } + } + + /** + * Returns the data of this task. + * + * @return data + */ + public ArrayList getContainer() { + return this.container; + } + + /** + * Returns the data at the specified index. + *

+ * The index will not be check whether it is out of range or not. If you do + * not know the appropriate index call the size() method before calling this + * method. + * + * @param index index + * @return data + */ + public D get(final int index) { + return this.container.get(index); + } + + /** + * Returns the number of data parts the task contains. + * + * @return number of data parts. + */ + public int size() { + return this.container.size(); + } + + /** + * Returns an iterator over the data. + * + * @return Iterator + */ + public Iterator iterator() { + return this.container.iterator(); + } + + /** + * Returns the size estimation of this task in bytes. + *

+ * The size can only be estimated if the data contains the ISizeable + * interface. + * + * @return size estimation + */ + public int byteSize() { + return this.byteSize; + } + + /** + * Returns the type of the task. + * + * @param taskType TaskType + */ + public void setTaskType(final TaskTypes taskType) { + this.taskType = taskType; + } + + /** + * Returns the part counter. + * + * @return Part counter + */ + public int getPartCounter() { + return this.partCounter; + } + + /** + * Returns an unique task identifier consisting of article id and part + * counter. + * + * @return unique task identifier + */ + public String uniqueIdentifier() { + return this.header.getArticleId() + "-" + this.partCounter; + } + + /* + * +DELEGATERS+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Returns a string representation of the task. + * + * @return string representation + */ + @Override + public String toString() { + return "[" + this.taskType.toString() + " <" + this.partCounter + ">" + + "\t" + this.byteSize + "\t| " + this.header.getArticleId() + + "\tR" + this.container.size() + "\t" + + this.header.getArticleName() + "]"; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/TaskTypes.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/TaskTypes.java index 88e208b9..0c9a89bf 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/TaskTypes.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/TaskTypes.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,31 +19,41 @@ /** * This Enumerator lists the different types of tasks. - * - * - * */ -public enum TaskTypes -{ - - /** dummy task */ - DUMMY, - - /** if this task is received from a consumer, it will shutdown afterwards */ - ENDTASK, - - /** if the article id is black listed */ - BANNED_TASK, - - /** full task containing all revisions of one article */ - TASK_FULL, - - /** task containing the first part of revisions of one article */ - TASK_PARTIAL_FIRST, - - /** task containing some revisions of one article */ - TASK_PARTIAL, - - /** task containing the last part of revisions from one article */ - TASK_PARTIAL_LAST +public enum TaskTypes { + + /** + * dummy task + */ + DUMMY, + + /** + * if this task is received from a consumer, it will shutdown afterwards + */ + ENDTASK, + + /** + * if the article id is black listed + */ + BANNED_TASK, + + /** + * full task containing all revisions of one article + */ + TASK_FULL, + + /** + * task containing the first part of revisions of one article + */ + TASK_PARTIAL_FIRST, + + /** + * task containing some revisions of one article + */ + TASK_PARTIAL, + + /** + * task containing the last part of revisions from one article + */ + TASK_PARTIAL_LAST } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/Diff.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/Diff.java index ebe37cf4..ee58e403 100644 --- 
a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/Diff.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/Diff.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,352 +28,333 @@ /** * This class contains the diff information used to create single revision. - * - * - * */ public class Diff - implements ISizeable -{ - - /** Reference to the codec */ - private RevisionCodecData codecData; - - /** List of DiffParts */ - private final List parts; - - /** Revision counter */ - private int revisionCounter; - - /** Revision ID */ - private int revisionID; - - /** Timestamp */ - private Timestamp timeStamp; - - /** Username/IP of the contributor who created this revision */ - private String contributorName; - - /** ID of the contributor who created this revision */ - private Integer contributorId; - - /** Determine whether the contributor was registered. 
- * True: contributorName= username - * False: contributorName= IP - */ - private boolean contributorIsRegistered; - - /** The user comment for this revision*/ - private String comment; - - /** Determine whether revision is a minor revision */ - private boolean isMinor = false; - - /** - * (Constructor) Creates a new empty Diff. - */ - public Diff() - { - this.parts = new ArrayList<>(); - } - - /** - * Adds a DiffPart. - * - * @param diff - * DiffPart - */ - public void add(final DiffPart diff) - { - this.parts.add(diff); - } - - /** - * Builds the current revision. - * - * @param previousRevision - * content of the previous revision - * @return current revision - */ - public String buildRevision(final char[] previousRevision) - { - String prevRev = null; - if (previousRevision != null) { - prevRev = String.valueOf(previousRevision); - } - - return buildRevision(prevRev); - } - - /** - * Builds the current revision. - * - * @param previousRevision - * content of the previous revision - * @return current revision - */ - public String buildRevision(final String previousRevision) - { - - HashMap bufferMap = new HashMap<>(); - - StringBuilder output = new StringBuilder(); - if (previousRevision != null) { - output.append(previousRevision); - } - - int size = parts.size(); - DiffPart part; - - for (int i = 0; i < size; i++) { - - part = parts.get(i); - - switch (part.getAction()) { - case FULL_REVISION_UNCOMPRESSED: - output = new StringBuilder(); - output.insert(0, part.getText()); - break; - case INSERT: - output.insert(part.getStart(), part.getText()); - break; - case DELETE: - output.delete(part.getStart(), part.getEnd()); - break; - case REPLACE: - output.replace(part.getStart(), part.getEnd(), part.getText()); - break; - case CUT: - bufferMap.put(part.getText(), - output.substring(part.getStart(), part.getEnd())); - output.delete(part.getStart(), part.getEnd()); - break; - case PASTE: - output.insert(part.getStart(), bufferMap.remove(part.getText())); - break; - 
default: - throw new RuntimeException("UNKNOWN PART ACTION"); - } - } - - return output.toString(); - } - - /** - * Returns an estimation of the size used to stored the data. - * - * @return estimated size - */ - public long byteSize() - { - - long byteSize = 3; - - int size = parts.size(); - - for (int i = 0; i < size; i++) { - byteSize += this.parts.get(i).byteSize(); - } - - return byteSize; - } - - /** - * Returns the referenced diff part. - * - * @param index - * index of the diff part - * @return diff part - */ - public DiffPart get(final int index) - { - return this.parts.get(index); - } - - /** - * Returns the codec data. - * - * @return codec - */ - public RevisionCodecData getCodecData() - { - return codecData; - } - - /** - * Returns the revision counter. - * - * @return revision counter - */ - public int getRevisionCounter() - { - return this.revisionCounter; - } - - /* - * (non-Javadoc) - * - * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#getRevisionID() - */ - public int getRevisionID() - { - return revisionID; - } - - /* - * (non-Javadoc) - * - * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#getTimeStamp() - */ - public Timestamp getTimeStamp() - { - return timeStamp; - } - - /** - * Returns whether the revision described by this diff is a full revision or - * not. - * - * @return TRUE | FALSE - */ - public boolean isFullRevision() - { - if (this.parts.size() == 1) { - DiffPart p = this.parts.get(0); - if (p.getAction() == DiffAction.FULL_REVISION_UNCOMPRESSED) { - return true; - } - } - - return false; - } - - /* - * (non-Javadoc) - * - * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#iterator() - */ - public Iterator iterator() - { - return this.parts.iterator(); - } - - /** - * Sets the codec data. - * - * @param codecData - * coded data - */ - public void setCodecData(final RevisionCodecData codecData) - { - this.codecData = codecData; - } - - /** - * Sets the revision counter. 
- * - * @param revisionCounter - * revision counter - */ - public void setRevisionCoutner(final int revisionCounter) - { - this.revisionCounter = revisionCounter; - } - - /* - * (non-Javadoc) - * - * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#setRevisionID(int) - */ - public void setRevisionID(final int revisionID) - { - this.revisionID = revisionID; - } - - /* - * (non-Javadoc) - * - * @see - * de.tud.ukp.kulessa.delta.data.IRevisionChange#setTimeStamp(java.lang. - * String) - */ - public void setTimeStamp(final Timestamp timeStamp) - { - this.timeStamp = timeStamp; - } - - /** - * Returns the number of stored diff parts. - * - * @return number of diff parts - */ - public int size() - { - return this.parts.size(); - } - - /** - * Returns the string representation of the diff content. - * - * @return string representation of the diff parts - */ - @Override - public String toString() - { - StringBuilder builder = new StringBuilder(); - for (int i = 0; i < parts.size(); i++) { - builder.append(parts.get(i).toString() + "\n"); - } - return builder.toString(); - } - - public void setComment(String comment) - { - this.comment = comment; - } - - public String getComment() - { - return comment; - } - - public void setMinor(boolean isMinor) - { - this.isMinor = isMinor; - } - - public boolean isMinor() - { - return isMinor; - } - - public void setContributorName(String contributorName) - { - this.contributorName = contributorName; - } - - public String getContributorName() - { - return contributorName; - } - - public void setContributorIsRegistered(boolean contributorIsRegistered) - { - this.contributorIsRegistered = contributorIsRegistered; - } - - public boolean getContributorIsRegistered() - { - return contributorIsRegistered; - } - - public void setContributorId(Integer contributorId) - { - this.contributorId = contributorId; - } - - public Integer getContributorId() - { - return contributorId; - } + implements ISizeable { + + /** + * Reference to the codec + 
*/ + private RevisionCodecData codecData; + + /** + * List of DiffParts + */ + private final List parts; + + /** + * Revision counter + */ + private int revisionCounter; + + /** + * Revision ID + */ + private int revisionID; + + /** + * Timestamp + */ + private Timestamp timeStamp; + + /** + * Username/IP of the contributor who created this revision + */ + private String contributorName; + + /** + * ID of the contributor who created this revision + */ + private Integer contributorId; + + /** + * Determine whether the contributor was registered. + * True: contributorName= username + * False: contributorName= IP + */ + private boolean contributorIsRegistered; + + /** + * The user comment for this revision + */ + private String comment; + + /** + * Determine whether revision is a minor revision + */ + private boolean isMinor = false; + + /** + * (Constructor) Creates a new empty Diff. + */ + public Diff() { + this.parts = new ArrayList<>(); + } + + /** + * Adds a DiffPart. + * + * @param diff DiffPart + */ + public void add(final DiffPart diff) { + this.parts.add(diff); + } + + /** + * Builds the current revision. + * + * @param previousRevision content of the previous revision + * @return current revision + */ + public String buildRevision(final char[] previousRevision) { + String prevRev = null; + if (previousRevision != null) { + prevRev = String.valueOf(previousRevision); + } + + return buildRevision(prevRev); + } + + /** + * Builds the current revision. 
+ * + * @param previousRevision content of the previous revision + * @return current revision + */ + public String buildRevision(final String previousRevision) { + + HashMap bufferMap = new HashMap<>(); + + StringBuilder output = new StringBuilder(); + if (previousRevision != null) { + output.append(previousRevision); + } + + int size = parts.size(); + DiffPart part; + + for (int i = 0; i < size; i++) { + + part = parts.get(i); + + switch (part.getAction()) { + case FULL_REVISION_UNCOMPRESSED: + output = new StringBuilder(); + output.insert(0, part.getText()); + break; + case INSERT: + output.insert(part.getStart(), part.getText()); + break; + case DELETE: + output.delete(part.getStart(), part.getEnd()); + break; + case REPLACE: + output.replace(part.getStart(), part.getEnd(), part.getText()); + break; + case CUT: + bufferMap.put(part.getText(), + output.substring(part.getStart(), part.getEnd())); + output.delete(part.getStart(), part.getEnd()); + break; + case PASTE: + output.insert(part.getStart(), bufferMap.remove(part.getText())); + break; + default: + throw new RuntimeException("UNKNOWN PART ACTION"); + } + } + + return output.toString(); + } + + /** + * Returns an estimation of the size used to stored the data. + * + * @return estimated size + */ + public long byteSize() { + + long byteSize = 3; + + int size = parts.size(); + + for (int i = 0; i < size; i++) { + byteSize += this.parts.get(i).byteSize(); + } + + return byteSize; + } + + /** + * Returns the referenced diff part. + * + * @param index index of the diff part + * @return diff part + */ + public DiffPart get(final int index) { + return this.parts.get(index); + } + + /** + * Returns the codec data. + * + * @return codec + */ + public RevisionCodecData getCodecData() { + return codecData; + } + + /** + * Returns the revision counter. 
+ * + * @return revision counter + */ + public int getRevisionCounter() { + return this.revisionCounter; + } + + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#getRevisionID() + */ + public int getRevisionID() { + return revisionID; + } + + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#getTimeStamp() + */ + public Timestamp getTimeStamp() { + return timeStamp; + } + + /** + * Returns whether the revision described by this diff is a full revision or + * not. + * + * @return TRUE | FALSE + */ + public boolean isFullRevision() { + if (this.parts.size() == 1) { + DiffPart p = this.parts.get(0); + if (p.getAction() == DiffAction.FULL_REVISION_UNCOMPRESSED) { + return true; + } + } + + return false; + } + + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#iterator() + */ + public Iterator iterator() { + return this.parts.iterator(); + } + + /** + * Sets the codec data. + * + * @param codecData coded data + */ + public void setCodecData(final RevisionCodecData codecData) { + this.codecData = codecData; + } + + /** + * Sets the revision counter. + * + * @param revisionCounter revision counter + */ + public void setRevisionCoutner(final int revisionCounter) { + this.revisionCounter = revisionCounter; + } + + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#setRevisionID(int) + */ + public void setRevisionID(final int revisionID) { + this.revisionID = revisionID; + } + + /* + * (non-Javadoc) + * + * @see + * de.tud.ukp.kulessa.delta.data.IRevisionChange#setTimeStamp(java.lang. + * String) + */ + public void setTimeStamp(final Timestamp timeStamp) { + this.timeStamp = timeStamp; + } + + /** + * Returns the number of stored diff parts. + * + * @return number of diff parts + */ + public int size() { + return this.parts.size(); + } + + /** + * Returns the string representation of the diff content. 
+ * + * @return string representation of the diff parts + */ + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < parts.size(); i++) { + builder.append(parts.get(i).toString() + "\n"); + } + return builder.toString(); + } + + public void setComment(String comment) { + this.comment = comment; + } + + public String getComment() { + return comment; + } + + public void setMinor(boolean isMinor) { + this.isMinor = isMinor; + } + + public boolean isMinor() { + return isMinor; + } + + public void setContributorName(String contributorName) { + this.contributorName = contributorName; + } + + public String getContributorName() { + return contributorName; + } + + public void setContributorIsRegistered(boolean contributorIsRegistered) { + this.contributorIsRegistered = contributorIsRegistered; + } + + public boolean getContributorIsRegistered() { + return contributorIsRegistered; + } + + public void setContributorId(Integer contributorId) { + this.contributorId = contributorId; + } + + public Integer getContributorId() { + return contributorId; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffAction.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffAction.java index 2a403385..cdc1f931 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffAction.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffAction.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,97 +25,102 @@ /** * This class contains the constants for the DiffActions. - * - * - * */ -public enum DiffAction implements Serializable -{ +public enum DiffAction implements Serializable { - /** Codec */ - DECODER_DATA((byte) 0), + /** + * Codec + */ + DECODER_DATA((byte) 0), - /** Full Revision */ - FULL_REVISION_UNCOMPRESSED((byte) 1), + /** + * Full Revision + */ + FULL_REVISION_UNCOMPRESSED((byte) 1), - /** Insert operation */ - INSERT((byte) 2), + /** + * Insert operation + */ + INSERT((byte) 2), - /** Delete operation */ - DELETE((byte) 3), + /** + * Delete operation + */ + DELETE((byte) 3), - /** Replace operation */ - REPLACE((byte) 4), + /** + * Replace operation + */ + REPLACE((byte) 4), - /** Cut operation */ - CUT((byte) 5), + /** + * Cut operation + */ + CUT((byte) 5), - /** Paste operation */ - PASTE((byte) 6)/* - * , - * - * FULL_REVISION_COMPRESSED((byte)7) - */; + /** + * Paste operation + */ + PASTE((byte) 6)/* + * , + * + * FULL_REVISION_COMPRESSED((byte)7) + */; - /** byte constant */ - private final byte code; + /** + * byte constant + */ + private final byte code; - /** - * Creates a DiffAction. - * - * @param code - * byte constant - */ - DiffAction(final byte code) - { - this.code = code; - } + /** + * Creates a DiffAction. 
+ * + * @param code byte constant + */ + DiffAction(final byte code) { + this.code = code; + } - /** - * Returns the byte constant - * - * @return value of the constant - */ - public byte getValue() - { - return code; - } + /** + * Returns the byte constant + * + * @return value of the constant + */ + public byte getValue() { + return code; + } - /** - * Returns the appropriate DiffAction value. - * - * @param val - * byte value - * @return DiffAction - * - * @throws DecodingException - * if the value does not match one of the predefined byte - * constants - */ - public static DiffAction parse(final int val) - throws DecodingException - { + /** + * Returns the appropriate DiffAction value. + * + * @param val byte value + * @return DiffAction + * @throws DecodingException if the value does not match one of the predefined byte + * constants + */ + public static DiffAction parse(final int val) + throws DecodingException { - switch (val) { - case 0: - return DECODER_DATA; - case 1: - return FULL_REVISION_UNCOMPRESSED; - case 2: - return INSERT; - case 3: - return DELETE; - case 4: - return REPLACE; - case 5: - return CUT; - case 6: - return PASTE; - // case 7: return FULL_REVISION_COMPRESSED; - default: - throw ErrorFactory.createDecodingException( - ErrorKeys.DIFFTOOL_ENCODING_INVALID_VALUE, - "Invalid value: " + val); - } - } + switch (val) { + case 0: + return DECODER_DATA; + case 1: + return FULL_REVISION_UNCOMPRESSED; + case 2: + return INSERT; + case 3: + return DELETE; + case 4: + return REPLACE; + case 5: + return CUT; + case 6: + return PASTE; + // case 7: return FULL_REVISION_COMPRESSED; + default: + throw ErrorFactory.createDecodingException( + ErrorKeys.DIFFTOOL_ENCODING_INVALID_VALUE, + "Invalid value: " + val); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffPart.java 
b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffPart.java index 67d425bc..85d0db84 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffPart.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffPart.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,168 +22,157 @@ /** * The DiffPart class represents the operation used to create a new revision * from an older revision. - * - * - * */ -public class DiffPart implements Serializable -{ - - private static final long serialVersionUID = 6208903899064982679L; - - /** Start position of the text block */ - private int start; - - /** Lengthof the text block */ - private int length; - - /** DiffAction value */ - private final DiffAction action; - - /** Textual information */ - private String text; - - /** - * (Constructor) Creates a new DiffPart object. - * - * @param action - * DiffAction - */ - public DiffPart(final DiffAction action) - { - - this.action = action; - } - - /** - * Returns the length of the text block. 
- * - * @return length of the text block - */ - public int getLength() - { - return length; - } - - /** - * Sets the length of the text block. - * - * @param length - * length of the text block - */ - public void setLength(final int length) - { - this.length = length; - } - - /** - * Returns the start position of the text block. - * - * @return start position - */ - public int getStart() - { - return start; - } - - /** - * Returns the end position of the text block. - * - * @return end position - */ - public int getEnd() - { - return start + length; - } - - /** - * Sets the start position of the text block. - * - * @param start - * start position - */ - public void setStart(final int start) - { - this.start = start; - } - - /** - * Sets the textual information. - * - * @param text - * content - */ - public void setText(final String text) - { - this.text = text; - } - - /** - * Returns the DiffAction value. - * - * @return DiffAction - */ - public DiffAction getAction() - { - return this.action; - } - - /** - * Returns the textual information. - * - * @return content - */ - public String getText() - { - return this.text; - } - - /** - * Returns a representation of the DiffAction content. - * - * @return [ DiffAction, start position, length, content ] - */ - @Override - public String toString() - { - return "[" + action + " " + start + " " + length + " " + text + "]\n"; - } - - /** - * Returns the estimated number of bytes used to encode the contained - * information. 
- * - * @return estimated size in bytes - */ - public int byteSize() - { - if (text == null) { - return 9; - } - return 9 + text.length(); - } - - /* (non-Javadoc) - * @see java.lang.Object#equals(java.lang.Object) - * - * DiffParts are equal if their text, actions and spans are equal - */ - @Override - public boolean equals(Object anObject) { - - if(!(anObject instanceof DiffPart)){ - return false; - }else{ - DiffPart otherRev = (DiffPart)anObject; - if (this.getText().equals(otherRev.getText()) - && this.getAction() == otherRev.getAction() - && this.getStart() == otherRev.getStart() - && this.getEnd() == otherRev.getEnd()) { - return true; - }else{ - return false; - } - } +public class DiffPart implements Serializable { + + private static final long serialVersionUID = 6208903899064982679L; + + /** + * Start position of the text block + */ + private int start; + + /** + * Lengthof the text block + */ + private int length; + + /** + * DiffAction value + */ + private final DiffAction action; + + /** + * Textual information + */ + private String text; + + /** + * (Constructor) Creates a new DiffPart object. + * + * @param action DiffAction + */ + public DiffPart(final DiffAction action) { + + this.action = action; + } + + /** + * Returns the length of the text block. + * + * @return length of the text block + */ + public int getLength() { + return length; + } + + /** + * Sets the length of the text block. + * + * @param length length of the text block + */ + public void setLength(final int length) { + this.length = length; + } + + /** + * Returns the start position of the text block. + * + * @return start position + */ + public int getStart() { + return start; + } + + /** + * Returns the end position of the text block. + * + * @return end position + */ + public int getEnd() { + return start + length; + } + + /** + * Sets the start position of the text block. 
+ * + * @param start start position + */ + public void setStart(final int start) { + this.start = start; + } + + /** + * Sets the textual information. + * + * @param text content + */ + public void setText(final String text) { + this.text = text; + } + + /** + * Returns the DiffAction value. + * + * @return DiffAction + */ + public DiffAction getAction() { + return this.action; + } + + /** + * Returns the textual information. + * + * @return content + */ + public String getText() { + return this.text; + } + + /** + * Returns a representation of the DiffAction content. + * + * @return [ DiffAction, start position, length, content ] + */ + @Override + public String toString() { + return "[" + action + " " + start + " " + length + " " + text + "]\n"; + } + + /** + * Returns the estimated number of bytes used to encode the contained + * information. + * + * @return estimated size in bytes + */ + public int byteSize() { + if (text == null) { + return 9; + } + return 9 + text.length(); + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + * + * DiffParts are equal if their text, actions and spans are equal + */ + @Override + public boolean equals(Object anObject) { + + if (!(anObject instanceof DiffPart)) { + return false; + } else { + DiffPart otherRev = (DiffPart) anObject; + if (this.getText().equals(otherRev.getText()) + && this.getAction() == otherRev.getAction() + && this.getStart() == otherRev.getStart() + && this.getEnd() == otherRev.getEnd()) { + return true; + } else { + return false; + } } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/info/ArticleInformation.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/info/ArticleInformation.java index eb0edd0f..6e883ed7 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/info/ArticleInformation.java +++ 
b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/info/ArticleInformation.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,450 +22,429 @@ /** * This class contains all statistical information related to one article. - * - * - * */ -public class ArticleInformation -{ - - /** Article ID */ - private int articleId; - - /** Name of the article */ - private String articleName; - - /** Diffed size of the article */ - private long diffedSize; - - /** Number of diff parts used */ - private int diffPartCounter; - - /** Encoded size of the article */ - private long encodedSize; - - /** UNCOMPRESSED encoded size of the article */ - private long encodedSQLSize; - - /** Time the task entered the system */ - private long enteringTime; - - /** Time the task exited the system */ - private long exitingTime; - - /** Number of ignored revisions */ - private int ignoredRevisionsCounter; - - /** Original size of the article */ - private long originalSize; - - /** Time used to diff the task */ - private long processingTimeDiff; - - /** Time used to read the task */ - private long processingTimeRead; - - /** Time used to encode the task */ - private long processingTimeSQL; - - /** Number of parsed revisions related to this 
article */ - private int readRevisionCounter; - - /** Value of the revision counter after finishing the diff processing */ - private int revisionCounter; - - /** - * (Constructor) Creates a new ArticleInformation object. - */ - public ArticleInformation() - { - this.articleId = -1; - // this.timeStamp = null; - this.articleName = null; - - this.revisionCounter = 0; - this.ignoredRevisionsCounter = 0; - this.diffPartCounter = 0; - - this.originalSize = 0; - this.diffedSize = 0; - this.encodedSize = 0; - this.encodedSQLSize = 0; - - this.enteringTime = 0; - this.exitingTime = 0; - } - - /** - * Returns the ID of the article. - * - * @return Article ID - */ - public int getArticleId() - { - return articleId; - } - - /** - * Returns the name of the article. - * - * @return Article name - */ - public String getArticleName() - { - return articleName; - } - - /** - * Returns the diffed size of the article. - * - * @return diffed size - */ - public long getDiffedSize() - { - return diffedSize; - } - - /** - * Returns the number of diff parts. - * - * @return number of diff parts - */ - public int getDiffPartCounter() - { - return diffPartCounter; - } - - /** - * Returns the encoded size of the article. - * - * @return encoded size - */ - public long getEncodedSize() - { - return encodedSize; - } - - /** - * Returns the size of the article after the sql encoding. - * - * @return size after encoding - */ - public long getEncodedSQLSize() - { - return encodedSQLSize; - } - - /** - * Returns the entering time. - * - * @return entering time - */ - public long getEnteringTime() - { - return enteringTime; - } - - /** - * Returns the exiting time. - * - * @return exiting time - */ - public long getExitingTime() - { - return exitingTime; - } - - /** - * Returns the number of ignored revisions. - * - * @return number of ignored revisions - */ - public int getIgnoredRevisionsCounter() - { - return ignoredRevisionsCounter; - } - - /** - * Returns the original size of the article. 
- * - * @return original size - */ - public long getOriginalSize() - { - return originalSize; - } - - /** - * Returns the time used for the diff encoding. - * - * @return processing time diff - */ - public long getProcessingTimeDiff() - { - return processingTimeDiff; - } - - /** - * Returns the time used for reading the task. - * - * @return processing time reading - */ - public long getProcessingTimeRead() - { - return processingTimeRead; - } - - /** - * Returns the time used for the sql encoding. - * - * @return processing time encoding - */ - public long getProcessingTimeSQL() - { - return processingTimeSQL; - } - - /** - * Returns the number of parsed revisions. - * - * @return number of parsed revisions - */ - public int getReadRevisionCounter() - { - return readRevisionCounter; - } - - /** - * Returns the revision counter. - * - * @return revision counter - */ - public int getRevisionCounter() - { - return revisionCounter; - } - - /** - * Sets the ID of the article. - * - * @param articleId - * Article ID - */ - public void setArticleId(final int articleId) - { - this.articleId = articleId; - } - - /** - * Sets the name of the article. - * - * @param articleName - * Article name - */ - public void setArticleName(final String articleName) - { - this.articleName = articleName; - } - - /** - * Sets the diffed size of the article. - * - * @param diffedSize - * diffed size - */ - public void setDiffedSize(final long diffedSize) - { - this.diffedSize = diffedSize; - } - - /** - * Sets the number of diff parts. - * - * @param diffPartCounter - * number of diff parts - */ - public void setDiffPartCounter(final int diffPartCounter) - { - this.diffPartCounter = diffPartCounter; - } - - /** - * Sets the encoded size of the article. - * - * @param encodedSize - * encoded size - */ - public void setEncodedSize(final long encodedSize) - { - this.encodedSize = encodedSize; - } - - /** - * Sets the size of the article after the sql encoding. 
- * - * @param encodedSQLSize - * size after encoding - */ - public void setEncodedSQLSize(final long encodedSQLSize) - { - this.encodedSQLSize = encodedSQLSize; - } - - /** - * Sets the entering time of the first task for this article. - * - * @param enteringTime - * entering time - */ - public void setEnteringTime(final long enteringTime) - { - this.enteringTime = enteringTime; - } - - /** - * Sets the exiting time of the last task for this article. - * - * @param exitingTime - * exiting time - */ - public void setExitingTime(final long exitingTime) - { - this.exitingTime = exitingTime; - } - - /** - * Sets the number of ignored revisions. - * - * @param ignoredRevisionsCounter - * number of ignored revisions - */ - public void setIgnoredRevisionsCounter(final int ignoredRevisionsCounter) - { - this.ignoredRevisionsCounter = ignoredRevisionsCounter; - } - - /** - * Sets the original size of the article. - * - * @param originalSize - * original size - */ - public void setOriginalSize(final long originalSize) - { - this.originalSize = originalSize; - } - - /** - * Sets the time used for the diff encoding. - * - * @param processingTimeDiff - * processing time diff - */ - public void setProcessingTimeDiff(final long processingTimeDiff) - { - this.processingTimeDiff = processingTimeDiff; - } - - /** - * Sets the time used for reading the task. - * - * @param processingTimeRead - * processing time reading - */ - public void setProcessingTimeRead(final long processingTimeRead) - { - this.processingTimeRead = processingTimeRead; - } - - /** - * Sets the time used for the sql encoding. - * - * @param processingTimeSQL - * processing time encoding - */ - public void setProcessingTimeSQL(final long processingTimeSQL) - { - this.processingTimeSQL = processingTimeSQL; - } - - /** - * Sets the number of parsed revisions. 
- * - * @param readRevisionCounter - * number of parsed revisions - */ - public void setReadRevisionCounter(final int readRevisionCounter) - { - this.readRevisionCounter = readRevisionCounter; - } - - /** - * Sets the revision counter. - * - * @param nrRevisions - * revision counter - */ - public void setRevisionCounter(final int nrRevisions) - { - this.revisionCounter = nrRevisions; - } - - /** - * Returns the string representation of this object. Used for logging the - * statistical data. - * - * @return content representation - */ - public String toString() - { - - long sysTime = this.exitingTime - this.enteringTime; - - StringBuilder b = new StringBuilder(); - b.append("\n[\tARTICLEID: \t"); - b.append(articleId); - b.append("\r\n\tARTICLENAME: \t"); - b.append(articleName); - b.append("\r\n\r\n\tNUMBER REVISIONS:\t["); - b.append(this.revisionCounter); - b.append(" + "); - b.append(this.ignoredRevisionsCounter); - b.append(" = "); - b.append(this.readRevisionCounter); - b.append("]\r\n\tNUMBER DIFFPARTS:\t"); - b.append(this.diffPartCounter); - b.append("\r\n\r\n\tSYSTEM TIME: \t[ 100% ]\t"); - b.append(Time.toClock(sysTime)); - b.append("\r\n\tREADING TIME: \t["); - b.append(MathUtilities.percentFrom(this.processingTimeRead, sysTime)); - b.append("]\t"); - b.append(Time.toClock(this.processingTimeRead)); - b.append("\r\n\tDIFFING TIME: \t["); - b.append(MathUtilities.percentFrom(this.processingTimeDiff, sysTime)); - b.append("]\t"); - b.append(Time.toClock(this.processingTimeDiff)); - b.append("\r\n\tENCODING TIME: \t["); - b.append(MathUtilities.percentFrom(this.processingTimeSQL, sysTime)); - b.append("]\t"); - b.append(Time.toClock(this.processingTimeSQL)); - b.append("\r\n\r\n\tORIGINAL SIZE: \t[ 100% ]\t"); - b.append(this.originalSize); - b.append("\r\n\tDIFFED SIZE: \t["); - b.append(MathUtilities.percentFrom(this.diffedSize, this.originalSize)); - b.append("]\t"); - b.append(this.diffedSize); - b.append("\r\n\tENCODED SIZE: \t["); - 
b.append(MathUtilities.percentFrom(this.encodedSize, this.originalSize)); - b.append("]\t"); - b.append(this.encodedSize); - b.append("\r\n\tENCODED UNCOMPRESSED SIZE: \t["); - b.append(MathUtilities.percentFrom(this.encodedSQLSize, - this.originalSize)); - b.append("]\t"); - b.append(this.encodedSQLSize); - b.append("\r\n]\r\n"); - - return b.toString(); - } +public class ArticleInformation { + + /** + * Article ID + */ + private int articleId; + + /** + * Name of the article + */ + private String articleName; + + /** + * Diffed size of the article + */ + private long diffedSize; + + /** + * Number of diff parts used + */ + private int diffPartCounter; + + /** + * Encoded size of the article + */ + private long encodedSize; + + /** + * UNCOMPRESSED encoded size of the article + */ + private long encodedSQLSize; + + /** + * Time the task entered the system + */ + private long enteringTime; + + /** + * Time the task exited the system + */ + private long exitingTime; + + /** + * Number of ignored revisions + */ + private int ignoredRevisionsCounter; + + /** + * Original size of the article + */ + private long originalSize; + + /** + * Time used to diff the task + */ + private long processingTimeDiff; + + /** + * Time used to read the task + */ + private long processingTimeRead; + + /** + * Time used to encode the task + */ + private long processingTimeSQL; + + /** + * Number of parsed revisions related to this article + */ + private int readRevisionCounter; + + /** + * Value of the revision counter after finishing the diff processing + */ + private int revisionCounter; + + /** + * (Constructor) Creates a new ArticleInformation object. 
+ */ + public ArticleInformation() { + this.articleId = -1; + // this.timeStamp = null; + this.articleName = null; + + this.revisionCounter = 0; + this.ignoredRevisionsCounter = 0; + this.diffPartCounter = 0; + + this.originalSize = 0; + this.diffedSize = 0; + this.encodedSize = 0; + this.encodedSQLSize = 0; + + this.enteringTime = 0; + this.exitingTime = 0; + } + + /** + * Returns the ID of the article. + * + * @return Article ID + */ + public int getArticleId() { + return articleId; + } + + /** + * Returns the name of the article. + * + * @return Article name + */ + public String getArticleName() { + return articleName; + } + + /** + * Returns the diffed size of the article. + * + * @return diffed size + */ + public long getDiffedSize() { + return diffedSize; + } + + /** + * Returns the number of diff parts. + * + * @return number of diff parts + */ + public int getDiffPartCounter() { + return diffPartCounter; + } + + /** + * Returns the encoded size of the article. + * + * @return encoded size + */ + public long getEncodedSize() { + return encodedSize; + } + + /** + * Returns the size of the article after the sql encoding. + * + * @return size after encoding + */ + public long getEncodedSQLSize() { + return encodedSQLSize; + } + + /** + * Returns the entering time. + * + * @return entering time + */ + public long getEnteringTime() { + return enteringTime; + } + + /** + * Returns the exiting time. + * + * @return exiting time + */ + public long getExitingTime() { + return exitingTime; + } + + /** + * Returns the number of ignored revisions. + * + * @return number of ignored revisions + */ + public int getIgnoredRevisionsCounter() { + return ignoredRevisionsCounter; + } + + /** + * Returns the original size of the article. + * + * @return original size + */ + public long getOriginalSize() { + return originalSize; + } + + /** + * Returns the time used for the diff encoding. 
+ * + * @return processing time diff + */ + public long getProcessingTimeDiff() { + return processingTimeDiff; + } + + /** + * Returns the time used for reading the task. + * + * @return processing time reading + */ + public long getProcessingTimeRead() { + return processingTimeRead; + } + + /** + * Returns the time used for the sql encoding. + * + * @return processing time encoding + */ + public long getProcessingTimeSQL() { + return processingTimeSQL; + } + + /** + * Returns the number of parsed revisions. + * + * @return number of parsed revisions + */ + public int getReadRevisionCounter() { + return readRevisionCounter; + } + + /** + * Returns the revision counter. + * + * @return revision counter + */ + public int getRevisionCounter() { + return revisionCounter; + } + + /** + * Sets the ID of the article. + * + * @param articleId Article ID + */ + public void setArticleId(final int articleId) { + this.articleId = articleId; + } + + /** + * Sets the name of the article. + * + * @param articleName Article name + */ + public void setArticleName(final String articleName) { + this.articleName = articleName; + } + + /** + * Sets the diffed size of the article. + * + * @param diffedSize diffed size + */ + public void setDiffedSize(final long diffedSize) { + this.diffedSize = diffedSize; + } + + /** + * Sets the number of diff parts. + * + * @param diffPartCounter number of diff parts + */ + public void setDiffPartCounter(final int diffPartCounter) { + this.diffPartCounter = diffPartCounter; + } + + /** + * Sets the encoded size of the article. + * + * @param encodedSize encoded size + */ + public void setEncodedSize(final long encodedSize) { + this.encodedSize = encodedSize; + } + + /** + * Sets the size of the article after the sql encoding. 
+ * + * @param encodedSQLSize size after encoding + */ + public void setEncodedSQLSize(final long encodedSQLSize) { + this.encodedSQLSize = encodedSQLSize; + } + + /** + * Sets the entering time of the first task for this article. + * + * @param enteringTime entering time + */ + public void setEnteringTime(final long enteringTime) { + this.enteringTime = enteringTime; + } + + /** + * Sets the exiting time of the last task for this article. + * + * @param exitingTime exiting time + */ + public void setExitingTime(final long exitingTime) { + this.exitingTime = exitingTime; + } + + /** + * Sets the number of ignored revisions. + * + * @param ignoredRevisionsCounter number of ignored revisions + */ + public void setIgnoredRevisionsCounter(final int ignoredRevisionsCounter) { + this.ignoredRevisionsCounter = ignoredRevisionsCounter; + } + + /** + * Sets the original size of the article. + * + * @param originalSize original size + */ + public void setOriginalSize(final long originalSize) { + this.originalSize = originalSize; + } + + /** + * Sets the time used for the diff encoding. + * + * @param processingTimeDiff processing time diff + */ + public void setProcessingTimeDiff(final long processingTimeDiff) { + this.processingTimeDiff = processingTimeDiff; + } + + /** + * Sets the time used for reading the task. + * + * @param processingTimeRead processing time reading + */ + public void setProcessingTimeRead(final long processingTimeRead) { + this.processingTimeRead = processingTimeRead; + } + + /** + * Sets the time used for the sql encoding. + * + * @param processingTimeSQL processing time encoding + */ + public void setProcessingTimeSQL(final long processingTimeSQL) { + this.processingTimeSQL = processingTimeSQL; + } + + /** + * Sets the number of parsed revisions. 
+ * + * @param readRevisionCounter number of parsed revisions + */ + public void setReadRevisionCounter(final int readRevisionCounter) { + this.readRevisionCounter = readRevisionCounter; + } + + /** + * Sets the revision counter. + * + * @param nrRevisions revision counter + */ + public void setRevisionCounter(final int nrRevisions) { + this.revisionCounter = nrRevisions; + } + + /** + * Returns the string representation of this object. Used for logging the + * statistical data. + * + * @return content representation + */ + public String toString() { + + long sysTime = this.exitingTime - this.enteringTime; + + StringBuilder b = new StringBuilder(); + b.append("\n[\tARTICLEID: \t"); + b.append(articleId); + b.append("\r\n\tARTICLENAME: \t"); + b.append(articleName); + b.append("\r\n\r\n\tNUMBER REVISIONS:\t["); + b.append(this.revisionCounter); + b.append(" + "); + b.append(this.ignoredRevisionsCounter); + b.append(" = "); + b.append(this.readRevisionCounter); + b.append("]\r\n\tNUMBER DIFFPARTS:\t"); + b.append(this.diffPartCounter); + b.append("\r\n\r\n\tSYSTEM TIME: \t[ 100% ]\t"); + b.append(Time.toClock(sysTime)); + b.append("\r\n\tREADING TIME: \t["); + b.append(MathUtilities.percentFrom(this.processingTimeRead, sysTime)); + b.append("]\t"); + b.append(Time.toClock(this.processingTimeRead)); + b.append("\r\n\tDIFFING TIME: \t["); + b.append(MathUtilities.percentFrom(this.processingTimeDiff, sysTime)); + b.append("]\t"); + b.append(Time.toClock(this.processingTimeDiff)); + b.append("\r\n\tENCODING TIME: \t["); + b.append(MathUtilities.percentFrom(this.processingTimeSQL, sysTime)); + b.append("]\t"); + b.append(Time.toClock(this.processingTimeSQL)); + b.append("\r\n\r\n\tORIGINAL SIZE: \t[ 100% ]\t"); + b.append(this.originalSize); + b.append("\r\n\tDIFFED SIZE: \t["); + b.append(MathUtilities.percentFrom(this.diffedSize, this.originalSize)); + b.append("]\t"); + b.append(this.diffedSize); + b.append("\r\n\tENCODED SIZE: \t["); + 
b.append(MathUtilities.percentFrom(this.encodedSize, this.originalSize)); + b.append("]\t"); + b.append(this.encodedSize); + b.append("\r\n\tENCODED UNCOMPRESSED SIZE: \t["); + b.append(MathUtilities.percentFrom(this.encodedSQLSize, + this.originalSize)); + b.append("]\t"); + b.append(this.encodedSQLSize); + b.append("\r\n]\r\n"); + + return b.toString(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexGenerator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexGenerator.java index 2931347f..501b7fdf 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexGenerator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexGenerator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -33,198 +33,183 @@ /** * Generates the indices for the database. - * - * - * */ -public class IndexGenerator -{ - - /** Reference to the configuration */ - private final RevisionAPIConfiguration config; - - /** - * (Constructor) Creates a new IndexGenerator object. 
- * - * @param config - * Reference to the configuration - */ - public IndexGenerator(final RevisionAPIConfiguration config) - { - this.config = config; - } - - /** - * Starts the generation of the indices. - * - * @throws WikiApiException - * if an error occurs - */ - public void generate() - throws WikiApiException - { - Indexer data = null; - try { - data = new Indexer(config); - - System.out.println("GENERATING INDEX STARTED"); - - long bufferSize = config.getBufferSize(); - Revision rev; - long count = 0; - long last = 0, now, start = System.currentTimeMillis(); - - Iterator it = new IndexIterator(config); - while (it.hasNext()) { - - if (++count % bufferSize == 0) { - now = System.currentTimeMillis() - start; - System.out.println(Time.toClock(now) + "\t" + (now - last) - + "\tINDEXING " + count); - last = now; - } - - rev = it.next(); - data.index(rev); - } - - System.out.println("GENERATING INDEX ENDED + (" - + Time.toClock(System.currentTimeMillis() - start) + ")"); - - } - catch (Exception e) { - - throw new WikiApiException(e); - - } - finally { - if (data != null) { - data.close(); - } - } - } - - /** - * Starts index generation using the database credentials in the - * properties file specified in args[0].
- * The properties file should have the following structure: - *

<ul><li>host=dbhost</li>
- * <li>db=revisiondb</li>
- * <li>user=username</li>
- * <li>password=pwd</li>
- * <li>output=outputFile</li>
- * <li>writeDirectlyToDB=true|false (optional)</li>
- * <li>charset=UTF8 (or others) (optional)</li>
- * <li>buffer=15000 (optional)</li>
- * <li>maxAllowedPackets=16760832 (optional)</li></ul>
- * <br>
- * - * @param args allows only one entry that contains the path to the config file - */ - public static void main(String[] args) - { - - if(args==null||args.length!=1){ - System.out.println(("You need to specify the database configuration file. \n" + - "It should contain the access credentials to you revision database in the following format: \n" + - " host=dbhost \n" + - " db=revisiondb \n" + - " user=username \n" + - " password=pwd \n" + - " output=outputFile \n"+ - " outputDatabase=true|false (optional)\n" + - " outputDatafile=true|false (optional)\n" + - " charset=UTF8 (optional)\n" + - " buffer=15000 (optional)\n"+ - " maxAllowedPackets=16760832 (optional)\n\n" + - " The default output mode is SQL Dump")); - throw new IllegalArgumentException(); - }else{ - Properties props = load(args[0]); - - RevisionAPIConfiguration config = new RevisionAPIConfiguration(); - - config.setHost(props.getProperty("host")); - config.setDatabase(props.getProperty("db")); - config.setUser(props.getProperty("user")); - config.setPassword(props.getProperty("password")); - - String charset=props.getProperty("charset"); - String buffer=props.getProperty("buffer"); - String maxAllowedPackets=props.getProperty("maxAllowedPackets"); +public class IndexGenerator { + + /** + * Reference to the configuration + */ + private final RevisionAPIConfiguration config; + + /** + * (Constructor) Creates a new IndexGenerator object. + * + * @param config Reference to the configuration + */ + public IndexGenerator(final RevisionAPIConfiguration config) { + this.config = config; + } + + /** + * Starts the generation of the indices. 
+ * + * @throws WikiApiException if an error occurs + */ + public void generate() + throws WikiApiException { + Indexer data = null; + try { + data = new Indexer(config); + + System.out.println("GENERATING INDEX STARTED"); + + long bufferSize = config.getBufferSize(); + Revision rev; + long count = 0; + long last = 0, now, start = System.currentTimeMillis(); + + Iterator it = new IndexIterator(config); + while (it.hasNext()) { + + if (++count % bufferSize == 0) { + now = System.currentTimeMillis() - start; + System.out.println(Time.toClock(now) + "\t" + (now - last) + + "\tINDEXING " + count); + last = now; + } + + rev = it.next(); + data.index(rev); + } + + System.out.println("GENERATING INDEX ENDED + (" + + Time.toClock(System.currentTimeMillis() - start) + ")"); + + } catch (Exception e) { + + throw new WikiApiException(e); + + } finally { + if (data != null) { + data.close(); + } + } + } + + /** + * Starts index generation using the database credentials in the + * properties file specified in args[0].
+ * The properties file should have the following structure: + *
<ul><li>host=dbhost</li>
+ * <li>db=revisiondb</li>
+ * <li>user=username</li>
+ * <li>password=pwd</li>
+ * <li>output=outputFile</li>
+ * <li>writeDirectlyToDB=true|false (optional)</li>
+ * <li>charset=UTF8 (or others) (optional)</li>
+ * <li>buffer=15000 (optional)</li>
+ * <li>maxAllowedPackets=16760832 (optional)</li></ul>
+ * <br>
+ * + * @param args allows only one entry that contains the path to the config file + */ + public static void main(String[] args) { + + if (args == null || args.length != 1) { + System.out.println(("You need to specify the database configuration file. \n" + + "It should contain the access credentials to you revision database in the following format: \n" + + " host=dbhost \n" + + " db=revisiondb \n" + + " user=username \n" + + " password=pwd \n" + + " output=outputFile \n" + + " outputDatabase=true|false (optional)\n" + + " outputDatafile=true|false (optional)\n" + + " charset=UTF8 (optional)\n" + + " buffer=15000 (optional)\n" + + " maxAllowedPackets=16760832 (optional)\n\n" + + " The default output mode is SQL Dump")); + throw new IllegalArgumentException(); + } else { + Properties props = load(args[0]); + + RevisionAPIConfiguration config = new RevisionAPIConfiguration(); + + config.setHost(props.getProperty("host")); + config.setDatabase(props.getProperty("db")); + config.setUser(props.getProperty("user")); + config.setPassword(props.getProperty("password")); + + String charset = props.getProperty("charset"); + String buffer = props.getProperty("buffer"); + String maxAllowedPackets = props.getProperty("maxAllowedPackets"); config.setCharacterSet(Objects.requireNonNullElse(charset, "UTF-8")); - if(buffer!=null){ - config.setBufferSize(Integer.parseInt(buffer)); - }else{ - config.setBufferSize(15000); - } - - if(maxAllowedPackets!=null){ - config.setMaxAllowedPacket(Long.parseLong(maxAllowedPackets)); - }else{ - config.setMaxAllowedPacket(16 * 1024 * 1023); - } - - if(props.getProperty("outputDatabase")!=null&&Boolean.parseBoolean(props.getProperty("outputDatabase"))){ - config.setOutputType(OutputTypes.DATABASE); - }else if(props.getProperty("outputDatafile")!=null&&Boolean.parseBoolean(props.getProperty("outputDatafile"))){ - config.setOutputType(OutputTypes.DATAFILE); - } - else{ - config.setOutputType(OutputTypes.SQL); - } - - String output = 
props.getProperty("output"); - File outfile = new File(output); - if(outfile.isDirectory()){ - config.setOutputPath(output); - }else{ - config.setOutputPath(outfile.getParentFile().getPath()); - } - - try { - new IndexGenerator(config).generate(); - } - catch (Exception e) { - e.printStackTrace(); - } - - System.out.println("TERMINATED"); - } - } - - /** - * Load a properties file from the classpath - * - * @param configFilePath - * path to the configuration file - * @return Properties the properties object containing the configuration - * data - */ - private static Properties load(String configFilePath) { - Properties props = new Properties(); - BufferedInputStream fis = null; - try { - File configFile = new File(configFilePath); - fis = new BufferedInputStream(new FileInputStream(configFile)); - props.load(fis); - } - catch(IOException e){ - System.err.println("Could not load configuration file "+configFilePath); - } - finally{ - if(fis!=null){ - try { - fis.close(); - } - catch (IOException e) { - System.err.println("Error closing file stream of configuration file "+configFilePath); - } - } + if (buffer != null) { + config.setBufferSize(Integer.parseInt(buffer)); + } else { + config.setBufferSize(15000); + } + + if (maxAllowedPackets != null) { + config.setMaxAllowedPacket(Long.parseLong(maxAllowedPackets)); + } else { + config.setMaxAllowedPacket(16 * 1024 * 1023); + } + + if (props.getProperty("outputDatabase") != null && Boolean.parseBoolean(props.getProperty("outputDatabase"))) { + config.setOutputType(OutputTypes.DATABASE); + } else if (props.getProperty("outputDatafile") != null && Boolean.parseBoolean(props.getProperty("outputDatafile"))) { + config.setOutputType(OutputTypes.DATAFILE); + } else { + config.setOutputType(OutputTypes.SQL); + } + + String output = props.getProperty("output"); + File outfile = new File(output); + if (outfile.isDirectory()) { + config.setOutputPath(output); + } else { + config.setOutputPath(outfile.getParentFile().getPath()); + 
} + + try { + new IndexGenerator(config).generate(); + } catch (Exception e) { + e.printStackTrace(); + } + + System.out.println("TERMINATED"); + } + } + + /** + * Load a properties file from the classpath + * + * @param configFilePath path to the configuration file + * @return Properties the properties object containing the configuration + * data + */ + private static Properties load(String configFilePath) { + Properties props = new Properties(); + BufferedInputStream fis = null; + try { + File configFile = new File(configFilePath); + fis = new BufferedInputStream(new FileInputStream(configFile)); + props.load(fis); + } catch (IOException e) { + System.err.println("Could not load configuration file " + configFilePath); + } finally { + if (fis != null) { + try { + fis.close(); + } catch (IOException e) { + System.err.println("Error closing file stream of configuration file " + configFilePath); } - return props; - } + } + } + return props; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexIterator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexIterator.java index 8abbb2b0..ebc97434 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexIterator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexIterator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -32,158 +32,151 @@ /** * Iterates over the database to retrieve the necessary information for the * index generation. - * - * - * */ public class IndexIterator - implements Iterator -{ - - /** Reference to the database connection */ - private final Connection connection; - - /** Reference to the ResultSet */ - private ResultSet result; - - /** Reference to the statement */ - private Statement statement; + implements Iterator { + + /** + * Reference to the database connection + */ + private final Connection connection; + + /** + * Reference to the ResultSet + */ + private ResultSet result; + + /** + * Reference to the statement + */ + private Statement statement; + + /** + * Currently used primary kes + */ + private int primaryKey; + + /** + * Configuration parameter - maximum size of a result set + */ + private final int MAX_NUMBER_RESULTS; + + /** + * (Constructor) Creates the IndexIterator object. + * + * @param config Reference to the configuration + * @throws WikiApiException if an error occurs + */ + public IndexIterator(final RevisionAPIConfiguration config) + throws WikiApiException { + + try { + this.primaryKey = -1; + + this.statement = null; + this.result = null; + + String driverDB = "com.mysql.jdbc.Driver"; + Class.forName(driverDB); + + MAX_NUMBER_RESULTS = config.getBufferSize(); + + this.connection = DriverManager.getConnection("jdbc:mysql://" + + config.getHost() + "/" + config.getDatabase(), + config.getUser(), config.getPassword()); + + } catch (SQLException | ClassNotFoundException e) { + throw new WikiApiException(e); + } + } - /** Currently used primary kes */ - private int primaryKey; + /** + * Queries the database for more revision information. 
+ * + * @return TRUE if the resultset contains elements FALSE otherwise + * @throws SQLException if an error occurs while accessing the database + */ + private boolean query() + throws SQLException { + statement = this.connection.createStatement(); + + String query = "SELECT PrimaryKey, RevisionCounter," + + " RevisionID, ArticleID, Timestamp, FullRevisionID " + + "FROM revisions"; + + if (primaryKey > 0) { + query += " WHERE PrimaryKey > " + primaryKey; + } + + if (MAX_NUMBER_RESULTS > 0) { + query += " LIMIT " + MAX_NUMBER_RESULTS; + } + + result = statement.executeQuery(query); + return result.next(); + } - /** Configuration parameter - maximum size of a result set */ - private final int MAX_NUMBER_RESULTS; + /** + * Returns the next revision information. (Does not contain the encoded + * diff) + * + * @return Revision + */ + public Revision next() { + try { + Revision revision = new Revision(result.getInt(2)); - /** - * (Constructor) Creates the IndexIterator object. - * - * @param config - * Reference to the configuration - * - * @throws WikiApiException - * if an error occurs - */ - public IndexIterator(final RevisionAPIConfiguration config) - throws WikiApiException - { + this.primaryKey = result.getInt(1); + revision.setPrimaryKey(this.primaryKey); - try { - this.primaryKey = -1; + revision.setRevisionID(result.getInt(3)); + revision.setArticleID(result.getInt(4)); + revision.setTimeStamp(new Timestamp(result.getLong(5))); + revision.setFullRevisionID(result.getInt(6)); - this.statement = null; - this.result = null; + return revision; - String driverDB = "com.mysql.jdbc.Driver"; - Class.forName(driverDB); + } catch (Exception e) { - MAX_NUMBER_RESULTS = config.getBufferSize(); + e.printStackTrace(); - this.connection = DriverManager.getConnection("jdbc:mysql://" - + config.getHost() + "/" + config.getDatabase(), - config.getUser(), config.getPassword()); + throw new RuntimeException(e); + } + } - } - catch (SQLException | ClassNotFoundException e) { - throw 
new WikiApiException(e); - } + /** + * Returns TRUE if another revision information is available. + * + * @return TRUE | FALSE + */ + public boolean hasNext() { + try { + if (result != null && result.next()) { + return true; + } + + if (this.statement != null) { + this.statement.close(); + } + if (this.result != null) { + this.result.close(); + } + + return query(); + + } catch (SQLException e) { + throw new RuntimeException(e); + } } - /** - * Queries the database for more revision information. - * - * @return TRUE if the resultset contains elements FALSE otherwise - * - * @throws SQLException - * if an error occurs while accessing the database - */ - private boolean query() - throws SQLException - { - statement = this.connection.createStatement(); - - String query = "SELECT PrimaryKey, RevisionCounter," - + " RevisionID, ArticleID, Timestamp, FullRevisionID " - + "FROM revisions"; - - if (primaryKey > 0) { - query += " WHERE PrimaryKey > " + primaryKey; - } - - if (MAX_NUMBER_RESULTS > 0) { - query += " LIMIT " + MAX_NUMBER_RESULTS; - } - - result = statement.executeQuery(query); - return result.next(); - } - - /** - * Returns the next revision information. (Does not contain the encoded - * diff) - * - * @return Revision - */ - public Revision next() - { - try { - Revision revision = new Revision(result.getInt(2)); - - this.primaryKey = result.getInt(1); - revision.setPrimaryKey(this.primaryKey); - - revision.setRevisionID(result.getInt(3)); - revision.setArticleID(result.getInt(4)); - revision.setTimeStamp(new Timestamp(result.getLong(5))); - revision.setFullRevisionID(result.getInt(6)); - - return revision; - - } - catch (Exception e) { - - e.printStackTrace(); - - throw new RuntimeException(e); - } - } - - /** - * Returns TRUE if another revision information is available. 
- * - * @return TRUE | FALSE - */ - public boolean hasNext() - { - try { - if (result != null && result.next()) { - return true; - } - - if (this.statement != null) { - this.statement.close(); - } - if (this.result != null) { - this.result.close(); - } - - return query(); - - } - catch (SQLException e) { - throw new RuntimeException(e); - } - } - - /** - * unsupported method - * - * @deprecated - * @throws UnsupportedOperationException - */ - @Deprecated - public void remove() - { - throw new UnsupportedOperationException(); - } + /** + * unsupported method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + public void remove() { + throw new UnsupportedOperationException(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/Indexer.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/Indexer.java index 9a7310ce..2960af57 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/Indexer.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/Indexer.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -38,239 +38,240 @@ /** * Forwards the necessary information to the AbstractIndex classes and controls * the writing to the output if one of the index has reached the maximum size. - * - * - * */ -public class Indexer -{ - - /** Currently used article */ - private int currentArticleID; - - /** First appearance of the current article */ - private long startTime; - - /** Last appearance of the current article */ - private long endTime; - - /** Currently used full revision */ - private int currentFullRevisionID; - - /** Previous revision */ - private Revision lastRev; - - /** Reference to the revision index */ - private RevisionIndex revisionIndex=null; - - /** Reference to the currently used article index information */ - private ArticleIndexData info; - - /** List of article index information related to the currently used article */ - private final List infoList; - - /** Reference to the article index */ - private ArticleIndex articleIndex=null; - - /** Reference to the chronological order index */ - private ChronoIndex chronoIndex=null; - - /** Reference to the output writer */ - private IndexWriterInterface indexWriter; - - /** Reference to the database connection */ - private final Connection connection = null; - - /** - * (Constructor) Creates a Index object. 
- * - * @param config - * Reference to the configuration - * - * @throws ClassNotFoundException - * if the jdbc classes could not be located - * @throws SQLException - * if an error occurred while accessing the database - * @throws IOException - * if an error occurred while writing the output - */ - public Indexer(final RevisionAPIConfiguration config) - throws ClassNotFoundException, SQLException, IOException - { - - this.currentArticleID = -1; - - switch (config.getOutputType()) { - case DATABASE: - case SQL: - //Indices with SQL statements - this.revisionIndex = new RevisionIndex(config.getMaxAllowedPacket()); - this.articleIndex = new ArticleIndex(config.getMaxAllowedPacket()); - this.chronoIndex = new ChronoIndex(config.getMaxAllowedPacket()); - break; - case DATAFILE: - //Indices without SQL statements - this.revisionIndex = new RevisionIndex(); - this.articleIndex = new ArticleIndex(); - this.chronoIndex = new ChronoIndex(); - break; - } - - this.infoList = new ArrayList<>(); - - switch (config.getOutputType()) { - case DATABASE: - this.indexWriter = new DatabaseWriter(config); - break; - case SQL: - this.indexWriter = new SQLFileWriter(config); - break; - case DATAFILE: - this.indexWriter = new DataFileWriter(config); - break; - } - } - - /** - * Checks whether the AbstractIndex classes have output available and - * forward them to the output writer. - * - * @throws IOException - * if an error occurred while writing the output - * @throws SQLException - * if an error occurred while accessing the database - */ - private void send() - throws IOException, SQLException - { - - this.indexWriter.write(articleIndex); - this.indexWriter.write(revisionIndex); - this.indexWriter.write(chronoIndex); - } - - /** - * Processes the given revision. 
- * - * @param rev - * Reference to a revision - * - * @throws WikiApiException - * if an error occurs - */ - public void index(final Revision rev) - throws WikiApiException - { - - int articleID = rev.getArticleID(); - int fullRevisionID = rev.getFullRevisionID(); - int revisionCounter = rev.getRevisionCounter(); - - if (articleID != currentArticleID) { - - if (lastRev != null) { - info.setEndRevisionCount(lastRev.getRevisionCounter()); - this.infoList.add(info); - - try { - this.articleIndex.add(currentArticleID, startTime, endTime, - infoList); - send(); - } - catch (SQLException | IOException sql) { - sql.printStackTrace(); - throw new WikiApiException(sql); - } +public class Indexer { + + /** + * Currently used article + */ + private int currentArticleID; + + /** + * First appearance of the current article + */ + private long startTime; + + /** + * Last appearance of the current article + */ + private long endTime; + + /** + * Currently used full revision + */ + private int currentFullRevisionID; + + /** + * Previous revision + */ + private Revision lastRev; + + /** + * Reference to the revision index + */ + private RevisionIndex revisionIndex = null; + + /** + * Reference to the currently used article index information + */ + private ArticleIndexData info; + + /** + * List of article index information related to the currently used article + */ + private final List infoList; + + /** + * Reference to the article index + */ + private ArticleIndex articleIndex = null; + + /** + * Reference to the chronological order index + */ + private ChronoIndex chronoIndex = null; + + /** + * Reference to the output writer + */ + private IndexWriterInterface indexWriter; + + /** + * Reference to the database connection + */ + private final Connection connection = null; + + /** + * (Constructor) Creates a Index object. 
+ * + * @param config Reference to the configuration + * @throws ClassNotFoundException if the jdbc classes could not be located + * @throws SQLException if an error occurred while accessing the database + * @throws IOException if an error occurred while writing the output + */ + public Indexer(final RevisionAPIConfiguration config) + throws ClassNotFoundException, SQLException, IOException { + + this.currentArticleID = -1; + + switch (config.getOutputType()) { + case DATABASE: + case SQL: + //Indices with SQL statements + this.revisionIndex = new RevisionIndex(config.getMaxAllowedPacket()); + this.articleIndex = new ArticleIndex(config.getMaxAllowedPacket()); + this.chronoIndex = new ChronoIndex(config.getMaxAllowedPacket()); + break; + case DATAFILE: + //Indices without SQL statements + this.revisionIndex = new RevisionIndex(); + this.articleIndex = new ArticleIndex(); + this.chronoIndex = new ChronoIndex(); + break; + } + + this.infoList = new ArrayList<>(); + + switch (config.getOutputType()) { + case DATABASE: + this.indexWriter = new DatabaseWriter(config); + break; + case SQL: + this.indexWriter = new SQLFileWriter(config); + break; + case DATAFILE: + this.indexWriter = new DataFileWriter(config); + break; + } + } + + /** + * Checks whether the AbstractIndex classes have output available and + * forward them to the output writer. + * + * @throws IOException if an error occurred while writing the output + * @throws SQLException if an error occurred while accessing the database + */ + private void send() + throws IOException, SQLException { + + this.indexWriter.write(articleIndex); + this.indexWriter.write(revisionIndex); + this.indexWriter.write(chronoIndex); + } + + /** + * Processes the given revision. 
+ * + * @param rev Reference to a revision + * @throws WikiApiException if an error occurs + */ + public void index(final Revision rev) + throws WikiApiException { + + int articleID = rev.getArticleID(); + int fullRevisionID = rev.getFullRevisionID(); + int revisionCounter = rev.getRevisionCounter(); + + if (articleID != currentArticleID) { + + if (lastRev != null) { + info.setEndRevisionCount(lastRev.getRevisionCounter()); + this.infoList.add(info); + + try { + this.articleIndex.add(currentArticleID, startTime, endTime, + infoList); + send(); + } catch (SQLException | IOException sql) { + sql.printStackTrace(); + throw new WikiApiException(sql); + } } - if (revisionCounter != 1) { - System.err.println("WARNING : ArticleID (" + articleID - + ") RevisionCounter 1 expected - " + revisionCounter - + " read"); - } + if (revisionCounter != 1) { + System.err.println("WARNING : ArticleID (" + articleID + + ") RevisionCounter 1 expected - " + revisionCounter + + " read"); + } - startTime = Long.MAX_VALUE; - endTime = Long.MIN_VALUE; + startTime = Long.MAX_VALUE; + endTime = Long.MIN_VALUE; - currentArticleID = articleID; - currentFullRevisionID = fullRevisionID; + currentArticleID = articleID; + currentFullRevisionID = fullRevisionID; - info = new ArticleIndexData(); + info = new ArticleIndexData(); - info.setFullRevisionPrimaryKey(rev.getPrimaryKey()); - info.setStartRevisionCount(rev.getRevisionCounter()); + info.setFullRevisionPrimaryKey(rev.getPrimaryKey()); + info.setStartRevisionCount(rev.getRevisionCounter()); - } - else if (fullRevisionID != currentFullRevisionID) { + } else if (fullRevisionID != currentFullRevisionID) { - if (lastRev.getRevisionCounter() + 1 != revisionCounter) { - System.err.println("WARNING : ArticleID (" + articleID + ")" - + " RevisionCounter " - + (lastRev.getRevisionCounter() + 1) + " expected - " - + revisionCounter + " read"); - } + if (lastRev.getRevisionCounter() + 1 != revisionCounter) { + System.err.println("WARNING : ArticleID (" + 
articleID + ")" + + " RevisionCounter " + + (lastRev.getRevisionCounter() + 1) + " expected - " + + revisionCounter + " read"); + } - info.setEndRevisionCount(lastRev.getRevisionCounter()); - this.infoList.add(info); + info.setEndRevisionCount(lastRev.getRevisionCounter()); + this.infoList.add(info); - currentFullRevisionID = fullRevisionID; - info = new ArticleIndexData(); + currentFullRevisionID = fullRevisionID; + info = new ArticleIndexData(); - info.setFullRevisionPrimaryKey(rev.getPrimaryKey()); - info.setStartRevisionCount(rev.getRevisionCounter()); + info.setFullRevisionPrimaryKey(rev.getPrimaryKey()); + info.setStartRevisionCount(rev.getRevisionCounter()); - } - else if (lastRev.getRevisionCounter() + 1 != revisionCounter) { + } else if (lastRev.getRevisionCounter() + 1 != revisionCounter) { - System.err.println("WARNING : ArticleID (" + articleID + ")" - + " RevisionCounter " + (lastRev.getRevisionCounter() + 1) - + " expected - " + revisionCounter + " read"); - } + System.err.println("WARNING : ArticleID (" + articleID + ")" + + " RevisionCounter " + (lastRev.getRevisionCounter() + 1) + + " expected - " + revisionCounter + " read"); + } - this.startTime = Math.min(rev.getTimeStamp().getTime(), startTime); - this.endTime = Math.max(rev.getTimeStamp().getTime(), endTime); + this.startTime = Math.min(rev.getTimeStamp().getTime(), startTime); + this.endTime = Math.max(rev.getTimeStamp().getTime(), endTime); - revisionIndex.add(rev.getRevisionID(), rev.getPrimaryKey(), - info.getFullRevisionPrimaryKey()); - chronoIndex.add(articleID, rev.getRevisionCounter(), rev.getTimeStamp() - .getTime()); - lastRev = rev; - } + revisionIndex.add(rev.getRevisionID(), rev.getPrimaryKey(), + info.getFullRevisionPrimaryKey()); + chronoIndex.add(articleID, rev.getRevisionCounter(), rev.getTimeStamp() + .getTime()); + lastRev = rev; + } - /** - * Finalizes the indices and sends the rest of the data to the output. - * Afterwards the database connection will be closed. 
- * - * @throws WikiApiException - * if an error occurs - */ - public void close() - throws WikiApiException - { + /** + * Finalizes the indices and sends the rest of the data to the output. + * Afterwards the database connection will be closed. + * + * @throws WikiApiException if an error occurs + */ + public void close() + throws WikiApiException { - try { - this.revisionIndex.finalizeIndex(); - this.chronoIndex.finalizeIndex(); + try { + this.revisionIndex.finalizeIndex(); + this.chronoIndex.finalizeIndex(); - info.setEndRevisionCount(lastRev.getRevisionCounter()); - this.infoList.add(info); + info.setEndRevisionCount(lastRev.getRevisionCounter()); + this.infoList.add(info); - this.articleIndex.add(currentArticleID, startTime, endTime, - infoList); - this.articleIndex.finalizeIndex(); + this.articleIndex.add(currentArticleID, startTime, endTime, + infoList); + this.articleIndex.finalizeIndex(); - send(); + send(); - this.indexWriter.finish(); + this.indexWriter.finish(); - if (connection != null) { - this.connection.close(); - } + if (connection != null) { + this.connection.close(); + } - } - catch (SQLException | IOException sql) { - sql.printStackTrace(); - throw new WikiApiException(sql); - } + } catch (SQLException | IOException sql) { + sql.printStackTrace(); + throw new WikiApiException(sql); + } } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/AbstractIndex.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/AbstractIndex.java index 14b36037..a18f85aa 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/AbstractIndex.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/AbstractIndex.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,119 +22,114 @@ /** * This class represents an abstact index. - * - * - * */ -public abstract class AbstractIndex -{ - - /** Current query buffer */ - protected StringBuilder buffer; - - /** List of contained queries. */ - private final List bufferList; - - /** Insert Statement to use */ - protected final String insertStatement; - - /** MAX_ALLOWED_PACKET */ - protected long MAX_ALLOWED_PACKET; - - /** - * (Constructor) Creates an index object. - */ - public AbstractIndex() - { - - this.bufferList = new ArrayList<>(); - this.buffer = null; - - //does not really matter here- should be big to speed up data file creation - this.MAX_ALLOWED_PACKET = 16760832; - - this.insertStatement = ""; - - storeBuffer(); - } - - /** - * (Constructor) Creates an index object. - * - * @param insertStatement - * Insert Statement - * @param MAX_ALLOWED_PACKET - * MAX_ALLOWED_PACKET - */ - public AbstractIndex(final String insertStatement, - final long MAX_ALLOWED_PACKET) - { - - this.bufferList = new ArrayList<>(); - this.buffer = null; - - this.MAX_ALLOWED_PACKET = MAX_ALLOWED_PACKET; - - this.insertStatement = insertStatement; - - storeBuffer(); - } - - /** - * Returns the size of the currently used buffer. 
- * - * @return size of current query - */ - public int byteSize() - { - return this.buffer.length(); - } - - /** - * Finalizes the query in the currently used buffer and creates a new one. - * The finalized query will be added to the list of queries. - */ - public void finalizeIndex() - { - storeBuffer(); - } - - /** - * Removes a query from the list of queries. - * - * @return Buffer containing a finalized query - */ - public StringBuilder remove() - { - return this.bufferList.remove(0); - } - - /** - * Returns the current number of buffered queries. - * - * @return size of the list of queries - */ - public int size() - { - return bufferList.size(); - } - - /** - * Finalizes the query in the currently used buffer and creates a new one. - * The finalized query will be added to the list of queries. - */ - protected void storeBuffer() - { - - if (buffer != null && buffer.length() > insertStatement.length()) { - if(!insertStatement.isEmpty()) { - //only do this in SQL/DATABASE MODE - this.buffer.append(";"); - } - bufferList.add(buffer); - } - - this.buffer = new StringBuilder(); - this.buffer.append(insertStatement); - } +public abstract class AbstractIndex { + + /** + * Current query buffer + */ + protected StringBuilder buffer; + + /** + * List of contained queries. + */ + private final List bufferList; + + /** + * Insert Statement to use + */ + protected final String insertStatement; + + /** + * MAX_ALLOWED_PACKET + */ + protected long MAX_ALLOWED_PACKET; + + /** + * (Constructor) Creates an index object. + */ + public AbstractIndex() { + + this.bufferList = new ArrayList<>(); + this.buffer = null; + + //does not really matter here- should be big to speed up data file creation + this.MAX_ALLOWED_PACKET = 16760832; + + this.insertStatement = ""; + + storeBuffer(); + } + + /** + * (Constructor) Creates an index object. 
+ * + * @param insertStatement Insert Statement + * @param MAX_ALLOWED_PACKET MAX_ALLOWED_PACKET + */ + public AbstractIndex(final String insertStatement, + final long MAX_ALLOWED_PACKET) { + + this.bufferList = new ArrayList<>(); + this.buffer = null; + + this.MAX_ALLOWED_PACKET = MAX_ALLOWED_PACKET; + + this.insertStatement = insertStatement; + + storeBuffer(); + } + + /** + * Returns the size of the currently used buffer. + * + * @return size of current query + */ + public int byteSize() { + return this.buffer.length(); + } + + /** + * Finalizes the query in the currently used buffer and creates a new one. + * The finalized query will be added to the list of queries. + */ + public void finalizeIndex() { + storeBuffer(); + } + + /** + * Removes a query from the list of queries. + * + * @return Buffer containing a finalized query + */ + public StringBuilder remove() { + return this.bufferList.remove(0); + } + + /** + * Returns the current number of buffered queries. + * + * @return size of the list of queries + */ + public int size() { + return bufferList.size(); + } + + /** + * Finalizes the query in the currently used buffer and creates a new one. + * The finalized query will be added to the list of queries. 
+ */ + protected void storeBuffer() { + + if (buffer != null && buffer.length() > insertStatement.length()) { + if (!insertStatement.isEmpty()) { + //only do this in SQL/DATABASE MODE + this.buffer.append(";"); + } + bufferList.add(buffer); + } + + this.buffer = new StringBuilder(); + this.buffer.append(insertStatement); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndex.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndex.java index 9e4d02cb..fc3c8d6b 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndex.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndex.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,107 +21,95 @@ /** * Index for article information. - * - * - * */ public class ArticleIndex - extends AbstractIndex -{ - /** - * (Constructor) Creates a new ArticleIndex object. - */ - public ArticleIndex() - { - - super(); - } - - /** - * (Constructor) Creates a new ArticleIndex object. 
- * - * @param MAX_ALLOWED_PACKET - * MAX_ALLOWED_PACKET - */ - public ArticleIndex(final long MAX_ALLOWED_PACKET) - { - - super("INSERT INTO index_articleID_rc_ts VALUES ", MAX_ALLOWED_PACKET); - } - - /** - * Adds the information for an new entry in the article index. - * - * @param currentArticleID - * ID of the currently used article - * @param startTime - * First date of appearance - * @param endTime - * Last date of appearance - * @param infoList - * List of revision blocks - */ - public void add(final int currentArticleID, final long startTime, - final long endTime, final List infoList) - { - - // index_articleID_rc_ts - if (!infoList.isEmpty()) { - - StringBuilder fullRevBuffer = new StringBuilder(); - StringBuilder revCountBuffer = new StringBuilder(); - - boolean first = true; - ArticleIndexData info; - while (!infoList.isEmpty()) { - - info = infoList.remove(0); - - if (!first) { - fullRevBuffer.append(" "); - revCountBuffer.append(" "); - } - - fullRevBuffer.append(info.getFullRevisionPrimaryKey()); - - revCountBuffer.append(info.getStartRevisionCount()); - revCountBuffer.append(" "); - revCountBuffer.append(info.getEndRevisionCount()); - - first = false; - } - - boolean sql = !insertStatement.isEmpty(); - if (buffer.length() + fullRevBuffer.length() - + revCountBuffer.length() + 20 >= MAX_ALLOWED_PACKET) { - storeBuffer(); - } - - - if(sql) { - if (buffer.length() > insertStatement.length()) { - buffer.append(","); - } - buffer.append("("); - } - buffer.append(currentArticleID); - buffer.append(","); - buffer.append(sql?"\'":"\""); - buffer.append(fullRevBuffer); - buffer.append(sql?"\'":"\""); - buffer.append(","); - buffer.append(sql?"\'":"\""); - buffer.append(revCountBuffer); - buffer.append(sql?"\'":"\""); - buffer.append(","); - buffer.append(startTime); - buffer.append(","); - buffer.append(endTime); - if(sql) { - buffer.append(")"); - }else{ - buffer.append("\n"); - } - } - } + extends AbstractIndex { + /** + * (Constructor) Creates a new 
ArticleIndex object. + */ + public ArticleIndex() { + + super(); + } + + /** + * (Constructor) Creates a new ArticleIndex object. + * + * @param MAX_ALLOWED_PACKET MAX_ALLOWED_PACKET + */ + public ArticleIndex(final long MAX_ALLOWED_PACKET) { + + super("INSERT INTO index_articleID_rc_ts VALUES ", MAX_ALLOWED_PACKET); + } + + /** + * Adds the information for an new entry in the article index. + * + * @param currentArticleID ID of the currently used article + * @param startTime First date of appearance + * @param endTime Last date of appearance + * @param infoList List of revision blocks + */ + public void add(final int currentArticleID, final long startTime, + final long endTime, final List infoList) { + + // index_articleID_rc_ts + if (!infoList.isEmpty()) { + + StringBuilder fullRevBuffer = new StringBuilder(); + StringBuilder revCountBuffer = new StringBuilder(); + + boolean first = true; + ArticleIndexData info; + while (!infoList.isEmpty()) { + + info = infoList.remove(0); + + if (!first) { + fullRevBuffer.append(" "); + revCountBuffer.append(" "); + } + + fullRevBuffer.append(info.getFullRevisionPrimaryKey()); + + revCountBuffer.append(info.getStartRevisionCount()); + revCountBuffer.append(" "); + revCountBuffer.append(info.getEndRevisionCount()); + + first = false; + } + + boolean sql = !insertStatement.isEmpty(); + if (buffer.length() + fullRevBuffer.length() + + revCountBuffer.length() + 20 >= MAX_ALLOWED_PACKET) { + storeBuffer(); + } + + + if (sql) { + if (buffer.length() > insertStatement.length()) { + buffer.append(","); + } + buffer.append("("); + } + buffer.append(currentArticleID); + buffer.append(","); + buffer.append(sql ? "\'" : "\""); + buffer.append(fullRevBuffer); + buffer.append(sql ? "\'" : "\""); + buffer.append(","); + buffer.append(sql ? "\'" : "\""); + buffer.append(revCountBuffer); + buffer.append(sql ? 
"\'" : "\""); + buffer.append(","); + buffer.append(startTime); + buffer.append(","); + buffer.append(endTime); + if (sql) { + buffer.append(")"); + } else { + buffer.append("\n"); + } + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndexData.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndexData.java index 503b31d7..7aaa61c3 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndexData.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndexData.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -20,106 +20,98 @@ /** * This class represents the data used by the ArticleIndex. One objects * represents one revision block. 
- * - * - * */ -public class ArticleIndexData -{ +public class ArticleIndexData { - /** Last number of a block of revisions */ - private long endRevisionCount; + /** + * Last number of a block of revisions + */ + private long endRevisionCount; - /** ID of the full revision */ - private long fullRevisionID; + /** + * ID of the full revision + */ + private long fullRevisionID; - /** PK of the full revision */ - private long fullRevisionPrimaryKey; + /** + * PK of the full revision + */ + private long fullRevisionPrimaryKey; - /** First number of a block of revisions */ - private long startRevisionCount; + /** + * First number of a block of revisions + */ + private long startRevisionCount; - /** - * Returns the last revision counter of this block. - * - * @return revision counter - */ - public long getEndRevisionCount() - { - return endRevisionCount; - } + /** + * Returns the last revision counter of this block. + * + * @return revision counter + */ + public long getEndRevisionCount() { + return endRevisionCount; + } - /** - * Returns the ID of the full revision. - * - * @return ID of the full revision - */ - public long getFullRevisionID() - { - return fullRevisionID; - } + /** + * Returns the ID of the full revision. + * + * @return ID of the full revision + */ + public long getFullRevisionID() { + return fullRevisionID; + } - /** - * Returns the PK of the full revision. - * - * @return PK of the full revision - */ - public long getFullRevisionPrimaryKey() - { - return fullRevisionPrimaryKey; - } + /** + * Returns the PK of the full revision. + * + * @return PK of the full revision + */ + public long getFullRevisionPrimaryKey() { + return fullRevisionPrimaryKey; + } - /** - * Returns the first revision counter of this block. - * - * @return revision counter - */ - public long getStartRevisionCount() - { - return startRevisionCount; - } + /** + * Returns the first revision counter of this block. 
+ * + * @return revision counter + */ + public long getStartRevisionCount() { + return startRevisionCount; + } - /** - * Sets the last revision counter of this block. - * - * @param endRevisionCount - * revision counter - */ - public void setEndRevisionCount(final long endRevisionCount) - { - this.endRevisionCount = endRevisionCount; - } + /** + * Sets the last revision counter of this block. + * + * @param endRevisionCount revision counter + */ + public void setEndRevisionCount(final long endRevisionCount) { + this.endRevisionCount = endRevisionCount; + } - /** - * Sets the ID of the full revision. - * - * @param fullRevisionID - * ID of the full revision - */ - public void setFullRevisionID(final long fullRevisionID) - { - this.fullRevisionID = fullRevisionID; - } + /** + * Sets the ID of the full revision. + * + * @param fullRevisionID ID of the full revision + */ + public void setFullRevisionID(final long fullRevisionID) { + this.fullRevisionID = fullRevisionID; + } - /** - * Sets the PK of the full revision. - * - * @param fullRevisionPrimaryKey - * PK of the full revision - */ - public void setFullRevisionPrimaryKey(final long fullRevisionPrimaryKey) - { - this.fullRevisionPrimaryKey = fullRevisionPrimaryKey; - } + /** + * Sets the PK of the full revision. + * + * @param fullRevisionPrimaryKey PK of the full revision + */ + public void setFullRevisionPrimaryKey(final long fullRevisionPrimaryKey) { + this.fullRevisionPrimaryKey = fullRevisionPrimaryKey; + } - /** - * Sets the first revision counter of this block. - * - * @param startRevisionCount - * revision counter - */ - public void setStartRevisionCount(final long startRevisionCount) - { - this.startRevisionCount = startRevisionCount; - } + /** + * Sets the first revision counter of this block. 
+ * + * @param startRevisionCount revision counter + */ + public void setStartRevisionCount(final long startRevisionCount) { + this.startRevisionCount = startRevisionCount; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndex.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndex.java index 93c36374..66142097 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndex.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndex.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,160 +23,151 @@ /** * Index for the correct chonological order of revisions. - * - * - * */ public class ChronoIndex - extends AbstractIndex -{ + extends AbstractIndex { + + /** + * ID of the last procesed article + */ + private int articleID; + + /** + * List of ChonoInfo's + */ + private List list; + + + /** + * (Constructor) Creates a new ChronoIndex object. + */ + public ChronoIndex() { + + super(); + + this.list = null; + } + + /** + * (Constructor) Creates a new ChronoIndex object. 
+ * + * @param MAX_ALLOWED_PACKET MAX_ALLOWED_PACKET + */ + public ChronoIndex(final long MAX_ALLOWED_PACKET) { + + super("INSERT INTO index_chronological VALUES ", MAX_ALLOWED_PACKET); + + this.list = null; + } + + /** + * Adds the information for an new entry in the chrono index. + * + * @param articleID ID of the article + * @param revisionCounter Revision counter + * @param timestamp Timestamp + */ + public void add(final int articleID, final int revisionCounter, + final long timestamp) { + + if (this.articleID != articleID) { + + if (list != null) { + addToBuffer(); + } + + this.articleID = articleID; + this.list = new ArrayList<>(); + } + + this.list.add(new ChronoIndexData(timestamp, revisionCounter)); + } + + /** + * Creates the mapping and the reverse mapping. The generated information + * will be added to the query buffer. This list will be cleared afterwards. + */ + private void addToBuffer() { + + if (list != null && !list.isEmpty()) { + + ChronoIndexData info; + + // Real index in revision history mapped to RevisionCounter + // Sorted by real index (time) in ascending order + Collections.sort(list); + + StringBuilder reverseMapping = new StringBuilder(); + + int size = list.size(); + for (int i = 1; i <= size; i++) { + + info = list.get(i - 1); + if (info.getRevisionCounter() != i) { + + if (reverseMapping.length() > 0) { + reverseMapping.append(" "); + } + + reverseMapping.append(i); + reverseMapping.append(" "); + reverseMapping.append(info.getRevisionCounter()); + } + + info.setIndex(i); + info.setSortFlag(false); + } + + // RevisionCounter mapped to real index in revision history + // Sorted by revisionCounters in ascending order + Collections.sort(list); + StringBuilder mapping = new StringBuilder(); - /** ID of the last procesed article */ - private int articleID; + while (!list.isEmpty()) { - /** List of ChonoInfo's */ - private List list; + info = list.remove(0); + if (info.getRevisionCounter() != info.getIndex()) { + if (mapping.length() > 0) 
{ + mapping.append(" "); + } - /** - * (Constructor) Creates a new ChronoIndex object. - */ - public ChronoIndex() - { + mapping.append(info.getRevisionCounter()); + mapping.append(" "); + mapping.append(info.getIndex()); + } + } - super(); + if (mapping.length() > 0) { - this.list = null; - } + boolean sql = !insertStatement.isEmpty(); + String val = (sql ? "(" : "") + articleID + (sql ? ",'" : ",\"") + mapping + + (sql ? "','" : "\",\"") + reverseMapping + (sql ? "')" : "\""); - /** - * (Constructor) Creates a new ChronoIndex object. - * - * @param MAX_ALLOWED_PACKET - * MAX_ALLOWED_PACKET - */ - public ChronoIndex(final long MAX_ALLOWED_PACKET) - { + if (buffer.length() + val.length() >= MAX_ALLOWED_PACKET) { + storeBuffer(); + } - super("INSERT INTO index_chronological VALUES ", MAX_ALLOWED_PACKET); + if (sql && buffer.length() > insertStatement.length()) { + buffer.append(","); + } - this.list = null; - } + buffer.append(val); - /** - * Adds the information for an new entry in the chrono index. - * - * @param articleID - * ID of the article - * @param revisionCounter - * Revision counter - * @param timestamp - * Timestamp - */ - public void add(final int articleID, final int revisionCounter, - final long timestamp) - { - - if (this.articleID != articleID) { - - if (list != null) { - addToBuffer(); - } - - this.articleID = articleID; - this.list = new ArrayList<>(); - } - - this.list.add(new ChronoIndexData(timestamp, revisionCounter)); - } - - /** - * Creates the mapping and the reverse mapping. The generated information - * will be added to the query buffer. This list will be cleared afterwards. 
- */ - private void addToBuffer() - { - - if (list != null && !list.isEmpty()) { - - ChronoIndexData info; - - // Real index in revision history mapped to RevisionCounter - // Sorted by real index (time) in ascending order - Collections.sort(list); - - StringBuilder reverseMapping = new StringBuilder(); - - int size = list.size(); - for (int i = 1; i <= size; i++) { - - info = list.get(i - 1); - if (info.getRevisionCounter() != i) { - - if (reverseMapping.length() > 0) { - reverseMapping.append(" "); - } - - reverseMapping.append(i); - reverseMapping.append(" "); - reverseMapping.append(info.getRevisionCounter()); - } - - info.setIndex(i); - info.setSortFlag(false); - } - - // RevisionCounter mapped to real index in revision history - // Sorted by revisionCounters in ascending order - Collections.sort(list); - StringBuilder mapping = new StringBuilder(); - - while (!list.isEmpty()) { - - info = list.remove(0); - if (info.getRevisionCounter() != info.getIndex()) { - - if (mapping.length() > 0) { - mapping.append(" "); - } - - mapping.append(info.getRevisionCounter()); - mapping.append(" "); - mapping.append(info.getIndex()); - } - } - - if (mapping.length() > 0) { - - boolean sql = !insertStatement.isEmpty(); - String val = (sql?"(":"") + articleID + (sql?",'":",\"") + mapping - + (sql?"','":"\",\"") + reverseMapping +(sql?"')":"\""); - - if (buffer.length() + val.length() >= MAX_ALLOWED_PACKET) { - storeBuffer(); - } - - if (sql&&buffer.length() > insertStatement.length()) { - buffer.append(","); - } - - buffer.append(val); - - if(!sql){ - buffer.append("\n"); - } - } - } - } - - /** - * Finalizes the query in the currently used buffer and creates a new one. - * The finalized query will be added to the list of queries. - */ - @Override - public void finalizeIndex() - { - addToBuffer(); - storeBuffer(); - } + if (!sql) { + buffer.append("\n"); + } + } + } + } + + /** + * Finalizes the query in the currently used buffer and creates a new one. 
+ * The finalized query will be added to the list of queries. + */ + @Override + public void finalizeIndex() { + addToBuffer(); + storeBuffer(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndexData.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndexData.java index 0fc7bb87..2678ef4e 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndexData.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndexData.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,144 +19,130 @@ /** * This class represents the data used by the ChronoIndex. - * - * - * - * */ public class ChronoIndexData - implements Comparable -{ - - /** - * Flag - whether the data should be sorted choronlogical or in order of the - * revision counter - */ - private boolean chronoSort; - - /** Index value (Chronological order position) */ - private int index; - - /** Revision counter */ - private final int revisionCounter; - - /** Timestamp value */ - private final long time; - - /** - * (Constructor) Creates a new ChronoInfo object. 
- * - * @param time - * Timestamp value - * @param revisionCounter - * RevisionCounter - */ - public ChronoIndexData(final long time, final int revisionCounter) - { - this.time = time; - this.revisionCounter = revisionCounter; - this.chronoSort = true; - } - - /** - * Compares this ChronoInfo to the given info. - * - * @return a negative integer, zero, or a positive integer as this object is - * less than, equal to, or greater than the specified object. - */ - public int compareTo(final ChronoIndexData info) - { - - long value; - - if (chronoSort) { - value = this.time - info.time; - } - else { - value = this.revisionCounter - info.revisionCounter; - } - - if (value == 0) { - return 0; - } - else if (value > 0) { - return 1; - } - else { - return -1; - } - } - - /* - * (non-Javadoc) - * - * @see java.lang.Object#equals(java.lang.Object) - */ - @Override - public boolean equals(Object obj) - { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - return (this != (ChronoIndexData) obj) ? false : true; - } - - /** - * Returns the index value. - * - * @return index value - */ - public int getIndex() - { - return this.index; - } - - /** - * Returns the revision counter. - * - * @return revision counter - */ - public int getRevisionCounter() - { - return revisionCounter; - } - - /** - * Returns the timestamp value. - * - * @return timestamp value - */ - public long getTime() - { - return time; - } - - /** - * Sets the index value. - * - * @param index - * index value - */ - public void setIndex(final int index) - { - this.index = index; - } - - /** - * Sets the sort flag. 
- * - * @param chronoSort - * TRUE for chronological sorting, FALSE for revision counter - * sorting - */ - public void setSortFlag(final boolean chronoSort) - { - this.chronoSort = chronoSort; - } + implements Comparable { + + /** + * Flag - whether the data should be sorted choronlogical or in order of the + * revision counter + */ + private boolean chronoSort; + + /** + * Index value (Chronological order position) + */ + private int index; + + /** + * Revision counter + */ + private final int revisionCounter; + + /** + * Timestamp value + */ + private final long time; + + /** + * (Constructor) Creates a new ChronoInfo object. + * + * @param time Timestamp value + * @param revisionCounter RevisionCounter + */ + public ChronoIndexData(final long time, final int revisionCounter) { + this.time = time; + this.revisionCounter = revisionCounter; + this.chronoSort = true; + } + + /** + * Compares this ChronoInfo to the given info. + * + * @return a negative integer, zero, or a positive integer as this object is + * less than, equal to, or greater than the specified object. + */ + public int compareTo(final ChronoIndexData info) { + + long value; + + if (chronoSort) { + value = this.time - info.time; + } else { + value = this.revisionCounter - info.revisionCounter; + } + + if (value == 0) { + return 0; + } else if (value > 0) { + return 1; + } else { + return -1; + } + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + return (this != (ChronoIndexData) obj) ? false : true; + } + + /** + * Returns the index value. + * + * @return index value + */ + public int getIndex() { + return this.index; + } + + /** + * Returns the revision counter. 
+ * + * @return revision counter + */ + public int getRevisionCounter() { + return revisionCounter; + } + + /** + * Returns the timestamp value. + * + * @return timestamp value + */ + public long getTime() { + return time; + } + + /** + * Sets the index value. + * + * @param index index value + */ + public void setIndex(final int index) { + this.index = index; + } + + /** + * Sets the sort flag. + * + * @param chronoSort TRUE for chronological sorting, FALSE for revision counter + * sorting + */ + public void setSortFlag(final boolean chronoSort) { + this.chronoSort = chronoSort; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/RevisionIndex.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/RevisionIndex.java index a5306aca..69514315 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/RevisionIndex.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/RevisionIndex.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,64 +19,52 @@ /** * Index for revision information. 
- * - * - * */ public class RevisionIndex - extends AbstractIndex -{ + extends AbstractIndex { - /** - * (Constructor) Creates a new RevisionIndex object. - * - */ - public RevisionIndex() - { + /** + * (Constructor) Creates a new RevisionIndex object. + */ + public RevisionIndex() { - super(); - } + super(); + } - /** - * (Constructor) Creates a new RevisionIndex object. - * - * @param MAX_ALLOWED_PACKET - * MAX_ALLOWED_PACKET - */ - public RevisionIndex(final long MAX_ALLOWED_PACKET) - { + /** + * (Constructor) Creates a new RevisionIndex object. + * + * @param MAX_ALLOWED_PACKET MAX_ALLOWED_PACKET + */ + public RevisionIndex(final long MAX_ALLOWED_PACKET) { - super("INSERT INTO index_revisionID VALUES ", MAX_ALLOWED_PACKET); - } + super("INSERT INTO index_revisionID VALUES ", MAX_ALLOWED_PACKET); + } - /** - * Adds the information for an new entry in the revision index. - * - * @param revisionID - * ID of the revision - * @param revisionPrimaryKey - * PK of the revison - * @param fullRevisionPrimaryKey - * PK of the related full revison - */ - public void add(final int revisionID, final long revisionPrimaryKey, - final long fullRevisionPrimaryKey) - { + /** + * Adds the information for an new entry in the revision index. + * + * @param revisionID ID of the revision + * @param revisionPrimaryKey PK of the revison + * @param fullRevisionPrimaryKey PK of the related full revison + */ + public void add(final int revisionID, final long revisionPrimaryKey, + final long fullRevisionPrimaryKey) { - boolean sql = !insertStatement.isEmpty(); - if (sql&&buffer.length() != insertStatement.length()) { - this.buffer.append(","); - } + boolean sql = !insertStatement.isEmpty(); + if (sql && buffer.length() != insertStatement.length()) { + this.buffer.append(","); + } - this.buffer.append((sql?"(":"") + revisionID + "," + revisionPrimaryKey + "," - + fullRevisionPrimaryKey + (sql?")":"")); + this.buffer.append((sql ? 
"(" : "") + revisionID + "," + revisionPrimaryKey + "," + + fullRevisionPrimaryKey + (sql ? ")" : "")); - if(!sql){ - buffer.append("\n"); - } + if (!sql) { + buffer.append("\n"); + } - if (buffer.length() + 100 >= MAX_ALLOWED_PACKET) { - storeBuffer(); - } - } + if (buffer.length() + 100 >= MAX_ALLOWED_PACKET) { + storeBuffer(); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DataFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DataFileWriter.java index 6a81abde..61ae7b4d 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DataFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DataFileWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,35 +31,30 @@ /** * This class writes the output of the index generator to an sql file. - * - * - * */ public class DataFileWriter - implements IndexWriterInterface -{ - - /** Reference to the Writer object */ - private final Writer chronoIdxWriter; - private final Writer revisionIdxWriter; - private final Writer articleIdxWriter; - - /** - * (Constructor) Creates a new SQLFileWriter. 
- * - * @param config - * Reference to the configuration paramters - * @throws IOException - * if an error occurred while writing the file - */ - public DataFileWriter(final RevisionAPIConfiguration config) - throws IOException - { - - File path=new File(config.getOutputPath()); - chronoIdxWriter = new BufferedWriter(new FileWriter(new File(path,"chronoIndex.csv"))); - revisionIdxWriter = new BufferedWriter(new FileWriter(new File(path,"revisionIndex.csv"))); - articleIdxWriter = new BufferedWriter(new FileWriter(new File(path,"articleIndex.csv"))); + implements IndexWriterInterface { + + /** + * Reference to the Writer object + */ + private final Writer chronoIdxWriter; + private final Writer revisionIdxWriter; + private final Writer articleIdxWriter; + + /** + * (Constructor) Creates a new SQLFileWriter. + * + * @param config Reference to the configuration paramters + * @throws IOException if an error occurred while writing the file + */ + public DataFileWriter(final RevisionAPIConfiguration config) + throws IOException { + + File path = new File(config.getOutputPath()); + chronoIdxWriter = new BufferedWriter(new FileWriter(new File(path, "chronoIndex.csv"))); + revisionIdxWriter = new BufferedWriter(new FileWriter(new File(path, "revisionIndex.csv"))); + articleIdxWriter = new BufferedWriter(new FileWriter(new File(path, "articleIndex.csv"))); // writer.write("CREATE TABLE index_articleID_rc_ts (" // + "ArticleID INTEGER UNSIGNED NOT NULL, " @@ -88,71 +83,65 @@ public DataFileWriter(final RevisionAPIConfiguration config) // writer.write("ALTER TABLE index_chronological DISABLE KEYS;\r\n"); // // writer.flush(); - } - - /** - * Writes the buffered finalzed queries to the output. 
- * - * @param index - * Reference to an index - * @throws IOException - * if an error occurred while writing the output - */ - public void write(final AbstractIndex index) - throws IOException - { - - StringBuilder cmd; - - while (index.size() > 0) { - - System.out.println("Transmit Index [" + index + "]"); - - cmd = index.remove(); - - if(index instanceof ArticleIndex){ - articleIdxWriter.write(cmd.toString()); - }else if(index instanceof ChronoIndex){ - chronoIdxWriter.write(cmd.toString()); - }else if(index instanceof RevisionIndex){ - revisionIdxWriter.write(cmd.toString()); - } - - } - - if(index instanceof ArticleIndex){ - articleIdxWriter.flush(); - }else if(index instanceof ChronoIndex){ - chronoIdxWriter.flush(); - }else if(index instanceof RevisionIndex){ - revisionIdxWriter.flush(); - } - } - - /** - * Closes the file or the database connection. - * - * @throws IOException - * if an error occurred while closing the file - */ - public void close() - throws IOException - { - articleIdxWriter.close(); - chronoIdxWriter.close(); - revisionIdxWriter.close(); - } - - /** - * Wraps up the index generation process and writes all remaining statements - * e.g. concerning UNCOMPRESSED-Indexes on the created tables. - * - * @throws IOException - * if an error occurred while writing to the file - */ - public void finish() throws IOException{ - articleIdxWriter.flush(); - chronoIdxWriter.flush(); - revisionIdxWriter.flush(); - } + } + + /** + * Writes the buffered finalzed queries to the output. 
+ * + * @param index Reference to an index + * @throws IOException if an error occurred while writing the output + */ + public void write(final AbstractIndex index) + throws IOException { + + StringBuilder cmd; + + while (index.size() > 0) { + + System.out.println("Transmit Index [" + index + "]"); + + cmd = index.remove(); + + if (index instanceof ArticleIndex) { + articleIdxWriter.write(cmd.toString()); + } else if (index instanceof ChronoIndex) { + chronoIdxWriter.write(cmd.toString()); + } else if (index instanceof RevisionIndex) { + revisionIdxWriter.write(cmd.toString()); + } + + } + + if (index instanceof ArticleIndex) { + articleIdxWriter.flush(); + } else if (index instanceof ChronoIndex) { + chronoIdxWriter.flush(); + } else if (index instanceof RevisionIndex) { + revisionIdxWriter.flush(); + } + } + + /** + * Closes the file or the database connection. + * + * @throws IOException if an error occurred while closing the file + */ + public void close() + throws IOException { + articleIdxWriter.close(); + chronoIdxWriter.close(); + revisionIdxWriter.close(); + } + + /** + * Wraps up the index generation process and writes all remaining statements + * e.g. concerning UNCOMPRESSED-Indexes on the created tables. 
+ * + * @throws IOException if an error occurred while writing to the file + */ + public void finish() throws IOException { + articleIdxWriter.flush(); + chronoIdxWriter.flush(); + revisionIdxWriter.flush(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DatabaseWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DatabaseWriter.java index cd7ee765..b944f784 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DatabaseWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DatabaseWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,140 +27,126 @@ /** * This class writes the output of the index generator to a database. - * - * - * */ public class DatabaseWriter - implements IndexWriterInterface -{ - - /** Reference to the database connection */ - private final Connection connection; - - /** - * (Constructor) Creates a new DatabaseWriter. 
- * - * @param config - * Reference to the configuration paramters - * - * @throws ClassNotFoundException - * if the JDBC Driver could not be located - * - * @throws SQLException - * if an error occurred while creating the index tables - */ - public DatabaseWriter(final RevisionAPIConfiguration config) - throws ClassNotFoundException, SQLException - { - - String driverDB = "com.mysql.jdbc.Driver"; - Class.forName(driverDB); - - this.connection = DriverManager - .getConnection("jdbc:mysql://" + config.getHost() + "/" - + config.getDatabase(), config.getUser(), - config.getPassword()); - - Statement statement = connection.createStatement(); - statement.execute("CREATE TABLE index_articleID_rc_ts (" - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "FullRevisionPKs MEDIUMTEXT NOT NULL, " - + "RevisionCounter MEDIUMTEXT NOT NULL, " - + "FirstAppearance BIGINT NOT NULL, " - + "LastAppearance BIGINT NOT NULL, " - + "PRIMARY KEY(ArticleID));"); - statement.close(); - - statement = connection.createStatement(); - statement.execute("CREATE TABLE index_revisionID (" - + "RevisionID INTEGER UNSIGNED NOT NULL, " - + "RevisionPK INTEGER UNSIGNED NOT NULL, " - + "FullRevisionPK INTEGER UNSIGNED NOT NULL, " - + "PRIMARY KEY(RevisionID));"); - statement.close(); - - statement = connection.createStatement(); - statement.execute("CREATE TABLE index_chronological (" - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "Mapping MEDIUMTEXT NOT NULL, " - + "ReverseMapping MEDIUMTEXT NOT NULL, " - + "PRIMARY KEY(ArticleID));"); - statement.close(); - - //disable keys now - reenable after inserts - - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_articleID_rc_ts DISABLE KEYS;"); - statement.close(); - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_revisionID DISABLE KEYS;"); - statement.close(); - - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_chronological DISABLE KEYS;"); - 
statement.close(); - } - - /** - * Writes the buffered finalzed queries to the output. - * - * @param index - * Reference to an index - * @throws SQLException - * if an error occurred while transmitting the output - */ - public void write(final AbstractIndex index) - throws SQLException - { - - Statement statement; - StringBuilder cmd; - - while (index.size() > 0) { - - System.out.println("Transmit Index [" + index + "]"); - - cmd = index.remove(); - // System.out.println(cmd.toString()); - - statement = connection.createStatement(); - statement.execute(cmd.toString()); - statement.close(); - } - } - - /** - * Wraps up the index generation process and writes all remaining statements - * e.g. concerning UNCOMPRESSED-Indexes on the created tables. - * - * @throws SQLException - * if an error occurred while accessing the database - */ - public void finish() throws SQLException{ - Statement statement = connection.createStatement(); - statement.execute("CREATE INDEX articleIdx on revisions(ArticleID);"); - statement.close(); - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_articleID_rc_ts ENABLE KEYS;"); - statement.close(); - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_revisionID ENABLE KEYS;"); - statement.close(); - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_chronological ENABLE KEYS;"); - statement.close(); - } - - /** - * Closes the file or the database connection. - * - * @throws SQLException - * if an error occurred while closing the database connection - */ - public void close() - throws SQLException - { - this.connection.close(); - } + implements IndexWriterInterface { + + /** + * Reference to the database connection + */ + private final Connection connection; + + /** + * (Constructor) Creates a new DatabaseWriter. 
+ * + * @param config Reference to the configuration paramters + * @throws ClassNotFoundException if the JDBC Driver could not be located + * @throws SQLException if an error occurred while creating the index tables + */ + public DatabaseWriter(final RevisionAPIConfiguration config) + throws ClassNotFoundException, SQLException { + + String driverDB = "com.mysql.jdbc.Driver"; + Class.forName(driverDB); + + this.connection = DriverManager + .getConnection("jdbc:mysql://" + config.getHost() + "/" + + config.getDatabase(), config.getUser(), + config.getPassword()); + + Statement statement = connection.createStatement(); + statement.execute("CREATE TABLE index_articleID_rc_ts (" + + "ArticleID INTEGER UNSIGNED NOT NULL, " + + "FullRevisionPKs MEDIUMTEXT NOT NULL, " + + "RevisionCounter MEDIUMTEXT NOT NULL, " + + "FirstAppearance BIGINT NOT NULL, " + + "LastAppearance BIGINT NOT NULL, " + + "PRIMARY KEY(ArticleID));"); + statement.close(); + + statement = connection.createStatement(); + statement.execute("CREATE TABLE index_revisionID (" + + "RevisionID INTEGER UNSIGNED NOT NULL, " + + "RevisionPK INTEGER UNSIGNED NOT NULL, " + + "FullRevisionPK INTEGER UNSIGNED NOT NULL, " + + "PRIMARY KEY(RevisionID));"); + statement.close(); + + statement = connection.createStatement(); + statement.execute("CREATE TABLE index_chronological (" + + "ArticleID INTEGER UNSIGNED NOT NULL, " + + "Mapping MEDIUMTEXT NOT NULL, " + + "ReverseMapping MEDIUMTEXT NOT NULL, " + + "PRIMARY KEY(ArticleID));"); + statement.close(); + + //disable keys now - reenable after inserts + + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_articleID_rc_ts DISABLE KEYS;"); + statement.close(); + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_revisionID DISABLE KEYS;"); + statement.close(); + + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_chronological DISABLE KEYS;"); + statement.close(); + } + + /** + * 
Writes the buffered finalzed queries to the output. + * + * @param index Reference to an index + * @throws SQLException if an error occurred while transmitting the output + */ + public void write(final AbstractIndex index) + throws SQLException { + + Statement statement; + StringBuilder cmd; + + while (index.size() > 0) { + + System.out.println("Transmit Index [" + index + "]"); + + cmd = index.remove(); + // System.out.println(cmd.toString()); + + statement = connection.createStatement(); + statement.execute(cmd.toString()); + statement.close(); + } + } + + /** + * Wraps up the index generation process and writes all remaining statements + * e.g. concerning UNCOMPRESSED-Indexes on the created tables. + * + * @throws SQLException if an error occurred while accessing the database + */ + public void finish() throws SQLException { + Statement statement = connection.createStatement(); + statement.execute("CREATE INDEX articleIdx on revisions(ArticleID);"); + statement.close(); + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_articleID_rc_ts ENABLE KEYS;"); + statement.close(); + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_revisionID ENABLE KEYS;"); + statement.close(); + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_chronological ENABLE KEYS;"); + statement.close(); + } + + /** + * Closes the file or the database connection. 
+ * + * @throws SQLException if an error occurred while closing the database connection + */ + public void close() + throws SQLException { + this.connection.close(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/IndexWriterInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/IndexWriterInterface.java index d12fecab..8ce53e45 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/IndexWriterInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/IndexWriterInterface.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -24,45 +24,34 @@ /** * Interface for the IndexWriter - * - * - * */ -public interface IndexWriterInterface -{ +public interface IndexWriterInterface { - /** - * Writes the buffered finalzed queries to the output. 
- * - * @param index - * Reference to an index - * @throws IOException - * if an error occurred while writing the output - * @throws SQLException - * if an error occurred while transmitting the output - */ - void write(final AbstractIndex index) - throws IOException, SQLException; + /** + * Writes the buffered finalzed queries to the output. + * + * @param index Reference to an index + * @throws IOException if an error occurred while writing the output + * @throws SQLException if an error occurred while transmitting the output + */ + void write(final AbstractIndex index) + throws IOException, SQLException; - /** - * Closes the file or the database connection. - * - * @throws IOException - * if an error occurred while closing the file - * @throws SQLException - * if an error occurred while closing the database connection - */ - void close() - throws IOException, SQLException; + /** + * Closes the file or the database connection. + * + * @throws IOException if an error occurred while closing the file + * @throws SQLException if an error occurred while closing the database connection + */ + void close() + throws IOException, SQLException; - /** - * Wraps up the index generation process and writes all remaining statements - * e.g. concerning UNCOMPRESSED-Indexes on the created tables. - * - * @throws SQLException - * if an error occurred while accessing the database - * @throws IOException - * if an error occurred while accessing the sql file - */ + /** + * Wraps up the index generation process and writes all remaining statements + * e.g. concerning UNCOMPRESSED-Indexes on the created tables. 
+ * + * @throws SQLException if an error occurred while accessing the database + * @throws IOException if an error occurred while accessing the sql file + */ void finish() throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/SQLFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/SQLFileWriter.java index e64728ae..b0c646c9 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/SQLFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/SQLFileWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,114 +28,103 @@ /** * This class writes the output of the index generator to an sql file. - * - * - * */ public class SQLFileWriter - implements IndexWriterInterface -{ - - /** Reference to the Writer object */ - private final Writer writer; - - /** - * (Constructor) Creates a new SQLFileWriter. 
- * - * @param config - * Reference to the configuration paramters - * @throws IOException - * if an error occurred while writing the file - */ - public SQLFileWriter(final RevisionAPIConfiguration config) - throws IOException - { - - writer = new BufferedWriter(new FileWriter(new File(config.getOutputPath(),"revisionIndex.sql"))); - - writer.write("CREATE TABLE index_articleID_rc_ts (" - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "FullRevisionPKs MEDIUMTEXT NOT NULL, " - + "RevisionCounter MEDIUMTEXT NOT NULL, " - + "FirstAppearance BIGINT NOT NULL, " - + "LastAppearance BIGINT NOT NULL, " - + "PRIMARY KEY(ArticleID));"); - - writer.write("CREATE TABLE index_revisionID (" - + "RevisionID INTEGER UNSIGNED NOT NULL, " - + "RevisionPK INTEGER UNSIGNED NOT NULL, " - + "FullRevisionPK INTEGER UNSIGNED NOT NULL, " - + "PRIMARY KEY(RevisionID));"); - - writer.write("CREATE TABLE index_chronological (" - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "Mapping MEDIUMTEXT NOT NULL, " - + "ReverseMapping MEDIUMTEXT NOT NULL, " - + "PRIMARY KEY(ArticleID));"); - writer.write("\r\n"); - - //disable keys now - reenable at the end of the sql file - writer.write("ALTER TABLE index_articleID_rc_ts DISABLE KEYS;\r\n"); - writer.write("ALTER TABLE index_revisionID DISABLE KEYS;\r\n"); - writer.write("ALTER TABLE index_chronological DISABLE KEYS;\r\n"); - - writer.flush(); - } - - /** - * Writes the buffered finalzed queries to the output. - * - * @param index - * Reference to an index - * @throws IOException - * if an error occurred while writing the output - */ - public void write(final AbstractIndex index) - throws IOException - { - - StringBuilder cmd; - - while (index.size() > 0) { - - System.out.println("Transmit Index [" + index + "]"); - - cmd = index.remove(); - // System.out.println(cmd.toString()); - - cmd.append("\r\n"); - writer.write(cmd.toString()); - } - - writer.flush(); - } - - /** - * Closes the file or the database connection. 
- * - * @throws IOException - * if an error occurred while closing the file - */ - public void close() - throws IOException - { - this.writer.close(); - } - - /** - * Wraps up the index generation process and writes all remaining statements - * e.g. concerning UNCOMPRESSED-Indexes on the created tables. - * - * @throws IOException - * if an error occurred while writing to the file - */ - public void finish() throws IOException{ - - writer.write("CREATE INDEX articleIdx ON revisions(ArticleID);\r\n"); - writer.write("ALTER TABLE index_articleID_rc_ts ENABLE KEYS;\r\n"); - writer.write("ALTER TABLE index_revisionID ENABLE KEYS;\r\n"); - writer.write("ALTER TABLE index_chronological ENABLE KEYS;\r\n"); - writer.flush(); - - } + implements IndexWriterInterface { + + /** + * Reference to the Writer object + */ + private final Writer writer; + + /** + * (Constructor) Creates a new SQLFileWriter. + * + * @param config Reference to the configuration paramters + * @throws IOException if an error occurred while writing the file + */ + public SQLFileWriter(final RevisionAPIConfiguration config) + throws IOException { + + writer = new BufferedWriter(new FileWriter(new File(config.getOutputPath(), "revisionIndex.sql"))); + + writer.write("CREATE TABLE index_articleID_rc_ts (" + + "ArticleID INTEGER UNSIGNED NOT NULL, " + + "FullRevisionPKs MEDIUMTEXT NOT NULL, " + + "RevisionCounter MEDIUMTEXT NOT NULL, " + + "FirstAppearance BIGINT NOT NULL, " + + "LastAppearance BIGINT NOT NULL, " + + "PRIMARY KEY(ArticleID));"); + + writer.write("CREATE TABLE index_revisionID (" + + "RevisionID INTEGER UNSIGNED NOT NULL, " + + "RevisionPK INTEGER UNSIGNED NOT NULL, " + + "FullRevisionPK INTEGER UNSIGNED NOT NULL, " + + "PRIMARY KEY(RevisionID));"); + + writer.write("CREATE TABLE index_chronological (" + + "ArticleID INTEGER UNSIGNED NOT NULL, " + + "Mapping MEDIUMTEXT NOT NULL, " + + "ReverseMapping MEDIUMTEXT NOT NULL, " + + "PRIMARY KEY(ArticleID));"); + writer.write("\r\n"); + + //disable 
keys now - reenable at the end of the sql file + writer.write("ALTER TABLE index_articleID_rc_ts DISABLE KEYS;\r\n"); + writer.write("ALTER TABLE index_revisionID DISABLE KEYS;\r\n"); + writer.write("ALTER TABLE index_chronological DISABLE KEYS;\r\n"); + + writer.flush(); + } + + /** + * Writes the buffered finalzed queries to the output. + * + * @param index Reference to an index + * @throws IOException if an error occurred while writing the output + */ + public void write(final AbstractIndex index) + throws IOException { + + StringBuilder cmd; + + while (index.size() > 0) { + + System.out.println("Transmit Index [" + index + "]"); + + cmd = index.remove(); + // System.out.println(cmd.toString()); + + cmd.append("\r\n"); + writer.write(cmd.toString()); + } + + writer.flush(); + } + + /** + * Closes the file or the database connection. + * + * @throws IOException if an error occurred while closing the file + */ + public void close() + throws IOException { + this.writer.close(); + } + + /** + * Wraps up the index generation process and writes all remaining statements + * e.g. concerning UNCOMPRESSED-Indexes on the created tables. 
+ * + * @throws IOException if an error occurred while writing to the file + */ + public void finish() throws IOException { + + writer.write("CREATE INDEX articleIdx ON revisions(ArticleID);\r\n"); + writer.write("ALTER TABLE index_articleID_rc_ts ENABLE KEYS;\r\n"); + writer.write("ALTER TABLE index_revisionID ENABLE KEYS;\r\n"); + writer.write("ALTER TABLE index_chronological ENABLE KEYS;\r\n"); + writer.flush(); + + } } diff --git a/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script b/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script index de52e2d4..4b1d0f6a 100644 --- a/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script +++ b/dkpro-jwpl-revisionmachine/src/test/resources/db/wikiapi_simple_20090119_stripped.script @@ -29,7 +29,7 @@ SET FILES NIO TRUE SET FILES NIO SIZE 256 SET FILES LOG TRUE SET FILES LOG SIZE 200 -SET FILES CHECK 2544 +SET FILES CHECK 2802 SET DATABASE COLLATION "German" NO PAD CREATE USER SA PASSWORD DIGEST 'd41d8cd98f00b204e9800998ecf8427e' CREATE SCHEMA PUBLIC AUTHORIZATION DBA diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/JWPLTimeMachine.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/JWPLTimeMachine.java index ec913c56..45810b2e 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/JWPLTimeMachine.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/JWPLTimeMachine.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,68 +28,57 @@ * The main method gets the path of a configuration file as * argument
*
- * + *

* Refactored on 16 April 2009 by Ivan Galkin . - * - * - * */ public class JWPLTimeMachine { - private static final IEnvironmentFactory environmentFactory = SpringFactory - .getInstance(); + private static final IEnvironmentFactory environmentFactory = SpringFactory.getInstance(); - private static final long startTime = System.currentTimeMillis(); - private static final ILogger logger = environmentFactory.getLogger(); + private static final long startTime = System.currentTimeMillis(); + private static final ILogger logger = environmentFactory.getLogger(); - /** - * Checks given arguments - * - * @param args - *
- * args[0] the settings file like described in - * {@link SettingsXML}
- * - * @return true if all necessary arguments are given and false otherwise - * - * @see SettingsXML - */ - private static boolean checkArgs(String[] args) { - boolean result = (args.length > 0); - if (!result) { - System.out - .println("Usage: java -jar JWPLTimeMachine.jar "); - } - return result; - } + /** + * Checks given arguments + * + * @param args
+ * args[0] the settings file like described in + * {@link SettingsXML}
+ * @return true if all necessary arguments are given and false otherwise + * @see SettingsXML + */ + private static boolean checkArgs(String[] args) { + boolean result = (args.length > 0); + if (!result) { + System.out.println("Usage: java -jar JWPLTimeMachine.jar "); + } + return result; + } - public static void main(String[] args) { + public static void main(String[] args) { - try { - if (checkArgs(args)) { - logger.log("parsing configuration file...."); - Configuration config = SettingsXML.loadConfiguration(args[0], - logger); - TimeMachineFiles files = SettingsXML.loadFiles(args[0], logger); + try { + if (checkArgs(args)) { + logger.log("parsing configuration file...."); + Configuration config = SettingsXML.loadConfiguration(args[0], logger); + TimeMachineFiles files = SettingsXML.loadFiles(args[0], logger); - if (config != null && files != null) { - if (files.checkAll() && config.checkTimeConfig()) { - logger.log("processing data ..."); + if (config != null && files != null) { + if (files.checkAll() && config.checkTimeConfig()) { + logger.log("processing data ..."); - ISnapshotGenerator generator = environmentFactory - .getSnapshotGenerator(); - generator.setConfiguration(config); - generator.setFiles(files); - generator.start(); + ISnapshotGenerator generator = environmentFactory + .getSnapshotGenerator(); + generator.setConfiguration(config); + generator.setFiles(files); + generator.start(); - logger.log("End of the application. Working time = " - + (System.currentTimeMillis() - - startTime) + " ms"); - } - } - } - } catch (Exception e) { - logger.log(e); - } - } + logger.log("End of the application. 
Working time = " + (System.currentTimeMillis() - startTime) + " ms"); + } + } + } + } catch (Exception e) { + logger.log(e); + } + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/Revision.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/Revision.java index a3f857be..9baf8bac 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/Revision.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/Revision.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,72 +19,69 @@ /** * Routines for the conversion of the Wikipedia revisions - * */ public class Revision { - /** - * Calendar.getInstance().set(2000,0,1) out relative time zero to saving - * memory - */ - private static final Long TIME_ZERO = 946724195435L; - /** - * We measure the time not from 1th January 1900 but from 1th January 2000 - */ - private static final Integer MS_IN_SEC = 1000; + /** + * Calendar.getInstance().set(2000,0,1) out relative time zero to saving + * memory + */ + private static final Long TIME_ZERO = 946724195435L; + /** + * We measure the time not from 1th January 1900 but from 1th January 2000 + */ + private static final Integer MS_IN_SEC = 1000; - private Revision() { + private Revision() { - } + } - /** - * Compress time from long- to the integer-format: reduce the resolution to - * "seconds" and zero time to 1th January 2000 - * - * @param date - * date/time in the long format - * @return date/time in the compressed integer format - */ - public static int compressTime(long date) { - Long lowResolutionDate = (date - TIME_ZERO) / MS_IN_SEC; - return lowResolutionDate.intValue(); - } + /** + * Compress time from long- to the integer-format: reduce the resolution to + * "seconds" and zero time to 1th January 2000 + * + * @param date date/time in the long format + * @return date/time in the compressed integer format + */ + public static int compressTime(long date) { + Long lowResolutionDate = (date - TIME_ZERO) / MS_IN_SEC; + return lowResolutionDate.intValue(); + } - /** - * Extract time, that was compressed with - * {@link Revision#compressTime(long)} - * - * @param compressedDate - * compressed date/time in the integer format - * @return date/time in the long format - */ - public static long extractTime(int compressedDate) { - return (long) compressedDate * MS_IN_SEC + TIME_ZERO; - } + /** + * Extract time, that was compressed with + * {@link Revision#compressTime(long)} + * + * @param compressedDate compressed date/time in the 
integer format + * @return date/time in the long format + */ + public static long extractTime(int compressedDate) { + return (long) compressedDate * MS_IN_SEC + TIME_ZERO; + } - /** - * Merge two unsigned integer values (text id and time stamp) to one long - * value (revision) to use GNU Trove container. - */ - public static long createRevision(int textId, int timestamp) { - return (long) textId << 32 | (long) timestamp; - } + /** + * Merge two unsigned integer values (text id and time stamp) to one long + * value (revision) to use GNU Trove container. + */ + public static long createRevision(int textId, int timestamp) { + return (long) textId << 32 | (long) timestamp; + } - /** - * Extract a time stamp from the revision long. - * - * @return time stamp - */ - public static int getTimestamp(long revision) { - return (int) (revision & 0x00000000FFFFFFFFL); - } + /** + * Extract a time stamp from the revision long. + * + * @return time stamp + */ + public static int getTimestamp(long revision) { + return (int) (revision & 0x00000000FFFFFFFFL); + } - /** - * Extract a text ID from the revision long - * - * @return text ID - */ - public static int getTextId(long revision) { - return (int) (revision >>> 32); - } + /** + * Extract a text ID from the revision long + * + * @return text ID + */ + public static int getTextId(long revision) { + return (int) (revision >>> 32); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/SettingsXML.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/SettingsXML.java index 9980f8fe..2474c223 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/SettingsXML.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/SettingsXML.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,97 +31,90 @@ /** * This is a utility class that generates a template for the configuration file
* The template must be edited prior to be used for the DBMapping tool.
- * - * */ public class SettingsXML { - public static final String OUTPUT_DIRECTORY = "outputDirectory"; - public static final String CATEGORY_LINKS_FILE = "categoryLinksFile"; - public static final String PAGE_LINKS_FILE = "pageLinksFile"; - public static final String META_HISTORY_FILE = "metaHistoryFile"; - public static final String EACH = "each"; - public static final String TO_TIMESTAMP = "toTimestamp"; - public static final String FROM_TIMESTAMP = "fromTimestamp"; - public static final String DISAMBIGUATION_CATEGORY = "disambiguationCategory"; - public static final String MAIN_CATEGORY = "mainCategory"; - public static final String LANGUAGE = "language"; - - private static final String DESCRIPTION = "This a configuration formular for the DBMapping Tool of the JWPL"; - private static final String PLACEHOLDER = "to be edited"; - - - public static void generateSample(String outputFileName) throws IOException { - - Properties p = new Properties(); - p.put(LANGUAGE, PLACEHOLDER); - p.put(MAIN_CATEGORY, PLACEHOLDER); - p.put(DISAMBIGUATION_CATEGORY, PLACEHOLDER); - p.put(FROM_TIMESTAMP, PLACEHOLDER); - p.put(TO_TIMESTAMP, PLACEHOLDER); - p.put(EACH, PLACEHOLDER); - p.put(META_HISTORY_FILE, PLACEHOLDER); - p.put(PAGE_LINKS_FILE, PLACEHOLDER); - p.put(CATEGORY_LINKS_FILE, PLACEHOLDER); - p.put(OUTPUT_DIRECTORY, PLACEHOLDER); - p.storeToXML(new BufferedOutputStream(new FileOutputStream(outputFileName)), DESCRIPTION); - - } - - public static Configuration loadConfiguration(String configFile, - ILogger logger) { - - Configuration result; - try { - result = new Configuration(logger); - Properties properties = new Properties(); - properties.loadFromXML(new BufferedInputStream(new FileInputStream(configFile))); - - result.setLanguage(properties.get(LANGUAGE).toString()); - result.setMainCategory(properties.get(MAIN_CATEGORY).toString()); - result.setDisambiguationCategory(properties.get( - DISAMBIGUATION_CATEGORY).toString()); - 
result.setFromTimestamp(TimestampUtil.parse(properties.get( - FROM_TIMESTAMP).toString())); - result.setToTimestamp(TimestampUtil.parse(properties.get( - TO_TIMESTAMP).toString())); - result.setEach(Integer.parseInt(properties.get(EACH).toString())); - } catch (Exception e) { - result = null; - } - return result; - } - - public static TimeMachineFiles loadFiles(String configFile, ILogger logger) { - TimeMachineFiles result; - try { - Properties properties = new Properties(); - properties.loadFromXML(new BufferedInputStream(new FileInputStream(configFile))); - result = new TimeMachineFiles(logger); - - result.setMetaHistoryFile(properties.get(META_HISTORY_FILE) - .toString()); - result.setPageLinksFile(properties.get(PAGE_LINKS_FILE).toString()); - result.setCategoryLinksFile(properties.get(CATEGORY_LINKS_FILE) - .toString()); - result.setOutputDirectory(properties.get(OUTPUT_DIRECTORY) - .toString()); - } catch (Exception e) { - logger.log("Could not load config file " + configFile); - result = null; - } - return result; - } - - public static void main(String[] args) { - if (args.length > 0) { - try { - generateSample(args[0]); - } catch (IOException e) { - e.printStackTrace(); - } - } - - } + public static final String OUTPUT_DIRECTORY = "outputDirectory"; + public static final String CATEGORY_LINKS_FILE = "categoryLinksFile"; + public static final String PAGE_LINKS_FILE = "pageLinksFile"; + public static final String META_HISTORY_FILE = "metaHistoryFile"; + public static final String EACH = "each"; + public static final String TO_TIMESTAMP = "toTimestamp"; + public static final String FROM_TIMESTAMP = "fromTimestamp"; + public static final String DISAMBIGUATION_CATEGORY = "disambiguationCategory"; + public static final String MAIN_CATEGORY = "mainCategory"; + public static final String LANGUAGE = "language"; + + private static final String DESCRIPTION = "This a configuration formular for the DBMapping Tool of the JWPL"; + private static final String PLACEHOLDER = 
"to be edited"; + + + public static void generateSample(String outputFileName) throws IOException { + + Properties p = new Properties(); + p.put(LANGUAGE, PLACEHOLDER); + p.put(MAIN_CATEGORY, PLACEHOLDER); + p.put(DISAMBIGUATION_CATEGORY, PLACEHOLDER); + p.put(FROM_TIMESTAMP, PLACEHOLDER); + p.put(TO_TIMESTAMP, PLACEHOLDER); + p.put(EACH, PLACEHOLDER); + p.put(META_HISTORY_FILE, PLACEHOLDER); + p.put(PAGE_LINKS_FILE, PLACEHOLDER); + p.put(CATEGORY_LINKS_FILE, PLACEHOLDER); + p.put(OUTPUT_DIRECTORY, PLACEHOLDER); + p.storeToXML(new BufferedOutputStream(new FileOutputStream(outputFileName)), DESCRIPTION); + + } + + public static Configuration loadConfiguration(String configFile, ILogger logger) { + Configuration result; + try { + result = new Configuration(logger); + Properties properties = new Properties(); + properties.loadFromXML(new BufferedInputStream(new FileInputStream(configFile))); + + result.setLanguage(properties.get(LANGUAGE).toString()); + result.setMainCategory(properties.get(MAIN_CATEGORY).toString()); + result.setDisambiguationCategory(properties.get( + DISAMBIGUATION_CATEGORY).toString()); + result.setFromTimestamp(TimestampUtil.parse(properties.get( + FROM_TIMESTAMP).toString())); + result.setToTimestamp(TimestampUtil.parse(properties.get( + TO_TIMESTAMP).toString())); + result.setEach(Integer.parseInt(properties.get(EACH).toString())); + } catch (Exception e) { + result = null; + } + return result; + } + + public static TimeMachineFiles loadFiles(String configFile, ILogger logger) { + TimeMachineFiles result; + try { + Properties properties = new Properties(); + properties.loadFromXML(new BufferedInputStream(new FileInputStream(configFile))); + result = new TimeMachineFiles(logger); + + result.setMetaHistoryFile(properties.get(META_HISTORY_FILE).toString()); + result.setPageLinksFile(properties.get(PAGE_LINKS_FILE).toString()); + result.setCategoryLinksFile(properties.get(CATEGORY_LINKS_FILE).toString()); + 
result.setOutputDirectory(properties.get(OUTPUT_DIRECTORY).toString()); + } catch (Exception e) { + logger.log("Could not load config file " + configFile); + result = null; + } + return result; + } + + public static void main(String[] args) { + if (args.length > 0) { + try { + generateSample(args[0]); + } catch (IOException e) { + e.printStackTrace(); + } + } + + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineFiles.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineFiles.java index caf2ede9..72bda788 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineFiles.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineFiles.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -26,84 +26,79 @@ public class TimeMachineFiles extends Files { - private static final String NO_CATEGORYLINKS = "category links file not found"; - private static final String NO_METAHISTORY = "meta history file not found"; - private static final String NO_PAGELINKS = "page links file not found"; - - private String metaHistoryFile; - private String pageLinksFile; - private String categoryLinksFile; - private String timeStamp = ""; - - public TimeMachineFiles(ILogger logger) { - super(logger); - } - - public TimeMachineFiles(TimeMachineFiles files) { - super(files); - this.metaHistoryFile = files.metaHistoryFile; - this.pageLinksFile = files.metaHistoryFile; - this.categoryLinksFile = files.categoryLinksFile; - } - - /** - * Add a sub directory called "timestamp" to the current output directory - * - * @param timestamp - * - name of a new sub directory - */ - public void setTimestamp(Timestamp timestamp) { - - timeStamp = TimestampUtil.toMediaWikiString(timestamp) + File.separator; - } - - public String getMetaHistoryFile() { - return metaHistoryFile; - } - - public void setMetaHistoryFile(String metaHistroyFile) { - this.metaHistoryFile = metaHistroyFile; - } - - public String getPageLinksFile() { - return pageLinksFile; - } - - public void setPageLinksFile(String pageLinksFile) { - this.pageLinksFile = pageLinksFile; - } - - public String getCategoryLinksFile() { - return categoryLinksFile; - } - - public void setCategoryLinksFile(String categoryLinksFile) { - this.categoryLinksFile = categoryLinksFile; - } - - public boolean checkInputFile(String fileName, String errorMessage) { - File inputFile = new File(fileName); - boolean result = inputFile.exists() && inputFile.canRead(); - if (!result) { - logger.log(errorMessage); - } - return result; - } - - @Override - protected String getOutputPath(String fileName) { - File outputSubDirectory = new File(outputDirectory.getAbsolutePath() - + File.separator + timeStamp); - outputSubDirectory.mkdir(); - return 
outputDirectory.getAbsolutePath() + File.separator + timeStamp - + fileName; - } - - @Override - public boolean checkAll() { - return checkOutputDirectory() - && checkInputFile(metaHistoryFile, NO_METAHISTORY) - && checkInputFile(pageLinksFile, NO_PAGELINKS) - && checkInputFile(categoryLinksFile, NO_CATEGORYLINKS); - } + private static final String NO_CATEGORYLINKS = "category links file not found"; + private static final String NO_METAHISTORY = "meta history file not found"; + private static final String NO_PAGELINKS = "page links file not found"; + + private String metaHistoryFile; + private String pageLinksFile; + private String categoryLinksFile; + private String timeStamp = ""; + + public TimeMachineFiles(ILogger logger) { + super(logger); + } + + public TimeMachineFiles(TimeMachineFiles files) { + super(files); + this.metaHistoryFile = files.metaHistoryFile; + this.pageLinksFile = files.pageLinksFile; + this.categoryLinksFile = files.categoryLinksFile; + } + + /** + * Add a sub directory called "timestamp" to the current output directory + * + * @param timestamp - name of a new sub directory + */ + public void setTimestamp(Timestamp timestamp) { + + timeStamp = TimestampUtil.toMediaWikiString(timestamp) + File.separator; + } + + public String getMetaHistoryFile() { + return metaHistoryFile; + } + + public void setMetaHistoryFile(String metaHistroyFile) { + this.metaHistoryFile = metaHistroyFile; + } + + public String getPageLinksFile() { + return pageLinksFile; + } + + public void setPageLinksFile(String pageLinksFile) { + this.pageLinksFile = pageLinksFile; + } + + public String getCategoryLinksFile() { + return categoryLinksFile; + } + + public void setCategoryLinksFile(String categoryLinksFile) { + this.categoryLinksFile = categoryLinksFile; + } + + public boolean checkInputFile(String fileName, String errorMessage) { + File inputFile = new File(fileName); + boolean result = inputFile.exists() && inputFile.canRead(); + if (!result) { + 
logger.log(errorMessage); + } + return result; + } + + @Override + protected String getOutputPath(String fileName) { + File outputSubDirectory = new File(outputDirectory.getAbsolutePath() + File.separator + timeStamp); + outputSubDirectory.mkdir(); + return outputDirectory.getAbsolutePath() + File.separator + timeStamp + fileName; + } + + @Override + public boolean checkAll() { + return checkOutputDirectory() && checkInputFile(metaHistoryFile, NO_METAHISTORY) + && checkInputFile(pageLinksFile, NO_PAGELINKS) && checkInputFile(categoryLinksFile, NO_CATEGORYLINKS); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineGenerator.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineGenerator.java index 22d31b4e..85eb986f 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineGenerator.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineGenerator.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -41,166 +41,154 @@ * By specifying a 'from' and a 'to' time stamps and the number of days to take * as interval
* this class produces multiple dump versions. - * - * */ public class TimeMachineGenerator extends AbstractSnapshotGenerator { - private IDumpVersion[] versions = null; - private TimeMachineFiles initialFiles = null; + private IDumpVersion[] versions = null; + private TimeMachineFiles initialFiles = null; - public TimeMachineGenerator(IEnvironmentFactory environmentFactory) { - super(environmentFactory); - } + public TimeMachineGenerator(IEnvironmentFactory environmentFactory) { + super(environmentFactory); + } - @Override - public void setFiles(Files files) { - initialFiles = (TimeMachineFiles) files; - } + @Override + public void setFiles(Files files) { + initialFiles = (TimeMachineFiles) files; + } - private Integer calculateSnapshotsCount(Timestamp from, Timestamp to, - Integer dayInterval) { - Integer result = 0; + private Integer calculateSnapshotsCount(Timestamp from, Timestamp to, + Integer dayInterval) { + Integer result = 0; - for (Timestamp i = from; i.before(to); i = TimestampUtil - .getNextTimestamp(i, dayInterval)) { - result++; - } + for (Timestamp i = from; i.before(to); i = TimestampUtil.getNextTimestamp(i, dayInterval)) { + result++; + } - return result; - } + return result; + } - @Override - public void start() throws Exception { + @Override + public void start() throws Exception { - Timestamp fromTimestamp = configuration.getFromTimestamp(); - Timestamp toTimestamp = configuration.getToTimestamp(); - Integer each = configuration.getEach(); + Timestamp fromTimestamp = configuration.getFromTimestamp(); + Timestamp toTimestamp = configuration.getToTimestamp(); + int each = configuration.getEach(); - Integer snapshotsCount = fromTimestamp.equals(toTimestamp) ? 1 - : calculateSnapshotsCount(fromTimestamp, toTimestamp, each); + int snapshotsCount = fromTimestamp.equals(toTimestamp) ? 
1 + : calculateSnapshotsCount(fromTimestamp, toTimestamp, each); - if (snapshotsCount > 0) { + if (snapshotsCount > 0) { - versions = new IDumpVersion[snapshotsCount]; - logger.log("Dumps to be generated:"); + versions = new IDumpVersion[snapshotsCount]; + logger.log("Dumps to be generated:"); - for (int i = 0; i < snapshotsCount; i++) { + for (int i = 0; i < snapshotsCount; i++) { - Timestamp currentTimestamp = TimestampUtil.getNextTimestamp( - fromTimestamp, i * each); - logger.log(currentTimestamp); + Timestamp currentTimestamp = TimestampUtil.getNextTimestamp(fromTimestamp, i * each); + logger.log(currentTimestamp); - MetaData commonMetaData = MetaData - .initWithConfig(configuration); - commonMetaData.setTimestamp(currentTimestamp); + MetaData commonMetaData = MetaData.initWithConfig(configuration); + commonMetaData.setTimestamp(currentTimestamp); - IDumpVersion version = environmentFactory.getDumpVersion(); + IDumpVersion version = environmentFactory.getDumpVersion(); - version.initialize(currentTimestamp); - version.setMetaData(commonMetaData); - TimeMachineFiles currentFiles = new TimeMachineFiles( - initialFiles); - currentFiles.setTimestamp(currentTimestamp); - version.setFiles(currentFiles); - versions[i] = version; - } + version.initialize(currentTimestamp); + version.setMetaData(commonMetaData); + TimeMachineFiles currentFiles = new TimeMachineFiles( + initialFiles); + currentFiles.setTimestamp(currentTimestamp); + version.setFiles(currentFiles); + versions[i] = version; + } - processInputDumps(); + processInputDumps(); - } else { - logger.log("No timestamps."); - } - } + } else { + logger.log("No timestamps."); + } + } - private void processInputDumps() throws IOException { + private void processInputDumps() throws IOException { - dumpVersionProcessor.setDumpVersions(versions); + dumpVersionProcessor.setDumpVersions(versions); - logger.log("Processing the revision table"); - dumpVersionProcessor.processRevision(createRevisionParser()); + 
logger.log("Processing the revision table"); + dumpVersionProcessor.processRevision(createRevisionParser()); - logger.log("Processing the page table"); - dumpVersionProcessor.processPage(createPageParser()); + logger.log("Processing the page table"); + dumpVersionProcessor.processPage(createPageParser()); - logger.log("Processing the categorylinks table"); - dumpVersionProcessor.processCategorylinks(createCategorylinksParser()); + logger.log("Processing the categorylinks table"); + dumpVersionProcessor.processCategorylinks(createCategorylinksParser()); - logger.log("Processing the pagelinks table"); - dumpVersionProcessor.processPagelinks(createPagelinksParser()); + logger.log("Processing the pagelinks table"); + dumpVersionProcessor.processPagelinks(createPagelinksParser()); - logger.log("Processing the text table"); - dumpVersionProcessor.processText(createTextParser()); + logger.log("Processing the text table"); + dumpVersionProcessor.processText(createTextParser()); - logger.log("Writing meta data"); - dumpVersionProcessor.writeMetaData(); - } + logger.log("Writing meta data"); + dumpVersionProcessor.writeMetaData(); + } - private RevisionParser createRevisionParser() throws IOException { + private RevisionParser createRevisionParser() throws IOException { - String metahistory = initialFiles.getMetaHistoryFile(); + String metahistory = initialFiles.getMetaHistoryFile(); - DumpTableInputStream revisionTableInputStream = environmentFactory - .getDumpTableInputStream(); - revisionTableInputStream.initialize(decompressor - .getInputStream(metahistory), DumpTableEnum.REVISION); + DumpTableInputStream revisionTableInputStream = environmentFactory.getDumpTableInputStream(); + revisionTableInputStream.initialize(decompressor.getInputStream(metahistory), DumpTableEnum.REVISION); - RevisionParser revisionParser = environmentFactory.getRevisionParser(); - revisionParser.setInputStream(revisionTableInputStream); + RevisionParser revisionParser = 
environmentFactory.getRevisionParser(); + revisionParser.setInputStream(revisionTableInputStream); - return revisionParser; + return revisionParser; - } + } - private PageParser createPageParser() throws IOException { + private PageParser createPageParser() throws IOException { - String metahistory = initialFiles.getMetaHistoryFile(); + String metahistory = initialFiles.getMetaHistoryFile(); - DumpTableInputStream pageTableInputStream = environmentFactory - .getDumpTableInputStream(); - pageTableInputStream.initialize(decompressor - .getInputStream(metahistory), DumpTableEnum.PAGE); + DumpTableInputStream pageTableInputStream = environmentFactory.getDumpTableInputStream(); + pageTableInputStream.initialize(decompressor.getInputStream(metahistory), DumpTableEnum.PAGE); - PageParser pageParser = environmentFactory.getPageParser(); - pageParser.setInputStream(pageTableInputStream); + PageParser pageParser = environmentFactory.getPageParser(); + pageParser.setInputStream(pageTableInputStream); - return pageParser; + return pageParser; - } + } - private CategorylinksParser createCategorylinksParser() throws IOException { + private CategorylinksParser createCategorylinksParser() throws IOException { - String categorylinks = initialFiles.getCategoryLinksFile(); - InputStream categorylinksStream = decompressor - .getInputStream(categorylinks); + String categorylinks = initialFiles.getCategoryLinksFile(); + InputStream categorylinksStream = decompressor.getInputStream(categorylinks); - return new CategorylinksParser(categorylinksStream); + return new CategorylinksParser(categorylinksStream); - } + } - private PagelinksParser createPagelinksParser() throws IOException { + private PagelinksParser createPagelinksParser() throws IOException { - String pagelinks = initialFiles.getPageLinksFile(); + String pagelinks = initialFiles.getPageLinksFile(); - InputStream pagelinksStream = decompressor.getInputStream(pagelinks); - return new PagelinksParser(pagelinksStream); + 
InputStream pagelinksStream = decompressor.getInputStream(pagelinks); + return new PagelinksParser(pagelinksStream); - } + } - private TextParser createTextParser() throws IOException { + private TextParser createTextParser() throws IOException { - String metahistory = initialFiles.getMetaHistoryFile(); + String metahistory = initialFiles.getMetaHistoryFile(); - DumpTableInputStream textTableIntputStream = environmentFactory - .getDumpTableInputStream(); - textTableIntputStream.initialize(decompressor - .getInputStream(metahistory), DumpTableEnum.TEXT); + DumpTableInputStream textTableIntputStream = environmentFactory.getDumpTableInputStream(); + textTableIntputStream.initialize(decompressor.getInputStream(metahistory), DumpTableEnum.TEXT); - TextParser textParser = environmentFactory.getTextParser(); - textParser.setInputStream(textTableIntputStream); + TextParser textParser = environmentFactory.getTextParser(); + textParser.setInputStream(textTableIntputStream); - return textParser; + return textParser; - } + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java index 9104e7c1..2e3f19f1 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,6 +22,9 @@ import java.util.HashMap; import java.util.Map; +import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; +import it.unimi.dsi.fastutil.ints.IntArraySet; +import it.unimi.dsi.fastutil.ints.IntSet; import org.dkpro.jwpl.timemachine.domain.Revision; import org.dkpro.jwpl.wikimachine.dump.sql.CategorylinksParser; import org.dkpro.jwpl.wikimachine.dump.sql.PagelinksParser; @@ -32,339 +35,314 @@ import org.dkpro.jwpl.wikimachine.util.Redirects; import org.dkpro.jwpl.wikimachine.util.TimestampUtil; import org.dkpro.jwpl.wikimachine.util.TxtFileWriter; -import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; -import it.unimi.dsi.fastutil.ints.IntArraySet; -import it.unimi.dsi.fastutil.ints.IntSet; public class DumpVersionFastUtilIntKey extends AbstractDumpVersion { - private static final String SQL_NULL = "NULL"; - /** - * maps page id's to Revision objects - */ - private HashMap pageIdRevMap; - /** - * after revision parsing the map will be erased and the keys sorted in the - * array list - */ - private IntSet pageIdRevList; - - /** - * caches the page id's of disambiguation pages. - */ - private IntSet disambiguations; - /** - * maps text id's to the page id's. - */ - private Int2IntOpenHashMap textIdPageIdMap; - /** - * maps page id's of pages to their names - */ - private Map pPageIdNameMap; - /** - * maps names of pages to their page id's. - */ - private Int2IntOpenHashMap pNamePageIdMap; - - /** - * maps names of categories to their page id's. 
- */ - private Int2IntOpenHashMap cNamePageIdMap; - - /** - * maps page id's of redirects to their names. - */ - private Map rPageIdNameMap; - - @Override - public void freeAfterCategoryLinksParsing() { - String message = "clearing cNamePageIdMap of " + cNamePageIdMap.size() - + " objects"; - logger.log(message); - - cNamePageIdMap.clear(); - - } - - @Override - public void freeAfterPageLinksParsing() { - // nothing to free - } - - @Override - public void freeAfterPageParsing() { - String message = "clearing pageIdRevSet of " + pageIdRevList.size() - + " objects"; - - logger.log(message); - pageIdRevList.clear(); - } - - @Override - public void freeAfterRevisonParsing() { - pageIdRevList = new IntArraySet(pageIdRevMap.keySet().size()); - for (int key : pageIdRevMap.keySet()) { - pageIdRevList.add(key); - } - - pageIdRevMap.clear(); - } - - @Override - public void freeAfterTextParsing() { - pageIdRevMap.clear(); - pageIdRevList.clear(); - disambiguations.clear(); - textIdPageIdMap.clear(); - pPageIdNameMap.clear(); - pNamePageIdMap.clear(); - cNamePageIdMap.clear(); - rPageIdNameMap.clear(); - } - - @Override - public void initialize(Timestamp timestamp) { - this.timestamp = Revision.compressTime(timestamp.getTime()); - - /* - * filled in revisions - */ - pageIdRevMap = new HashMap<>(); - textIdPageIdMap = new Int2IntOpenHashMap(); - - /* - * filled in pages - */ - pPageIdNameMap = new HashMap<>(); - pNamePageIdMap = new Int2IntOpenHashMap(); - - cNamePageIdMap = new Int2IntOpenHashMap(); - rPageIdNameMap = new HashMap<>(); - - /* - * filled in categories - */ - disambiguations = new IntArraySet(); - } - - @Override - public void processCategoryLinksRow(CategorylinksParser clParser) - throws IOException { - String cl_to_text = clParser.getClTo(); - if (cl_to_text != null) { - int cl_to_textHashcode = cl_to_text.hashCode(); - // if category exists - - if (cNamePageIdMap.containsKey(cl_to_textHashcode)) { - int cl_to = cNamePageIdMap.get(cl_to_textHashcode); - // if 
the link source is a page then write the link in - // category_pages and page_categories - int cl_from = clParser.getClFrom(); - // if exists page - if (pPageIdNameMap.containsKey(cl_from)) { - processCategoryLinksRowPageExists(cl_from, cl_to, - cl_to_text); - } else { - processCateforyLinksRowPageMiss(cl_from, cl_to); - } - } - } - } - - private void processCategoryLinksRowPageExists(Integer cl_from, - Integer cl_to, String cl_to_text) throws IOException { - - categoryPages.addRow(cl_to, cl_from); - pageCategories.addRow(cl_from, cl_to); - if (cl_to_text.equals(metaData.getDisambiguationCategory())) { - disambiguations.add(cl_from.intValue()); - metaData.addDisamb(); - } - } - - private void processCateforyLinksRowPageMiss(Integer cl_from, Integer cl_to) - throws IOException { - // if category page id exists - if (cNamePageIdMap.containsValue(cl_from.intValue())) { - categoryOutlinks.addRow(cl_to, cl_from); - categoryInlinks.addRow(cl_from, cl_to); - } - } - - @Override - public void processPageLinksRow(PagelinksParser plParser) - throws IOException { - int pl_from = plParser.getPlFrom(); - String pl_to = plParser.getPlTo(); - if (pl_to != null) { - int pl_toHashcode = pl_to.hashCode(); - // if page name and page page id exists - - if ((!skipPage || pPageIdNameMap.containsKey(pl_from)) - && pNamePageIdMap.containsKey(pl_toHashcode)) { - int id = pNamePageIdMap.get(pl_toHashcode); - pageOutlinks.addRow(pl_from, id); - pageInlinks.addRow(id, pl_from); - } - } - } - - @Override - public void processPageRow(PageParser pageParser) throws IOException { - switch (pageParser.getPageNamespace()) { - case NS_CATEGORY: { - processPageRowCategory(pageParser); - break; - } - case NS_MAIN: { - processPageRowPage(pageParser); - break; - } - } - } - - private void processPageRowCategory(PageParser pageParser) - throws IOException { - if (!(skipCategory && pageParser.getPageIsRedirect())) { - // retrieve page id and page title - int page_id = pageParser.getPageId(); - // ignore 
categories, which have no revisions before the time stamp - String page_title = pageParser.getPageTitle(); - if (page_title != null && pageIdRevList.contains(page_id)) { - // cache the retrieved values - // record category - cNamePageIdMap.put(page_title.hashCode(), page_id); - // write a new row in the table Category. - // Note that we also consider the page_id as id - txtFW.addRow(page_id, page_id, page_title); - metaData.addCategory(); - } - } - } - - private void processPageRowPage(PageParser pageParser) throws IOException { - // retrieve page id and title - int page_id = pageParser.getPageId(); - // ignore pages, which have no revisions prior to the time stamp - String page_title = pageParser.getPageTitle(); - if (page_title != null && pageIdRevList.contains(page_id)) { - // distinguish redirects - if (pageParser.getPageIsRedirect()) { - // record redirect - rPageIdNameMap.put(page_id, page_title); - } else { - // record page - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put(page_title.hashCode(), page_id); - } - } - } - - @Override - public void processRevisionRow(RevisionParser revisionParser) { - // get the time stamp of the revision - int rev_timestamp = revisionParser.getRevTimestamp(); - if (rev_timestamp < timestamp) { - // get the rev_page (corresponds to page_id in the table page) - Integer rev_page = revisionParser.getRevPage(); - if (pageIdRevMap.containsKey(rev_page)) { - processRevisionRowContainsKey(revisionParser, rev_page, - rev_timestamp); - } else { - processRevisionRowMissKey(revisionParser, rev_page, - rev_timestamp); - } - } - - } - - private void processRevisionRowContainsKey(RevisionParser revisionParser, - int rev_page, int rev_timestamp) { - - long revisionRecord = pageIdRevMap.get(rev_page); - int old_timestamp = Revision.getTimestamp(revisionRecord); - - // is it a better time stamp ? 
- if (rev_timestamp > old_timestamp) { - int old_text_id = Revision.getTextId(revisionRecord); - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser - .getRevTextId(), rev_timestamp)); - textIdPageIdMap.remove(old_text_id); - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); - } - } - - private void processRevisionRowMissKey(RevisionParser revisionParser, - int rev_page, int rev_timestamp) { - // this is the first recorded time stamp for that page id - - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser - .getRevTextId(), rev_timestamp)); - - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); - } - - @Override - public void processTextRow(TextParser textParser) throws IOException { - int text_id = textParser.getOldId(); - - if (textIdPageIdMap.containsKey(text_id)) { - int page_id = textIdPageIdMap.get(text_id); - // if exists page page id -> page - if (pPageIdNameMap.containsKey(page_id)) { - processTextRowPage(textParser, page_id); - } else if (rPageIdNameMap.containsKey(page_id)) { - // if exists redirect -> redirect - processTextRowRedirect(textParser, page_id); - } - } - } - - private void processTextRowPage(TextParser textParser, int page_id) - throws IOException { - // get page name - String pageName = pPageIdNameMap.get(page_id); - - page.addRow(page_id, page_id, pageName, textParser.getOldText(), - formatBoolean(disambiguations.contains(page_id))); - - pageMapLine.addRow(page_id, pageName, page_id, SQL_NULL, SQL_NULL); - metaData.addPage(); - } - - private void processTextRowRedirect(TextParser textParser, int page_id) - throws IOException { - String destination = Redirects.getRedirectDestination(textParser - .getOldText()); - - if (destination != null) { - // if page name exists - - int destinationHashcode = destination.hashCode(); - - if (pNamePageIdMap.containsKey(destinationHashcode)) { - int id = pNamePageIdMap.get(destinationHashcode); - String redirectName = rPageIdNameMap.get(page_id); - 
pageRedirects.addRow(id, redirectName); - pageMapLine.addRow(page_id, redirectName, id, SQL_NULL, - SQL_NULL); - metaData.addRedirect(); - } - } - } - - @Override - public void writeMetaData() throws IOException { - TxtFileWriter outputFile = new TxtFileWriter(versionFiles - .getOutputMetadata()); - // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp - outputFile.addRow(metaData.getId(), metaData.getLanguage(), metaData - .getDisambiguationCategory(), metaData.getMainCategory(), - metaData.getNrOfPages(), metaData.getNrOfRedirects(), metaData - .getNrOfDisambiguations(), - metaData.getNrOfCategories(), TimestampUtil - .toMediaWikiString(metaData.getTimestamp())); - outputFile.flush(); - outputFile.close(); - } + private static final String SQL_NULL = "NULL"; + /** + * maps page id's to Revision objects + */ + private HashMap pageIdRevMap; + /** + * after revision parsing the map will be erased and the keys sorted in the + * array list + */ + private IntSet pageIdRevList; + + /** + * caches the page id's of disambiguation pages. + */ + private IntSet disambiguations; + /** + * maps text id's to the page id's. + */ + private Int2IntOpenHashMap textIdPageIdMap; + /** + * maps page id's of pages to their names + */ + private Map pPageIdNameMap; + /** + * maps names of pages to their page id's. + */ + private Int2IntOpenHashMap pNamePageIdMap; + + /** + * maps names of categories to their page id's. + */ + private Int2IntOpenHashMap cNamePageIdMap; + + /** + * maps page id's of redirects to their names. 
+ */ + private Map rPageIdNameMap; + + @Override + public void freeAfterCategoryLinksParsing() { + String message = "clearing cNamePageIdMap of " + cNamePageIdMap.size() + " objects"; + logger.log(message); + cNamePageIdMap.clear(); + } + + @Override + public void freeAfterPageLinksParsing() { + // nothing to free + } + + @Override + public void freeAfterPageParsing() { + logger.log("clearing pageIdRevSet of " + pageIdRevList.size() + " objects"); + pageIdRevList.clear(); + } + + @Override + public void freeAfterRevisonParsing() { + pageIdRevList = new IntArraySet(pageIdRevMap.keySet().size()); + for (int key : pageIdRevMap.keySet()) { + pageIdRevList.add(key); + } + + pageIdRevMap.clear(); + } + + @Override + public void freeAfterTextParsing() { + pageIdRevMap.clear(); + pageIdRevList.clear(); + disambiguations.clear(); + textIdPageIdMap.clear(); + pPageIdNameMap.clear(); + pNamePageIdMap.clear(); + cNamePageIdMap.clear(); + rPageIdNameMap.clear(); + } + + @Override + public void initialize(Timestamp timestamp) { + this.timestamp = Revision.compressTime(timestamp.getTime()); + + /* + * filled in revisions + */ + pageIdRevMap = new HashMap<>(); + textIdPageIdMap = new Int2IntOpenHashMap(); + + /* + * filled in pages + */ + pPageIdNameMap = new HashMap<>(); + pNamePageIdMap = new Int2IntOpenHashMap(); + + cNamePageIdMap = new Int2IntOpenHashMap(); + rPageIdNameMap = new HashMap<>(); + + /* + * filled in categories + */ + disambiguations = new IntArraySet(); + } + + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException { + String cl_to_text = clParser.getClTo(); + if (cl_to_text != null) { + int cl_to_textHashcode = cl_to_text.hashCode(); + // if category exists + + if (cNamePageIdMap.containsKey(cl_to_textHashcode)) { + int cl_to = cNamePageIdMap.get(cl_to_textHashcode); + // if the link source is a page then write the link in + // category_pages and page_categories + int cl_from = clParser.getClFrom(); + // if exists page 
+ if (pPageIdNameMap.containsKey(cl_from)) { + processCategoryLinksRowPageExists(cl_from, cl_to, + cl_to_text); + } else { + processCategoryLinksRowPageMiss(cl_from, cl_to); + } + } + } + } + + private void processCategoryLinksRowPageExists(Integer cl_from, Integer cl_to, String cl_to_text) throws IOException { + + categoryPages.addRow(cl_to, cl_from); + pageCategories.addRow(cl_from, cl_to); + if (cl_to_text.equals(metaData.getDisambiguationCategory())) { + disambiguations.add(cl_from.intValue()); + metaData.addDisamb(); + } + } + + private void processCategoryLinksRowPageMiss(Integer cl_from, Integer cl_to) throws IOException { + // if category page id exists + if (cNamePageIdMap.containsValue(cl_from.intValue())) { + categoryOutlinks.addRow(cl_to, cl_from); + categoryInlinks.addRow(cl_from, cl_to); + } + } + + @Override + public void processPageLinksRow(PagelinksParser plParser) throws IOException { + int pl_from = plParser.getPlFrom(); + String pl_to = plParser.getPlTo(); + if (pl_to != null) { + int pl_toHashcode = pl_to.hashCode(); + // if page name and page page id exists + + if ((!skipPage || pPageIdNameMap.containsKey(pl_from)) + && pNamePageIdMap.containsKey(pl_toHashcode)) { + int id = pNamePageIdMap.get(pl_toHashcode); + pageOutlinks.addRow(pl_from, id); + pageInlinks.addRow(id, pl_from); + } + } + } + + @Override + public void processPageRow(PageParser pageParser) throws IOException { + switch (pageParser.getPageNamespace()) { + case NS_CATEGORY: { + processPageRowCategory(pageParser); + break; + } + case NS_MAIN: { + processPageRowPage(pageParser); + break; + } + } + } + + private void processPageRowCategory(PageParser pageParser) throws IOException { + if (!(skipCategory && pageParser.getPageIsRedirect())) { + // retrieve page id and page title + int page_id = pageParser.getPageId(); + // ignore categories, which have no revisions before the time stamp + String page_title = pageParser.getPageTitle(); + if (page_title != null && 
pageIdRevList.contains(page_id)) { + // cache the retrieved values + // record category + cNamePageIdMap.put(page_title.hashCode(), page_id); + // write a new row in the table Category. + // Note that we also consider the page_id as id + txtFW.addRow(page_id, page_id, page_title); + metaData.addCategory(); + } + } + } + + private void processPageRowPage(PageParser pageParser) throws IOException { + // retrieve page id and title + int page_id = pageParser.getPageId(); + // ignore pages, which have no revisions prior to the time stamp + String page_title = pageParser.getPageTitle(); + if (page_title != null && pageIdRevList.contains(page_id)) { + // distinguish redirects + if (pageParser.getPageIsRedirect()) { + // record redirect + rPageIdNameMap.put(page_id, page_title); + } else { + // record page + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put(page_title.hashCode(), page_id); + } + } + } + + @Override + public void processRevisionRow(RevisionParser revisionParser) { + // get the time stamp of the revision + int rev_timestamp = revisionParser.getRevTimestamp(); + if (rev_timestamp < timestamp) { + // get the rev_page (corresponds to page_id in the table page) + int rev_page = revisionParser.getRevPage(); + if (pageIdRevMap.containsKey(rev_page)) { + processRevisionRowContainsKey(revisionParser, rev_page, + rev_timestamp); + } else { + processRevisionRowMissKey(revisionParser, rev_page, + rev_timestamp); + } + } + + } + + private void processRevisionRowContainsKey(RevisionParser revisionParser, int rev_page, int rev_timestamp) { + + long revisionRecord = pageIdRevMap.get(rev_page); + int old_timestamp = Revision.getTimestamp(revisionRecord); + + // is it a better time stamp ? 
+ if (rev_timestamp > old_timestamp) { + int old_text_id = Revision.getTextId(revisionRecord); + pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser + .getRevTextId(), rev_timestamp)); + textIdPageIdMap.remove(old_text_id); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } + } + + private void processRevisionRowMissKey(RevisionParser revisionParser, int rev_page, int rev_timestamp) { + // this is the first recorded time stamp for that page id + pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser.getRevTextId(), rev_timestamp)); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } + + @Override + public void processTextRow(TextParser textParser) throws IOException { + int text_id = textParser.getOldId(); + + if (textIdPageIdMap.containsKey(text_id)) { + int page_id = textIdPageIdMap.get(text_id); + // if exists page page id -> page + if (pPageIdNameMap.containsKey(page_id)) { + processTextRowPage(textParser, page_id); + } else if (rPageIdNameMap.containsKey(page_id)) { + // if exists redirect -> redirect + processTextRowRedirect(textParser, page_id); + } + } + } + + private void processTextRowPage(TextParser textParser, int page_id) throws IOException { + // get page name + String pageName = pPageIdNameMap.get(page_id); + + page.addRow(page_id, page_id, pageName, textParser.getOldText(), + formatBoolean(disambiguations.contains(page_id))); + + pageMapLine.addRow(page_id, pageName, page_id, SQL_NULL, SQL_NULL); + metaData.addPage(); + } + + private void processTextRowRedirect(TextParser textParser, int page_id) + throws IOException { + String destination = Redirects.getRedirectDestination(textParser.getOldText()); + + if (destination != null) { + // if page name exists + + int destinationHashcode = destination.hashCode(); + + if (pNamePageIdMap.containsKey(destinationHashcode)) { + int id = pNamePageIdMap.get(destinationHashcode); + String redirectName = rPageIdNameMap.get(page_id); + 
pageRedirects.addRow(id, redirectName); + pageMapLine.addRow(page_id, redirectName, id, SQL_NULL, SQL_NULL); + metaData.addRedirect(); + } + } + } + + @Override + public void writeMetaData() throws IOException { + TxtFileWriter outputFile = new TxtFileWriter(versionFiles.getOutputMetadata()); + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp + outputFile.addRow(metaData.getId(), metaData.getLanguage(), metaData.getDisambiguationCategory(), + metaData.getMainCategory(), metaData.getNrOfPages(), metaData.getNrOfRedirects(), + metaData.getNrOfDisambiguations(), metaData.getNrOfCategories(), + TimestampUtil.toMediaWikiString(metaData.getTimestamp())); + outputFile.flush(); + outputFile.close(); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKGeneric.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKGeneric.java index 4d20544b..b2ffd5bb 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKGeneric.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKGeneric.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -41,366 +41,333 @@ * Please be sure, that {@code hashCode(String)} of the provided HashAlgorithm type returns the * same type as KeyType * - * - * @param - * the type of the HashMap's key - * @param - * hashing algorithm, returning KeyType
- * + * @param the type of the HashMap's key + * @param hashing algorithm, returning KeyType
*/ -public class DumpVersionJDKGeneric - extends AbstractDumpVersion { - - private static final String SQL_NULL = "NULL"; - - /** - * maps page id's to Revision objects - */ - private HashMap pageIdRevMap; - /** - * after revision parsing the map will be erased and the keys sorted in the - * array list - */ - private Set pageIdRevList; - - /** - * caches the page id's of disambiguation pages. - */ - private Set disambiguations; - /** - * maps text id's to the page id's. - */ - private Map textIdPageIdMap; - /** - * maps page id's of pages to their names - */ - private Map pPageIdNameMap; - /** - * maps names of pages to their page id's. - */ - private Map pNamePageIdMap; - - /** - * maps names of categories to their page id's. - */ - private Map cNamePageIdMap; - - /** - * maps page id's of redirects to their names. - */ - private Map rPageIdNameMap; - - private final IStringHashCode hashAlgorithm; - - @SuppressWarnings("unchecked") - public DumpVersionJDKGeneric(Class hashAlgorithmClass) - throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException { - - hashAlgorithm = hashAlgorithmClass.getDeclaredConstructor().newInstance(); - @SuppressWarnings("unused") - KeyType hashAlgorithmResult = (KeyType) hashAlgorithm.hashCode("test"); - } - - @Override - public void freeAfterCategoryLinksParsing() { - String message = "clearing cNamePageIdMap of " + cNamePageIdMap.size() - + " objects"; - logger.log(message); - cNamePageIdMap.clear(); - } - - @Override - public void freeAfterPageLinksParsing() { - // nothing to free - } - - @Override - public void freeAfterPageParsing() { - String message = "clearing pageIdRevSet of " + pageIdRevList.size() - + " objects"; - logger.log(message); - pageIdRevList.clear(); - } - - @Override - public void freeAfterRevisonParsing() { - pageIdRevList = new HashSet<>(pageIdRevMap.keySet().size()); - pageIdRevList.addAll(pageIdRevMap.keySet()); - - pageIdRevMap.clear(); - } - - @Override - public 
void freeAfterTextParsing() { - pageIdRevMap.clear(); - pageIdRevList.clear(); - disambiguations.clear(); - textIdPageIdMap.clear(); - pPageIdNameMap.clear(); - pNamePageIdMap.clear(); - cNamePageIdMap.clear(); - rPageIdNameMap.clear(); - } - - @Override - public void initialize(Timestamp timestamp) { - this.timestamp = Revision.compressTime(timestamp.getTime()); - - /* - * filled in revisions - */ - pageIdRevMap = new HashMap<>(); - textIdPageIdMap = new HashMap<>(); - - /* - * filled in pages - */ - pPageIdNameMap = new HashMap<>(); - pNamePageIdMap = new HashMap<>(); - - cNamePageIdMap = new HashMap<>(); - rPageIdNameMap = new HashMap<>(); - - /* - * filled in categories - */ - disambiguations = new HashSet<>(); - } - - @SuppressWarnings("unchecked") - @Override - public void processCategoryLinksRow(CategorylinksParser clParser) - throws IOException { - String cl_to_text = clParser.getClTo(); - if (cl_to_text != null) { - KeyType cl_to_textHashcode = (KeyType) hashAlgorithm - .hashCode(cl_to_text); - // if category exists - - Integer cl_to = cNamePageIdMap.get(cl_to_textHashcode); - if (cl_to != null) { - // if the link source is a page then write the link in - // category_pages and page_categories - int cl_from = clParser.getClFrom(); - // if exists page - if (pPageIdNameMap.containsKey(cl_from)) { - processCategoryLinksRowPageExists(cl_from, cl_to, - cl_to_text); - } else { - processCateforyLinksRowPageMiss(cl_from, cl_to); - } - } - } - - } - - private void processCategoryLinksRowPageExists(Integer cl_from, - Integer cl_to, String cl_to_text) throws IOException { - - categoryPages.addRow(cl_to, cl_from); - pageCategories.addRow(cl_from, cl_to); - if (cl_to_text.equals(metaData.getDisambiguationCategory())) { - disambiguations.add(cl_from); - metaData.addDisamb(); - } - } - - private void processCateforyLinksRowPageMiss(Integer cl_from, Integer cl_to) - throws IOException { - // if category page id exists - if (cNamePageIdMap.containsValue(cl_from)) { - 
categoryOutlinks.addRow(cl_to, cl_from); - categoryInlinks.addRow(cl_from, cl_to); - } - } - - @SuppressWarnings("unchecked") - @Override - public void processPageLinksRow(PagelinksParser plParser) - throws IOException { - int pl_from = plParser.getPlFrom(); - String pl_to = plParser.getPlTo(); - if (pl_to != null) { - KeyType pl_toHashcode = (KeyType) hashAlgorithm.hashCode(pl_to); - - // if page name and page page id exists - Integer id = pNamePageIdMap.get(pl_toHashcode); - if (id != null - && (!skipPage || pPageIdNameMap.containsKey(pl_from))) { - pageOutlinks.addRow(pl_from, id); - pageInlinks.addRow(id, pl_from); - } - } - } - - @Override - public void processPageRow(PageParser pageParser) throws IOException { - switch (pageParser.getPageNamespace()) { - case NS_CATEGORY: { - processPageRowCategory(pageParser); - break; - } - case NS_MAIN: { - processPageRowPage(pageParser); - break; - } - } - - } - - @SuppressWarnings("unchecked") - private void processPageRowCategory(PageParser pageParser) - throws IOException { - if (!(skipCategory && pageParser.getPageIsRedirect())) { - // retrieve page id and page title - int page_id = pageParser.getPageId(); - // ignore categories, which have no revisions before the time stamp - if (pageIdRevList.contains(page_id)) { - String page_title = pageParser.getPageTitle(); - // cache the retrieved values - // record category - if (page_title != null) { - KeyType page_titleHashcode = (KeyType) hashAlgorithm - .hashCode(page_title); - cNamePageIdMap.put(page_titleHashcode, page_id); - // write a new row in the table Category. 
- // Note that we also consider the page_id as id - txtFW.addRow(page_id, page_id, page_title); - metaData.addCategory(); - } - } - } - } - - @SuppressWarnings("unchecked") - private void processPageRowPage(PageParser pageParser) throws IOException { - // retrieve page id and title - int page_id = pageParser.getPageId(); - // ignore pages, which have no revisions prior to the time stamp - String page_title = pageParser.getPageTitle(); - if (page_title != null && pageIdRevList.contains(page_id)) { - // distinguish redirects - if (pageParser.getPageIsRedirect()) { - // record redirect - rPageIdNameMap.put(page_id, page_title); - } else { - // record page - KeyType page_titleHashcode = (KeyType) hashAlgorithm - .hashCode(page_title); - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put(page_titleHashcode, page_id); - } - } - } - - @Override - public void processRevisionRow(RevisionParser revisionParser) { - // get the time stamp of the revision - int rev_timestamp = revisionParser.getRevTimestamp(); - if (rev_timestamp < timestamp) { - // get the rev_page (corresponds to page_id in the table page) - Integer rev_page = revisionParser.getRevPage(); - if (pageIdRevMap.containsKey(rev_page)) { - processRevisionRowContainsKey(revisionParser, rev_page, - rev_timestamp); - } else { - processRevisionRowMissKey(revisionParser, rev_page, - rev_timestamp); - } - } - - } - - private void processRevisionRowContainsKey(RevisionParser revisionParser, - int rev_page, int rev_timestamp) { - - long revisionRecord = pageIdRevMap.get(rev_page); - int old_timestamp = Revision.getTimestamp(revisionRecord); - - // is it a better time stamp ? 
- if (rev_timestamp > old_timestamp) { - int old_text_id = Revision.getTextId(revisionRecord); - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser - .getRevTextId(), rev_timestamp)); - textIdPageIdMap.remove(old_text_id); - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); - } - } - - private void processRevisionRowMissKey(RevisionParser revisionParser, - int rev_page, int rev_timestamp) { - // this is the first recorded time stamp for that page id - - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser - .getRevTextId(), rev_timestamp)); - - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); - } - - @Override - public void processTextRow(TextParser textParser) throws IOException { - int text_id = textParser.getOldId(); - - if (textIdPageIdMap.containsKey(text_id)) { - int page_id = textIdPageIdMap.get(text_id); - // if exists page page id -> page - if (pPageIdNameMap.containsKey(page_id)) { - processTextRowPage(textParser, page_id); - } else if (rPageIdNameMap.containsKey(page_id)) { - // if exists redirect -> redirect - processTextRowRedirect(textParser, page_id); - } - } - - } - - private void processTextRowPage(TextParser textParser, int page_id) - throws IOException { - // get page name - String pageName = pPageIdNameMap.get(page_id); - - page.addRow(page_id, page_id, pageName, textParser.getOldText(), - formatBoolean(disambiguations.contains(page_id))); - - pageMapLine.addRow(page_id, pageName, page_id, SQL_NULL, SQL_NULL); - metaData.addPage(); - } - - @SuppressWarnings("unchecked") - private void processTextRowRedirect(TextParser textParser, int page_id) - throws IOException { - String destination = Redirects.getRedirectDestination(textParser - .getOldText()); - - if (destination != null) { - // if page name exists - - KeyType destinationHashcode = (KeyType) hashAlgorithm - .hashCode(destination); - Integer id = pNamePageIdMap.get(destinationHashcode); - if (id != null) { - String redirectName = 
rPageIdNameMap.get(page_id); - pageRedirects.addRow(id, redirectName); - pageMapLine.addRow(page_id, redirectName, id, SQL_NULL, - SQL_NULL); - metaData.addRedirect(); - } - } - } - - @Override - public void writeMetaData() throws IOException { - TxtFileWriter outputFile = new TxtFileWriter(versionFiles - .getOutputMetadata()); - // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp - outputFile.addRow(metaData.getId(), metaData.getLanguage(), metaData - .getDisambiguationCategory(), metaData.getMainCategory(), - metaData.getNrOfPages(), metaData.getNrOfRedirects(), metaData - .getNrOfDisambiguations(), - metaData.getNrOfCategories(), TimestampUtil - .toMediaWikiString(metaData.getTimestamp())); - outputFile.flush(); - outputFile.close(); - } +public class DumpVersionJDKGeneric extends AbstractDumpVersion { + + private static final String SQL_NULL = "NULL"; + + /** + * maps page id's to Revision objects + */ + private HashMap pageIdRevMap; + /** + * after revision parsing the map will be erased and the keys sorted in the + * array list + */ + private Set pageIdRevList; + + /** + * caches the page id's of disambiguation pages. + */ + private Set disambiguations; + /** + * maps text id's to the page id's. + */ + private Map textIdPageIdMap; + /** + * maps page id's of pages to their names + */ + private Map pPageIdNameMap; + /** + * maps names of pages to their page id's. + */ + private Map pNamePageIdMap; + + /** + * maps names of categories to their page id's. + */ + private Map cNamePageIdMap; + + /** + * maps page id's of redirects to their names. 
+ */ + private Map rPageIdNameMap; + + private final IStringHashCode hashAlgorithm; + + @SuppressWarnings("unchecked") + public DumpVersionJDKGeneric(Class hashAlgorithmClass) + throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException { + + hashAlgorithm = hashAlgorithmClass.getDeclaredConstructor().newInstance(); + @SuppressWarnings("unused") + KeyType hashAlgorithmResult = (KeyType) hashAlgorithm.hashCode("test"); + } + + @Override + public void freeAfterCategoryLinksParsing() { + logger.log("clearing cNamePageIdMap of " + cNamePageIdMap.size() + " objects"); + cNamePageIdMap.clear(); + } + + @Override + public void freeAfterPageLinksParsing() { + // nothing to free + } + + @Override + public void freeAfterPageParsing() { + logger.log("clearing pageIdRevSet of " + pageIdRevList.size() + " objects"); + pageIdRevList.clear(); + } + + @Override + public void freeAfterRevisonParsing() { + pageIdRevList = new HashSet<>(pageIdRevMap.keySet().size()); + pageIdRevList.addAll(pageIdRevMap.keySet()); + pageIdRevMap.clear(); + } + + @Override + public void freeAfterTextParsing() { + pageIdRevMap.clear(); + pageIdRevList.clear(); + disambiguations.clear(); + textIdPageIdMap.clear(); + pPageIdNameMap.clear(); + pNamePageIdMap.clear(); + cNamePageIdMap.clear(); + rPageIdNameMap.clear(); + } + + @Override + public void initialize(Timestamp timestamp) { + this.timestamp = Revision.compressTime(timestamp.getTime()); + + /* + * filled in revisions + */ + pageIdRevMap = new HashMap<>(); + textIdPageIdMap = new HashMap<>(); + + /* + * filled in pages + */ + pPageIdNameMap = new HashMap<>(); + pNamePageIdMap = new HashMap<>(); + + cNamePageIdMap = new HashMap<>(); + rPageIdNameMap = new HashMap<>(); + + /* + * filled in categories + */ + disambiguations = new HashSet<>(); + } + + @SuppressWarnings("unchecked") + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) + throws IOException { + String cl_to_text = 
clParser.getClTo(); + if (cl_to_text != null) { + KeyType cl_to_textHashcode = (KeyType) hashAlgorithm.hashCode(cl_to_text); + // if category exists + + Integer cl_to = cNamePageIdMap.get(cl_to_textHashcode); + if (cl_to != null) { + // if the link source is a page then write the link in + // category_pages and page_categories + int cl_from = clParser.getClFrom(); + // if exists page + if (pPageIdNameMap.containsKey(cl_from)) { + processCategoryLinksRowPageExists(cl_from, cl_to, cl_to_text); + } else { + processCateforyLinksRowPageMiss(cl_from, cl_to); + } + } + } + } + + private void processCategoryLinksRowPageExists(Integer cl_from, Integer cl_to, String cl_to_text) throws IOException { + categoryPages.addRow(cl_to, cl_from); + pageCategories.addRow(cl_from, cl_to); + if (cl_to_text.equals(metaData.getDisambiguationCategory())) { + disambiguations.add(cl_from); + metaData.addDisamb(); + } + } + + private void processCateforyLinksRowPageMiss(Integer cl_from, Integer cl_to) throws IOException { + // if category page id exists + if (cNamePageIdMap.containsValue(cl_from)) { + categoryOutlinks.addRow(cl_to, cl_from); + categoryInlinks.addRow(cl_from, cl_to); + } + } + + @SuppressWarnings("unchecked") + @Override + public void processPageLinksRow(PagelinksParser plParser) throws IOException { + int pl_from = plParser.getPlFrom(); + String pl_to = plParser.getPlTo(); + if (pl_to != null) { + KeyType pl_toHashcode = (KeyType) hashAlgorithm.hashCode(pl_to); + + // if page name and page page id exists + Integer id = pNamePageIdMap.get(pl_toHashcode); + if (id != null + && (!skipPage || pPageIdNameMap.containsKey(pl_from))) { + pageOutlinks.addRow(pl_from, id); + pageInlinks.addRow(id, pl_from); + } + } + } + + @Override + public void processPageRow(PageParser pageParser) throws IOException { + switch (pageParser.getPageNamespace()) { + case NS_CATEGORY: { + processPageRowCategory(pageParser); + break; + } + case NS_MAIN: { + processPageRowPage(pageParser); + break; + } + } 
+ + } + + @SuppressWarnings("unchecked") + private void processPageRowCategory(PageParser pageParser) throws IOException { + if (!(skipCategory && pageParser.getPageIsRedirect())) { + // retrieve page id and page title + int page_id = pageParser.getPageId(); + // ignore categories, which have no revisions before the time stamp + if (pageIdRevList.contains(page_id)) { + String page_title = pageParser.getPageTitle(); + // cache the retrieved values + // record category + if (page_title != null) { + KeyType page_titleHashcode = (KeyType) hashAlgorithm + .hashCode(page_title); + cNamePageIdMap.put(page_titleHashcode, page_id); + // write a new row in the table Category. + // Note that we also consider the page_id as id + txtFW.addRow(page_id, page_id, page_title); + metaData.addCategory(); + } + } + } + } + + @SuppressWarnings("unchecked") + private void processPageRowPage(PageParser pageParser) throws IOException { + // retrieve page id and title + int page_id = pageParser.getPageId(); + // ignore pages, which have no revisions prior to the time stamp + String page_title = pageParser.getPageTitle(); + if (page_title != null && pageIdRevList.contains(page_id)) { + // distinguish redirects + if (pageParser.getPageIsRedirect()) { + // record redirect + rPageIdNameMap.put(page_id, page_title); + } else { + // record page + KeyType page_titleHashcode = (KeyType) hashAlgorithm + .hashCode(page_title); + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put(page_titleHashcode, page_id); + } + } + } + + @Override + public void processRevisionRow(RevisionParser revisionParser) { + // get the time stamp of the revision + int rev_timestamp = revisionParser.getRevTimestamp(); + if (rev_timestamp < timestamp) { + // get the rev_page (corresponds to page_id in the table page) + int rev_page = revisionParser.getRevPage(); + if (pageIdRevMap.containsKey(rev_page)) { + processRevisionRowContainsKey(revisionParser, rev_page, + rev_timestamp); + } else { + 
processRevisionRowMissKey(revisionParser, rev_page, + rev_timestamp); + } + } + + } + + private void processRevisionRowContainsKey(RevisionParser revisionParser, int rev_page, int rev_timestamp) { + + long revisionRecord = pageIdRevMap.get(rev_page); + int old_timestamp = Revision.getTimestamp(revisionRecord); + + // is it a better time stamp ? + if (rev_timestamp > old_timestamp) { + int old_text_id = Revision.getTextId(revisionRecord); + pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser + .getRevTextId(), rev_timestamp)); + textIdPageIdMap.remove(old_text_id); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } + } + + private void processRevisionRowMissKey(RevisionParser revisionParser, int rev_page, int rev_timestamp) { + // this is the first recorded time stamp for that page id + pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser.getRevTextId(), rev_timestamp)); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } + + @Override + public void processTextRow(TextParser textParser) throws IOException { + int text_id = textParser.getOldId(); + + if (textIdPageIdMap.containsKey(text_id)) { + int page_id = textIdPageIdMap.get(text_id); + // if exists page page id -> page + if (pPageIdNameMap.containsKey(page_id)) { + processTextRowPage(textParser, page_id); + } else if (rPageIdNameMap.containsKey(page_id)) { + // if exists redirect -> redirect + processTextRowRedirect(textParser, page_id); + } + } + + } + + private void processTextRowPage(TextParser textParser, int page_id) throws IOException { + // get page name + String pageName = pPageIdNameMap.get(page_id); + + page.addRow(page_id, page_id, pageName, textParser.getOldText(), formatBoolean(disambiguations.contains(page_id))); + pageMapLine.addRow(page_id, pageName, page_id, SQL_NULL, SQL_NULL); + metaData.addPage(); + } + + @SuppressWarnings("unchecked") + private void processTextRowRedirect(TextParser textParser, int page_id) throws IOException { + 
String destination = Redirects.getRedirectDestination(textParser.getOldText()); + + if (destination != null) { + // if page name exists + + KeyType destinationHashcode = (KeyType) hashAlgorithm.hashCode(destination); + Integer id = pNamePageIdMap.get(destinationHashcode); + if (id != null) { + String redirectName = rPageIdNameMap.get(page_id); + pageRedirects.addRow(id, redirectName); + pageMapLine.addRow(page_id, redirectName, id, SQL_NULL, SQL_NULL); + metaData.addRedirect(); + } + } + } + + @Override + public void writeMetaData() throws IOException { + TxtFileWriter outputFile = new TxtFileWriter(versionFiles.getOutputMetadata()); + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp + outputFile.addRow(metaData.getId(), metaData.getLanguage(), metaData.getDisambiguationCategory(), + metaData.getMainCategory(), metaData.getNrOfPages(), metaData.getNrOfRedirects(), + metaData.getNrOfDisambiguations(), metaData.getNrOfCategories(), + TimestampUtil.toMediaWikiString(metaData.getTimestamp())); + outputFile.flush(); + outputFile.close(); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKIntKeyFactory.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKIntKeyFactory.java index ab5c43a4..990b52a3 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKIntKeyFactory.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKIntKeyFactory.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,15 +23,14 @@ public class DumpVersionJDKIntKeyFactory implements IDumpVersionFactory { - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new DumpVersionJDKGeneric( - StringHashCodeJDK.class); - } catch (Exception e) { - dumpVersion = null; - } - return dumpVersion; - } + @Override + public IDumpVersion getDumpVersion() { + IDumpVersion dumpVersion; + try { + dumpVersion = new DumpVersionJDKGeneric(StringHashCodeJDK.class); + } catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKLongKeyFactory.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKLongKeyFactory.java index b576cb24..cd636aa7 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKLongKeyFactory.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKLongKeyFactory.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,16 +23,15 @@ public class DumpVersionJDKLongKeyFactory implements IDumpVersionFactory { - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new DumpVersionJDKGeneric( - StringHashCodeJBoss.class); - } catch (Exception e) { - dumpVersion = null; - } - return dumpVersion; - } + @Override + public IDumpVersion getDumpVersion() { + IDumpVersion dumpVersion; + try { + dumpVersion = new DumpVersionJDKGeneric(StringHashCodeJBoss.class); + } catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKStringKeyFactory.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKStringKeyFactory.java index c75084ad..ac8ac04d 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKStringKeyFactory.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKStringKeyFactory.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,16 +23,15 @@ public class DumpVersionJDKStringKeyFactory implements IDumpVersionFactory { - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new DumpVersionJDKGeneric( - StringHashCodeDisabled.class); - } catch (Exception e) { - dumpVersion = null; - } - return dumpVersion; - } + @Override + public IDumpVersion getDumpVersion() { + IDumpVersion dumpVersion; + try { + dumpVersion = new DumpVersionJDKGeneric(StringHashCodeDisabled.class); + } catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/OriginalDumpVersion.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/OriginalDumpVersion.java index ecdb2b54..4d1c34ac 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/OriginalDumpVersion.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/OriginalDumpVersion.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -41,480 +41,469 @@ /** * This class holds the data for a specific dump version. - * - * */ - public class OriginalDumpVersion implements IDumpVersion { - private Timestamp timestamp; - private MetaData metaData; - - // XXX ivan.galkin - @SuppressWarnings("unused") - private String outputPath; - // XXX ivan.galkin - // private Map pageIdRevMap; // maps page id's to - // Revision - // objects - private final Set disambiguations; // caches the page id's of - // disambiguation pages. - private final Map textIdPageIdMap;// maps text id's to the page - // id's. - private final Map pPageIdNameMap;// maps page id's of pages to - // their names - private final Map cPageIdNameMap;// maps page id's of categories - // to their names - private final Map pNamePageIdMap;// maps names of pages to their - // page id's. - private final Map cNamePageIdMap;// maps names of categories to - // their page id's. - private final Map rPageIdNameMap;// maps page id's of redirects - - // to their names. - - // XXX ivan.galkin - private Files versionFiles; - private final Map pageIdRevMap; - private boolean skipCategory = true; - private boolean skipPage = true; - - /** - * Creates a new DumpVersion that corresponds to the specified time stamp. 
- * - * @param timestamp - */ - public OriginalDumpVersion(Timestamp timestamp) { - // XXX ivan.galkin - // this.timestamp = timestamp; - // pageIdRevMap = new HashMap(); - pageIdRevMap = new HashMap<>(); - disambiguations = new HashSet<>(); - textIdPageIdMap = new HashMap<>(); - pPageIdNameMap = new HashMap<>(); - cPageIdNameMap = new HashMap<>(); - pNamePageIdMap = new HashMap<>(); - cNamePageIdMap = new HashMap<>(); - rPageIdNameMap = new HashMap<>(); - - } - - public void setMetaData(MetaData metaData) { - this.metaData = metaData; - } - - public void setOutputPath(String outputPath) throws IOException { - this.outputPath = outputPath; - File directory = new File(outputPath); - directory.mkdir(); - } - - public void processRevisionRow(RevisionParser revisionParser) { - int rev_page; - Timestamp rev_timestamp; - Timestamp old_timestamp; - int old_text_id; - // get the rev_page (corresponds to page_id in the table page) - rev_page = revisionParser.getRevPage(); - // get the timestamp of the revision - - // XXX ivan.galkin - rev_timestamp = new Timestamp(Revision.extractTime(revisionParser - .getRevTimestamp())); - - if (rev_timestamp.before(timestamp)) { - - if (pageIdRevMap.containsKey(rev_page)) { - // XXX ivan.galkin go back to the time stamp classes - old_timestamp = new Timestamp(Revision.extractTime(Revision - .getTimestamp(pageIdRevMap.get(rev_page)))); - old_text_id = Revision.getTextId(pageIdRevMap.get(rev_page)); - // is it a better time stamp ? 
- if (rev_timestamp.after(old_timestamp)) { - pageIdRevMap.remove(rev_page); - pageIdRevMap.put(rev_page, Revision.createRevision( - revisionParser.getRevTextId(), Revision - .compressTime(rev_timestamp.getTime()))); - textIdPageIdMap.remove(old_text_id); - textIdPageIdMap - .put(revisionParser.getRevTextId(), rev_page); - } - } else { - // this is the first recoreded time stamp for that page id - pageIdRevMap.put(rev_page, Revision.createRevision( - revisionParser.getRevTextId(), Revision - .compressTime(rev_timestamp.getTime()))); - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); - } - - } - } - - TxtFileWriter txtFW = null; - - public void initPageParsing() throws IOException { - // XXX ivan.galkin - // txtFW = new TxtFileWriter(outputPath + "/Category.txt"); - txtFW = new TxtFileWriter(versionFiles.getOutputCategory()); - } - - public void processPageRow(PageParser pageParser) throws IOException { - - int page_id; - int page_namespace; - String page_title; - page_namespace = pageParser.getPageNamespace(); - // handle categories - if (page_namespace == 14) { - if (skipCategory && pageParser.getPageIsRedirect()) - // skip categories that are redirects - return; - // retrieve page id and page title - page_id = pageParser.getPageId(); - // ignore categories, which have no revisions before the timestamp - if (!pageIdRevMap.containsKey(page_id)) - return; - - page_title = pageParser.getPageTitle(); - - // cache the retrieved values - recordCategory(page_id, page_title); - // write a new row in the table Category. 
- // Note that we also consider the page_id as id - txtFW.addRow(page_id, page_id, page_title); - metaData.addCategory(); - return; - } - // handle pages - if (page_namespace == 0) { - // retrieve page id and title - page_id = pageParser.getPageId(); - page_title = pageParser.getPageTitle(); - // ignore pages, which habe no revisions prior to the timestamp - if (!pageIdRevMap.containsKey(page_id)) - return; - // distinguish redirects - if (pageParser.getPageIsRedirect()) { - recordRedirect(page_id, page_title); - } else { - recordPage(page_id, page_title); - } - } - - } - - public void exportAfterPageParsing() throws IOException { - txtFW.export(); - } - - TxtFileWriter pageCategories = null; - TxtFileWriter categoryPages = null; - TxtFileWriter categoryInlinks = null; - TxtFileWriter categoryOutlinks = null; - - public void initCategoryLinksParsing() throws IOException { - // XXX ivan.galkin - // pageCategories = new TxtFileWriter(outputPath + File.separator - // + "page_categories.txt"); - // categoryPages = new TxtFileWriter(outputPath + File.separator - // + "category_pages.txt"); - // categoryInlinks = new TxtFileWriter(outputPath + File.separator - // + "category_inlinks.txt"); - // categoryOutlinks = new TxtFileWriter(outputPath + File.separator - // + "category_outlinks.txt"); - - pageCategories = new TxtFileWriter(versionFiles - .getOutputPageCategories()); - categoryPages = new TxtFileWriter(versionFiles.getOutputCategoryPages()); - categoryInlinks = new TxtFileWriter(versionFiles - .getOutputCategoryInlinks()); - categoryOutlinks = new TxtFileWriter(versionFiles - .getOutputCategoryOutlinks()); - - } - - public void processCategoryLinksRow(CategorylinksParser clParser) - throws IOException { - int cl_from; - String cl_to; - - cl_from = clParser.getClFrom(); - cl_to = clParser.getClTo(); - if (!existsCategory(cl_to)) {// discard links with non registred targets - return; - } - // if the link source is a page then write the link in category_pages - // and 
- // page_categories - if (existsPage(cl_from)) { - - categoryPages.addRow(getCategoryPageId(cl_to), cl_from); - pageCategories.addRow(cl_from, getCategoryPageId(cl_to)); - if (cl_to.equals(metaData.getDisambiguationCategory())) { - disambiguations.add(cl_from); - metaData.addDisamb(); - } - } else { - // if the link source is a category than write the link in - // category_inlinks and category_outlinks - if (existsCategoryPageId(cl_from)) { - categoryOutlinks.addRow(getCategoryPageId(cl_to), cl_from); - categoryInlinks.addRow(cl_from, getCategoryPageId(cl_to)); - } - } - } - - public void exportAfterCategoryLinksParsing() throws IOException { - // Export the written tables - pageCategories.export(); - categoryPages.export(); - categoryInlinks.export(); - categoryOutlinks.export(); - } - - TxtFileWriter pageInlinks = null; - TxtFileWriter pageOutlinks = null; - - public void initPageLinksParsing() throws IOException { - // XXX ivan.galkin - // pageInlinks = new TxtFileWriter(outputPath + File.separator - // + "page_inlinks.txt"); - // pageOutlinks = new TxtFileWriter(outputPath + File.separator - // + "page_outlinks.txt"); - pageInlinks = new TxtFileWriter(versionFiles.getOutputPageInlinks()); - pageOutlinks = new TxtFileWriter(versionFiles.getOutputPageOutlinks()); - } - - public void processPageLinksRow(PagelinksParser plParser) - throws IOException { - int pl_from; - String pl_to; - pl_from = plParser.getPlFrom(); - pl_to = plParser.getPlTo(); - // skip redirects or page with other namespace than 0 - if (skipPage && !existsPagePageId(pl_from) || !existsPageName(pl_to)) { - return; - } - pageOutlinks.addRow(pl_from, getPagePageId(pl_to)); - pageInlinks.addRow(getPagePageId(pl_to), pl_from); - } - - public void exportAfterPageLinksProcessing() throws IOException { - // export the written tables - pageInlinks.export(); - pageOutlinks.export(); - } - - TxtFileWriter page = null; - TxtFileWriter pageMapLine = null; - TxtFileWriter pageRedirects = null; - - public 
void initTextParsing() throws IOException { - // XXX ivan.galkin - // page = new TxtFileWriter(outputPath + File.separator + "Page.txt"); - // pageMapLine = new TxtFileWriter(outputPath + File.separator - // + "PageMapLine.txt"); - // pageRedirects = new TxtFileWriter(outputPath + File.separator - // + "page_redirects.txt"); - page = new TxtFileWriter(versionFiles.getOutputPage()); - pageMapLine = new TxtFileWriter(versionFiles.getOutputPageMapLine()); - pageRedirects = new TxtFileWriter(versionFiles.getOutputPageRedirects()); - } - - public void processTextRow(TextParser textParser) throws IOException { - String destination; - int text_id; - int page_id; - text_id = textParser.getOldId(); - if (!textIdPageIdMap.containsKey(text_id)) - return; - page_id = textIdPageIdMap.get(text_id); - if (existsPagePageId(page_id)) {// pages - page.addRow(page_id, page_id, getPageName(page_id), textParser - .getOldText(), formatBoolean(disambiguations - .contains(page_id))); - pageMapLine.addRow(page_id, getPageName(page_id), page_id, "NULL", - "NULL"); - metaData.addPage(); - return; - } - if (existsRedirect(page_id)) {// Redirects - destination = Redirects.getRedirectDestination(textParser - .getOldText()); - if (!existsPageName(destination)) - return; - pageRedirects.addRow(getPagePageId(destination), - getRedirectName(page_id)); - pageMapLine.addRow(page_id, getRedirectName(page_id), - getPagePageId(destination), "NULL", "NULL"); - metaData.addRedirect(); - } - } - - public void exportAfterTextParsing() throws IOException { - // export the written tables - page.export(); - pageRedirects.export(); - pageMapLine.export(); - } - - public void writeMetaData() throws IOException { - // XXX ivan.galkin - // TxtFileWriter metaData_ = new TxtFileWriter(outputPath + - // File.separator - // + "MetaData.txt"); - TxtFileWriter metaData_ = new TxtFileWriter(versionFiles - .getOutputMetadata()); - // 
ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp - metaData_.addRow(metaData.getId(), metaData.getLanguage(), metaData - .getDisambiguationCategory(), metaData.getMainCategory(), - metaData.getNrOfPages(), metaData.getNrOfRedirects(), metaData - .getNrOfDisambiguations(), - metaData.getNrOfCategories(), TimestampUtil - .toMediaWikiString(metaData.getTimestamp())); - System.out.println("-------------------------------"); - System.out.println("Timestamp : " + timestamp.toString()); - System.out.println("nrOfCategories : " - + metaData.getNrOfCategories()); - System.out.println("nrOfPages : " + metaData.getNrOfPages()); - System.out.println("nrOfRedirects : " - + metaData.getNrOfRedirects()); - System.out.println("nrOfDisambiguations: " - + metaData.getNrOfDisambiguations()); - metaData_.export(); - } - - /** - * Returns the String value of the bit 1 if the given boolean is true
- * and an empty String otherwise. This the way bit values are written
- * in .txt dump files. - * - * @param b - * @return - */ - private String formatBoolean(boolean b) { - return b ? new String(new byte[] { 1 }) : ""; - } - - public void recordCategory(int page_id, String page_title) { - cPageIdNameMap.put(page_id, page_title); - cNamePageIdMap.put(page_title, page_id); - } - - public void recordPage(int page_id, String page_title) { - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put(page_title, page_id); - } - - public void recordRedirect(int page_id, String page_title) { - rPageIdNameMap.put(page_id, page_title); - } - - public boolean existsCategory(String name) { - return cNamePageIdMap.containsKey(name); - } - - public boolean existsPageName(String name) { - return pNamePageIdMap.containsKey(name); - } - - public boolean existsPage(int page_id) { - return pPageIdNameMap.containsKey(page_id); - } - - public boolean existsCategoryPageId(int page_id) { - return cPageIdNameMap.containsKey(page_id); - } - - public boolean existsPagePageId(int page_id) { - return pPageIdNameMap.containsKey(page_id); - } - - public int getPagePageId(String name) { - return pNamePageIdMap.get(name); - } - - public int getCategoryPageId(String name) { - return cNamePageIdMap.get(name); - } - - public String getPageName(int page_id) { - return pPageIdNameMap.get(page_id); - } - - public boolean existsRedirect(int page_id) { - return rPageIdNameMap.containsKey(page_id); - } - - public String getRedirectName(int page_id) { - return rPageIdNameMap.get(page_id); - } - - /* - * implemented methods from IDumpVersion interface - */ - - @Override - public void initialize(Timestamp timestamp) { - this.timestamp = timestamp; - } - - @Override - public void setFiles(Files versionFiles) { - this.versionFiles = versionFiles; - } - - /* - * not implemented methods - */ - - @Override - public void exportAfterPageLinksParsing() throws IOException { - } - - @Override - public void exportAfterRevisionParsing() throws IOException { - } - - @Override - public 
void flushByTextParsing() throws IOException { - } - - @Override - public void freeAfterCategoryLinksParsing() { - } - - @Override - public void freeAfterPageLinksParsing() { - } - - @Override - public void freeAfterPageParsing() { - } - - @Override - public void freeAfterRevisonParsing() { - } - - @Override - public void freeAfterTextParsing() { - } - - @Override - public void initRevisionParsion() { - } - - @Override - public void setLogger(ILogger logger) { - } - - @Override - public void setCategoryRedirectsSkip(boolean skipCategory) { - this.skipCategory = skipCategory; - } - - @Override - public void setPageRedirectsSkip(boolean skipPage) { - this.skipPage = skipPage; - } + private Timestamp timestamp; + private MetaData metaData; + + // XXX ivan.galkin + @SuppressWarnings("unused") + private String outputPath; + // XXX ivan.galkin + // private Map pageIdRevMap; // maps page id's to + // Revision + // objects + private final Set disambiguations; // caches the page id's of + // disambiguation pages. + private final Map textIdPageIdMap;// maps text id's to the page + // id's. + private final Map pPageIdNameMap;// maps page id's of pages to + // their names + private final Map cPageIdNameMap;// maps page id's of categories + // to their names + private final Map pNamePageIdMap;// maps names of pages to their + // page id's. + private final Map cNamePageIdMap;// maps names of categories to + // their page id's. + private final Map rPageIdNameMap;// maps page id's of redirects + + // to their names. + + // XXX ivan.galkin + private Files versionFiles; + private final Map pageIdRevMap; + private boolean skipCategory = true; + private boolean skipPage = true; + + /** + * Creates a new DumpVersion that corresponds to the specified time stamp. 
+ * + * @param timestamp + */ + public OriginalDumpVersion(Timestamp timestamp) { + // XXX ivan.galkin + // this.timestamp = timestamp; + // pageIdRevMap = new HashMap(); + pageIdRevMap = new HashMap<>(); + disambiguations = new HashSet<>(); + textIdPageIdMap = new HashMap<>(); + pPageIdNameMap = new HashMap<>(); + cPageIdNameMap = new HashMap<>(); + pNamePageIdMap = new HashMap<>(); + cNamePageIdMap = new HashMap<>(); + rPageIdNameMap = new HashMap<>(); + + } + + @Override + public void setMetaData(MetaData metaData) { + this.metaData = metaData; + } + + public void setOutputPath(String outputPath) throws IOException { + this.outputPath = outputPath; + File directory = new File(outputPath); + directory.mkdir(); + } + + @Override + public void processRevisionRow(RevisionParser revisionParser) { + int rev_page; + Timestamp rev_timestamp; + Timestamp old_timestamp; + int old_text_id; + // get the rev_page (corresponds to page_id in the table page) + rev_page = revisionParser.getRevPage(); + // get the timestamp of the revision + + // XXX ivan.galkin + rev_timestamp = new Timestamp(Revision.extractTime(revisionParser.getRevTimestamp())); + + if (rev_timestamp.before(timestamp)) { + + if (pageIdRevMap.containsKey(rev_page)) { + // XXX ivan.galkin go back to the time stamp classes + old_timestamp = new Timestamp(Revision.extractTime(Revision.getTimestamp(pageIdRevMap.get(rev_page)))); + old_text_id = Revision.getTextId(pageIdRevMap.get(rev_page)); + // is it a better time stamp ? 
+ if (rev_timestamp.after(old_timestamp)) { + pageIdRevMap.remove(rev_page); + pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser.getRevTextId(), Revision + .compressTime(rev_timestamp.getTime()))); + textIdPageIdMap.remove(old_text_id); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } + } else { + // this is the first recorded time stamp for that page id + pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser.getRevTextId(), Revision + .compressTime(rev_timestamp.getTime()))); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } + + } + } + + TxtFileWriter txtFW = null; + + @Override + public void initPageParsing() throws IOException { + // XXX ivan.galkin + // txtFW = new TxtFileWriter(outputPath + "/Category.txt"); + txtFW = new TxtFileWriter(versionFiles.getOutputCategory()); + } + + @Override + public void processPageRow(PageParser pageParser) throws IOException { + + int page_id; + int page_namespace; + String page_title; + page_namespace = pageParser.getPageNamespace(); + // handle categories + if (page_namespace == 14) { + if (skipCategory && pageParser.getPageIsRedirect()) + // skip categories that are redirects + return; + // retrieve page id and page title + page_id = pageParser.getPageId(); + // ignore categories, which have no revisions before the timestamp + if (!pageIdRevMap.containsKey(page_id)) + return; + + page_title = pageParser.getPageTitle(); + + // cache the retrieved values + recordCategory(page_id, page_title); + // write a new row in the table Category. 
+ // Note that we also consider the page_id as id + txtFW.addRow(page_id, page_id, page_title); + metaData.addCategory(); + return; + } + // handle pages + if (page_namespace == 0) { + // retrieve page id and title + page_id = pageParser.getPageId(); + page_title = pageParser.getPageTitle(); + // ignore pages, which habe no revisions prior to the timestamp + if (!pageIdRevMap.containsKey(page_id)) + return; + // distinguish redirects + if (pageParser.getPageIsRedirect()) { + recordRedirect(page_id, page_title); + } else { + recordPage(page_id, page_title); + } + } + + } + + @Override + public void exportAfterPageParsing() throws IOException { + txtFW.export(); + } + + private TxtFileWriter pageCategories = null; + private TxtFileWriter categoryPages = null; + private TxtFileWriter categoryInlinks = null; + private TxtFileWriter categoryOutlinks = null; + + @Override + public void initCategoryLinksParsing() throws IOException { + // XXX ivan.galkin + // pageCategories = new TxtFileWriter(outputPath + File.separator + // + "page_categories.txt"); + // categoryPages = new TxtFileWriter(outputPath + File.separator + // + "category_pages.txt"); + // categoryInlinks = new TxtFileWriter(outputPath + File.separator + // + "category_inlinks.txt"); + // categoryOutlinks = new TxtFileWriter(outputPath + File.separator + // + "category_outlinks.txt"); + + pageCategories = new TxtFileWriter(versionFiles.getOutputPageCategories()); + categoryPages = new TxtFileWriter(versionFiles.getOutputCategoryPages()); + categoryInlinks = new TxtFileWriter(versionFiles.getOutputCategoryInlinks()); + categoryOutlinks = new TxtFileWriter(versionFiles.getOutputCategoryOutlinks()); + + } + + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException { + int cl_from; + String cl_to; + + cl_from = clParser.getClFrom(); + cl_to = clParser.getClTo(); + if (!existsCategory(cl_to)) {// discard links with non registred targets + return; + } + // if the link source 
is a page then write the link in category_pages + // and + // page_categories + if (existsPage(cl_from)) { + + categoryPages.addRow(getCategoryPageId(cl_to), cl_from); + pageCategories.addRow(cl_from, getCategoryPageId(cl_to)); + if (cl_to.equals(metaData.getDisambiguationCategory())) { + disambiguations.add(cl_from); + metaData.addDisamb(); + } + } else { + // if the link source is a category than write the link in + // category_inlinks and category_outlinks + if (existsCategoryPageId(cl_from)) { + categoryOutlinks.addRow(getCategoryPageId(cl_to), cl_from); + categoryInlinks.addRow(cl_from, getCategoryPageId(cl_to)); + } + } + } + + @Override + public void exportAfterCategoryLinksParsing() throws IOException { + // Export the written tables + pageCategories.export(); + categoryPages.export(); + categoryInlinks.export(); + categoryOutlinks.export(); + } + + private TxtFileWriter pageInlinks = null; + private TxtFileWriter pageOutlinks = null; + + @Override + public void initPageLinksParsing() throws IOException { + // XXX ivan.galkin + // pageInlinks = new TxtFileWriter(outputPath + File.separator + // + "page_inlinks.txt"); + // pageOutlinks = new TxtFileWriter(outputPath + File.separator + // + "page_outlinks.txt"); + pageInlinks = new TxtFileWriter(versionFiles.getOutputPageInlinks()); + pageOutlinks = new TxtFileWriter(versionFiles.getOutputPageOutlinks()); + } + + @Override + public void processPageLinksRow(PagelinksParser plParser) throws IOException { + int pl_from; + String pl_to; + pl_from = plParser.getPlFrom(); + pl_to = plParser.getPlTo(); + // skip redirects or page with other namespace than 0 + if (skipPage && !existsPagePageId(pl_from) || !existsPageName(pl_to)) { + return; + } + pageOutlinks.addRow(pl_from, getPagePageId(pl_to)); + pageInlinks.addRow(getPagePageId(pl_to), pl_from); + } + + public void exportAfterPageLinksProcessing() throws IOException { + // export the written tables + pageInlinks.export(); + pageOutlinks.export(); + } + + private 
TxtFileWriter page = null; + private TxtFileWriter pageMapLine = null; + private TxtFileWriter pageRedirects = null; + + @Override + public void initTextParsing() throws IOException { + // XXX ivan.galkin + // page = new TxtFileWriter(outputPath + File.separator + "Page.txt"); + // pageMapLine = new TxtFileWriter(outputPath + File.separator + // + "PageMapLine.txt"); + // pageRedirects = new TxtFileWriter(outputPath + File.separator + // + "page_redirects.txt"); + page = new TxtFileWriter(versionFiles.getOutputPage()); + pageMapLine = new TxtFileWriter(versionFiles.getOutputPageMapLine()); + pageRedirects = new TxtFileWriter(versionFiles.getOutputPageRedirects()); + } + + @Override + public void processTextRow(TextParser textParser) throws IOException { + String destination; + int text_id; + int page_id; + text_id = textParser.getOldId(); + if (!textIdPageIdMap.containsKey(text_id)) + return; + page_id = textIdPageIdMap.get(text_id); + if (existsPagePageId(page_id)) {// pages + page.addRow(page_id, page_id, getPageName(page_id), textParser.getOldText(), + formatBoolean(disambiguations.contains(page_id))); + pageMapLine.addRow(page_id, getPageName(page_id), page_id, "NULL", "NULL"); + metaData.addPage(); + return; + } + if (existsRedirect(page_id)) {// Redirects + destination = Redirects.getRedirectDestination(textParser.getOldText()); + if (!existsPageName(destination)) + return; + pageRedirects.addRow(getPagePageId(destination), getRedirectName(page_id)); + pageMapLine.addRow(page_id, getRedirectName(page_id), getPagePageId(destination), "NULL", "NULL"); + metaData.addRedirect(); + } + } + + @Override + public void exportAfterTextParsing() throws IOException { + // export the written tables + page.export(); + pageRedirects.export(); + pageMapLine.export(); + } + + @Override + public void writeMetaData() throws IOException { + // XXX ivan.galkin + // TxtFileWriter metaData_ = new TxtFileWriter(outputPath + File.separator + "MetaData.txt"); + try (TxtFileWriter 
metaData_ = new TxtFileWriter(versionFiles.getOutputMetadata())) { + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp + metaData_.addRow(metaData.getId(), metaData.getLanguage(), metaData.getDisambiguationCategory(), + metaData.getMainCategory(), metaData.getNrOfPages(), metaData.getNrOfRedirects(), + metaData.getNrOfDisambiguations(), metaData.getNrOfCategories(), + TimestampUtil.toMediaWikiString(metaData.getTimestamp())); + System.out.println("-------------------------------"); + System.out.println("Timestamp : " + timestamp.toString()); + System.out.println("nrOfCategories : " + metaData.getNrOfCategories()); + System.out.println("nrOfPages : " + metaData.getNrOfPages()); + System.out.println("nrOfRedirects : " + metaData.getNrOfRedirects()); + System.out.println("nrOfDisambiguations: " + metaData.getNrOfDisambiguations()); + metaData_.export(); + } + } + + /** + * Returns the String value of the bit 1 if the given boolean is true
+ * and an empty String otherwise. This the way bit values are written
+ * in .txt dump files. + * + * @param b + * @return + */ + private String formatBoolean(boolean b) { + return b ? new String(new byte[]{1}) : ""; + } + + public void recordCategory(int page_id, String page_title) { + cPageIdNameMap.put(page_id, page_title); + cNamePageIdMap.put(page_title, page_id); + } + + public void recordPage(int page_id, String page_title) { + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put(page_title, page_id); + } + + public void recordRedirect(int page_id, String page_title) { + rPageIdNameMap.put(page_id, page_title); + } + + public boolean existsCategory(String name) { + return cNamePageIdMap.containsKey(name); + } + + public boolean existsPageName(String name) { + return pNamePageIdMap.containsKey(name); + } + + public boolean existsPage(int page_id) { + return pPageIdNameMap.containsKey(page_id); + } + + public boolean existsCategoryPageId(int page_id) { + return cPageIdNameMap.containsKey(page_id); + } + + public boolean existsPagePageId(int page_id) { + return pPageIdNameMap.containsKey(page_id); + } + + public int getPagePageId(String name) { + return pNamePageIdMap.get(name); + } + + public int getCategoryPageId(String name) { + return cNamePageIdMap.get(name); + } + + public String getPageName(int page_id) { + return pPageIdNameMap.get(page_id); + } + + public boolean existsRedirect(int page_id) { + return rPageIdNameMap.containsKey(page_id); + } + + public String getRedirectName(int page_id) { + return rPageIdNameMap.get(page_id); + } + + /* + * implemented methods from IDumpVersion interface + */ + + @Override + public void initialize(Timestamp timestamp) { + this.timestamp = timestamp; + } + + @Override + public void setFiles(Files versionFiles) { + this.versionFiles = versionFiles; + } + + /* + * not implemented methods + */ + + @Override + public void exportAfterPageLinksParsing() throws IOException { + } + + @Override + public void exportAfterRevisionParsing() throws IOException { + } + + @Override + public 
void flushByTextParsing() throws IOException { + } + + @Override + public void freeAfterCategoryLinksParsing() { + } + + @Override + public void freeAfterPageLinksParsing() { + } + + @Override + public void freeAfterPageParsing() { + } + + @Override + public void freeAfterRevisonParsing() { + } + + @Override + public void freeAfterTextParsing() { + } + + @Override + public void initRevisionParsion() { + } + + @Override + public void setLogger(ILogger logger) { + } + + @Override + public void setCategoryRedirectsSkip(boolean skipCategory) { + this.skipCategory = skipCategory; + } + + @Override + public void setPageRedirectsSkip(boolean skipPage) { + this.skipPage = skipPage; + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageReader.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageReader.java index ad6ccc29..eaf15139 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageReader.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageReader.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,36 +25,34 @@ /** * This class is a specified variant of XmlDumpReader. 
Please see its source for more * information about a functionality and a license.
- * - * */ public class PageReader extends AbstractXmlDumpReader { - public PageReader(InputStream inputStream, DumpWriter writer) { - super(inputStream, writer); - } + public PageReader(InputStream inputStream, DumpWriter writer) { + super(inputStream, writer); + } - @Override - protected void setupStartElements() { - startElements.put(REVISION, REVISION); - startElements.put(CONTRIBUTOR, CONTRIBUTOR); - startElements.put(PAGE, PAGE); - startElements.put(SITEINFO, SITEINFO); - startElements.put(NAMESPACES, NAMESPACES); - startElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupStartElements() { + startElements.put(REVISION, REVISION); + startElements.put(CONTRIBUTOR, CONTRIBUTOR); + startElements.put(PAGE, PAGE); + startElements.put(SITEINFO, SITEINFO); + startElements.put(NAMESPACES, NAMESPACES); + startElements.put(NAMESPACE, NAMESPACE); + } - @Override - protected void setupEndElements() { - endElements.put(REVISION, REVISION); - endElements.put(TIMESTAMP, TIMESTAMP); - endElements.put(TEXT, TEXT); - endElements.put(CONTRIBUTOR, CONTRIBUTOR); - endElements.put(ID, ID); - endElements.put(PAGE, PAGE); - endElements.put(TITLE, TITLE); - endElements.put(SITEINFO, SITEINFO); - endElements.put(NAMESPACES, NAMESPACES); - endElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupEndElements() { + endElements.put(REVISION, REVISION); + endElements.put(TIMESTAMP, TIMESTAMP); + endElements.put(TEXT, TEXT); + endElements.put(CONTRIBUTOR, CONTRIBUTOR); + endElements.put(ID, ID); + endElements.put(PAGE, PAGE); + endElements.put(TITLE, TITLE); + endElements.put(SITEINFO, SITEINFO); + endElements.put(NAMESPACES, NAMESPACES); + endElements.put(NAMESPACE, NAMESPACE); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageWriter.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageWriter.java index 5ab3a2ff..8c10d4aa 100755 --- 
a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageWriter.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,62 +30,62 @@ public class PageWriter implements DumpWriter { - private Page currentPage; - private Revision lastRevision; - private final UTFDataOutputStream stream; + private Page currentPage; + private Revision lastRevision; + private final UTFDataOutputStream stream; - public PageWriter(OutputStream output) throws IOException { - this.stream = new UTFDataOutputStream(output); - } + public PageWriter(OutputStream output) throws IOException { + this.stream = new UTFDataOutputStream(output); + } - @Override - public void close() throws IOException { - stream.close(); - } + @Override + public void close() throws IOException { + stream.close(); + } - @Override - public void writeEndPage() throws IOException { - if (lastRevision != null) { - updatePage(currentPage, lastRevision); - } - currentPage = null; - lastRevision = null; + @Override + public void writeEndPage() throws IOException { + if (lastRevision != null) { + updatePage(currentPage, lastRevision); + } + currentPage = null; + lastRevision = null; - } + } - @Override 
- public void writeEndWiki() throws IOException { - stream.flush(); - } + @Override + public void writeEndWiki() throws IOException { + stream.flush(); + } - @Override - public void writeRevision(Revision revision) throws IOException { + @Override + public void writeRevision(Revision revision) throws IOException { - lastRevision = revision; + lastRevision = revision; - } + } - @Override - public void writeSiteinfo(Siteinfo info) throws IOException { + @Override + public void writeSiteinfo(Siteinfo info) throws IOException { - } + } - @Override - public void writeStartPage(Page page) throws IOException { - currentPage = page; - lastRevision = null; - } + @Override + public void writeStartPage(Page page) throws IOException { + currentPage = page; + lastRevision = null; + } - @Override - public void writeStartWiki() throws IOException { - } + @Override + public void writeStartWiki() throws IOException { + } - private void updatePage(Page page, Revision revision) throws IOException { - stream.writeInt(page.Id); - stream.writeInt(page.Title.Namespace); - String wellformedTitle = SQLEscape.titleFormat(page.Title.Text); - stream.writeUTFAsArray(SQLEscape.escape(wellformedTitle)); - // stream.writeBoolean(revision.isRedirect()); - stream.writeBoolean(Redirects.isRedirect(revision.Text)); - } + private void updatePage(Page page, Revision revision) throws IOException { + stream.writeInt(page.Id); + stream.writeInt(page.Title.Namespace); + String wellformedTitle = SQLEscape.titleFormat(page.Title.Text); + stream.writeUTFAsArray(SQLEscape.escape(wellformedTitle)); + // stream.writeBoolean(revision.isRedirect()); + stream.writeBoolean(Redirects.isRedirect(revision.Text)); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionReader.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionReader.java index 9d7044f4..53f39e51 100644 --- 
a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionReader.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionReader.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,36 +25,34 @@ /** * This class is a specified variant of XmlDumpReader. Please see its source for more * information about a functionality and a license.
- * - * */ public class RevisionReader extends AbstractXmlDumpReader { - public RevisionReader(InputStream inputStream, DumpWriter writer) { - super(inputStream, writer); - } + public RevisionReader(InputStream inputStream, DumpWriter writer) { + super(inputStream, writer); + } - @Override - protected void setupStartElements() { - startElements.put(REVISION, REVISION); - startElements.put(CONTRIBUTOR, CONTRIBUTOR); - startElements.put(PAGE, PAGE); - startElements.put(SITEINFO, SITEINFO); - startElements.put(NAMESPACES, NAMESPACES); - startElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupStartElements() { + startElements.put(REVISION, REVISION); + startElements.put(CONTRIBUTOR, CONTRIBUTOR); + startElements.put(PAGE, PAGE); + startElements.put(SITEINFO, SITEINFO); + startElements.put(NAMESPACES, NAMESPACES); + startElements.put(NAMESPACE, NAMESPACE); + } - @Override - protected void setupEndElements() { - endElements.put(REVISION, REVISION); - endElements.put(TIMESTAMP, TIMESTAMP); - endElements.put(TEXT, TEXT); - endElements.put(CONTRIBUTOR, CONTRIBUTOR); - endElements.put(ID, ID); - endElements.put(PAGE, PAGE); - endElements.put(TITLE, TITLE); - endElements.put(SITEINFO, SITEINFO); - endElements.put(NAMESPACES, NAMESPACES); - endElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupEndElements() { + endElements.put(REVISION, REVISION); + endElements.put(TIMESTAMP, TIMESTAMP); + endElements.put(TEXT, TEXT); + endElements.put(CONTRIBUTOR, CONTRIBUTOR); + endElements.put(ID, ID); + endElements.put(PAGE, PAGE); + endElements.put(TITLE, TITLE); + endElements.put(SITEINFO, SITEINFO); + endElements.put(NAMESPACES, NAMESPACES); + endElements.put(NAMESPACE, NAMESPACE); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionWriter.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionWriter.java index 46f29935..785a7d81 100755 --- 
a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionWriter.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,46 +28,46 @@ public class RevisionWriter implements DumpWriter { - private Page currentPage; - private final DataOutputStream stream; + private Page currentPage; + private final DataOutputStream stream; - public RevisionWriter(OutputStream output) throws IOException { - this.stream = new DataOutputStream(output); - } + public RevisionWriter(OutputStream output) throws IOException { + this.stream = new DataOutputStream(output); + } - @Override - public void close() throws IOException { - stream.close(); - } + @Override + public void close() throws IOException { + stream.close(); + } - @Override - public void writeEndPage() throws IOException { - currentPage = null; - } + @Override + public void writeEndPage() throws IOException { + currentPage = null; + } - @Override - public void writeEndWiki() throws IOException { - stream.flush(); - } + @Override + public void writeEndWiki() throws IOException { + stream.flush(); + } - @Override - public void writeRevision(Revision revision) throws IOException { - 
stream.writeInt(currentPage.Id); - stream.writeInt(revision.Id); - stream.writeLong(revision.Timestamp.getTimeInMillis()); - } + @Override + public void writeRevision(Revision revision) throws IOException { + stream.writeInt(currentPage.Id); + stream.writeInt(revision.Id); + stream.writeLong(revision.Timestamp.getTimeInMillis()); + } - @Override - public void writeSiteinfo(Siteinfo info) throws IOException { + @Override + public void writeSiteinfo(Siteinfo info) throws IOException { - } + } - @Override - public void writeStartPage(Page page) throws IOException { - currentPage = page; - } + @Override + public void writeStartPage(Page page) throws IOException { + currentPage = page; + } - @Override - public void writeStartWiki() throws IOException { - } + @Override + public void writeStartWiki() throws IOException { + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextReader.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextReader.java index 3203de4c..b7c44aab 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextReader.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextReader.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,35 +25,33 @@ /** * This class is a specified variant of XmlDumpReader. Please see its source for more * information about a functionality and a license.
- * - * */ public class TextReader extends AbstractXmlDumpReader { - public TextReader(InputStream inputStream, DumpWriter writer) { - super(inputStream, writer); - } + public TextReader(InputStream inputStream, DumpWriter writer) { + super(inputStream, writer); + } - @Override - protected void setupStartElements() { - startElements.put(REVISION, REVISION); - startElements.put(CONTRIBUTOR, CONTRIBUTOR); - startElements.put(PAGE, PAGE); - startElements.put(SITEINFO, SITEINFO); - startElements.put(NAMESPACES, NAMESPACES); - startElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupStartElements() { + startElements.put(REVISION, REVISION); + startElements.put(CONTRIBUTOR, CONTRIBUTOR); + startElements.put(PAGE, PAGE); + startElements.put(SITEINFO, SITEINFO); + startElements.put(NAMESPACES, NAMESPACES); + startElements.put(NAMESPACE, NAMESPACE); + } - @Override - protected void setupEndElements() { - endElements.put(REVISION, REVISION); - endElements.put(TEXT, TEXT); - endElements.put(CONTRIBUTOR, CONTRIBUTOR); - endElements.put(ID, ID); - endElements.put(PAGE, PAGE); - endElements.put(TITLE, TITLE); - endElements.put(SITEINFO, SITEINFO); - endElements.put(NAMESPACES, NAMESPACES); - endElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupEndElements() { + endElements.put(REVISION, REVISION); + endElements.put(TEXT, TEXT); + endElements.put(CONTRIBUTOR, CONTRIBUTOR); + endElements.put(ID, ID); + endElements.put(PAGE, PAGE); + endElements.put(TITLE, TITLE); + endElements.put(SITEINFO, SITEINFO); + endElements.put(NAMESPACES, NAMESPACES); + endElements.put(NAMESPACE, NAMESPACE); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextWriter.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextWriter.java index 26ec1056..0f055c49 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextWriter.java +++ 
b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextWriter.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,41 +29,41 @@ public class TextWriter implements DumpWriter { - private final UTFDataOutputStream stream; + private final UTFDataOutputStream stream; - public TextWriter(OutputStream output) throws IOException { - this.stream = new UTFDataOutputStream(output); - } + public TextWriter(OutputStream output) throws IOException { + this.stream = new UTFDataOutputStream(output); + } - @Override - public void close() throws IOException { - stream.close(); - } + @Override + public void close() throws IOException { + stream.close(); + } - @Override - public void writeEndPage() throws IOException { - } + @Override + public void writeEndPage() throws IOException { + } - @Override - public void writeEndWiki() throws IOException { - stream.flush(); - } + @Override + public void writeEndWiki() throws IOException { + stream.flush(); + } - @Override - public void writeRevision(Revision revision) throws IOException { - stream.writeInt(revision.Id); - stream.writeUTFAsArray(SQLEscape.escape(revision.Text)); - } + @Override + public void writeRevision(Revision revision) throws IOException { + stream.writeInt(revision.Id); + 
stream.writeUTFAsArray(SQLEscape.escape(revision.Text)); + } - @Override - public void writeSiteinfo(Siteinfo info) throws IOException { - } + @Override + public void writeSiteinfo(Siteinfo info) throws IOException { + } - @Override - public void writeStartPage(Page page) throws IOException { - } + @Override + public void writeStartPage(Page page) throws IOException { + } - @Override - public void writeStartWiki() throws IOException { - } + @Override + public void writeStartWiki() throws IOException { + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TimeMachineRevisionParser.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TimeMachineRevisionParser.java index 672bbba1..da78bc33 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TimeMachineRevisionParser.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TimeMachineRevisionParser.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -25,18 +25,19 @@ public class TimeMachineRevisionParser extends RevisionParser { - public boolean next() throws IOException { - boolean hasNext = true; - try { - revPage = stream.readInt(); - revTextId = stream.readInt(); - revTimestamp = Revision.compressTime(stream.readLong()); - } catch (EOFException e) { - hasNext = false; - } + @Override + public boolean next() throws IOException { + boolean hasNext = true; + try { + revPage = stream.readInt(); + revTextId = stream.readInt(); + revTimestamp = Revision.compressTime(stream.readLong()); + } catch (EOFException e) { + hasNext = false; + } - return hasNext; + return hasNext; - } + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStream.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStream.java index 0863d7f2..c7227ee2 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStream.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStream.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -27,7 +27,7 @@ import org.dkpro.jwpl.wikimachine.dump.xml.DumpTableInputStream; /** - * Decorator for an InputStream. Converts an XML source to SQL + * Decorator for an {@link InputStream}. Converts an XML source to SQL * result in a separated thread via * org.mediawiki.importer.XmlDumpReader * @@ -37,76 +37,70 @@ */ public class XMLDumpTableInputStream extends DumpTableInputStream { - private static final int BUFFERSIZE = 8192; - /** - * output stream where the conversion thread - * XMLInputStreamThread is writing in - */ - private PipedOutputStream decodedStream; - /** - * piped stream, that allows to read from a decodedStream - */ - private PipedInputStream unbufferedResult; - /** - * piped result stream, that is buffered for better performance - */ - private BufferedInputStream result; - /** - * thread where the conversion algorithm should run - */ - private XMLDumpTableInputStreamThread xmlInputThread; + private static final int BUFFERSIZE = 8192; + /** + * piped result stream, that is buffered for better performance + */ + private BufferedInputStream result; + /** + * thread where the conversion algorithm should run + */ + private XMLDumpTableInputStreamThread xmlInputThread; - /** - * Decorator for InputStream, which allows to convert an XML input stream to - * SQL - * - * @param inputStream - * XML input stream - * @throws IOException - */ - @Override - public void initialize(InputStream inputStream, DumpTableEnum table) - throws IOException { + /** + * Decorator for InputStream, which allows to convert an XML input stream to + * SQL + * + * @param inputStream XML input stream + * @throws IOException + */ + @Override + public void initialize(InputStream inputStream, DumpTableEnum table) throws IOException { - unbufferedResult = new PipedInputStream(); - decodedStream = new PipedOutputStream(unbufferedResult); - result = new BufferedInputStream(unbufferedResult, BUFFERSIZE); + /* + * piped input stream, that allows to read from a decodedStream + */ + 
PipedInputStream unbufferedResult = new PipedInputStream(); + /* + * piped output stream where the conversion thread XMLInputStreamThread is writing in + */ + PipedOutputStream decodedStream = new PipedOutputStream(unbufferedResult); + result = new BufferedInputStream(unbufferedResult, BUFFERSIZE); - xmlInputThread = new XMLDumpTableInputStreamThread(inputStream, - decodedStream, table); - xmlInputThread.start(); + xmlInputThread = new XMLDumpTableInputStreamThread(inputStream, decodedStream, table); + xmlInputThread.start(); - } + } - @Override - public int read() throws IOException { - return result.read(); - } + @Override + public int read() throws IOException { + return result.read(); + } - @Override - public int available() throws IOException { - return result.available(); - } + @Override + public int available() throws IOException { + return result.available(); + } - @Override - public void close() throws IOException { - result.close(); - xmlInputThread.abort(); - } + @Override + public void close() throws IOException { + result.close(); + xmlInputThread.abort(); + } - @Override - public void mark(int readlimit) { - result.mark(readlimit); - } + @Override + public void mark(int readlimit) { + result.mark(readlimit); + } - @Override - public void reset() throws IOException { - result.reset(); - } + @Override + public void reset() throws IOException { + result.reset(); + } - @Override - public boolean markSupported() { - return result.markSupported(); - } + @Override + public boolean markSupported() { + return result.markSupported(); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStreamThread.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStreamThread.java index b17101da..83560296 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStreamThread.java +++ 
b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStreamThread.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -33,74 +33,67 @@ */ class XMLDumpTableInputStreamThread extends Thread { - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - /** - * Enable the main and category pages as well as discussions - */ - private static final String ENABLED_NAMESPACES = "NS_MAIN,NS_TALK,NS_CATEGORY"; + /** + * Enable the main and category pages as well as discussions + */ + private static final String ENABLED_NAMESPACES = "NS_MAIN,NS_TALK,NS_CATEGORY"; - /** - * Generalization {@link org.dkpro.jwpl.mwdumper.importer.XmlDumpReader} - * that parses the XML dump - */ - private AbstractXmlDumpReader xmlReader; + /** + * Generalization {@link org.dkpro.jwpl.mwdumper.importer.XmlDumpReader} + * that parses the XML dump + */ + private AbstractXmlDumpReader xmlReader; - /** - * completion flag for a conversion process - */ - private boolean isComplete; + /** + * completion flag for a conversion process + */ + private boolean isComplete; - /** - * Initiate input and output streams - 
* - * @param iStream - * XML input stream - * @param oStream - * SQL output stream - * @throws IOException Thrown in case errors occurred. - */ - public XMLDumpTableInputStreamThread(InputStream iStream, - OutputStream oStream, DumpTableEnum table) throws IOException { - super("xml2sql"); + /** + * Initiate input and output streams + * + * @param iStream XML input stream + * @param oStream SQL output stream + * @throws IOException Thrown in case errors occurred. + */ + public XMLDumpTableInputStreamThread(InputStream iStream, OutputStream oStream, DumpTableEnum table) + throws IOException { + super("xml2sql"); - switch (table) { - case PAGE: - xmlReader = new PageReader(iStream, new NamespaceFilter( - new PageWriter(oStream), ENABLED_NAMESPACES)); - break; - case REVISION: - xmlReader = new RevisionReader(iStream, new NamespaceFilter( - new RevisionWriter(oStream), ENABLED_NAMESPACES)); - break; - case TEXT: - xmlReader = new TextReader(iStream, new NamespaceFilter( - new TextWriter(oStream), ENABLED_NAMESPACES)); - break; + switch (table) { + case PAGE: + xmlReader = new PageReader(iStream, new NamespaceFilter(new PageWriter(oStream), ENABLED_NAMESPACES)); + break; + case REVISION: + xmlReader = new RevisionReader(iStream, new NamespaceFilter(new RevisionWriter(oStream), ENABLED_NAMESPACES)); + break; + case TEXT: + xmlReader = new TextReader(iStream, new NamespaceFilter(new TextWriter(oStream), ENABLED_NAMESPACES)); + break; + } + } - } + @Override + public synchronized void run() { + try { + isComplete = false; + xmlReader.readDump(); + isComplete = true; + } catch (IOException e) { + logger.error(e.getMessage(), e); + throw new RuntimeException(e); + } + } - } - - @Override - public synchronized void run() { - try { - isComplete = false; - xmlReader.readDump(); - isComplete = true; - } catch (IOException e) { - logger.error(e.getMessage(), e); - throw new RuntimeException(e); - } - } - - /** - * Abort a conversion - */ - public synchronized void abort() { - if 
(!isComplete) { - xmlReader.abort(); - isComplete = true; - } - } + /** + * Abort a conversion + */ + public synchronized void abort() { + if (!isComplete) { + xmlReader.abort(); + isComplete = true; + } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1a_HelloWorld.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1a_HelloWorld.java index faca6735..7e338058 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1a_HelloWorld.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1a_HelloWorld.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,28 +30,26 @@ * The text will be formatted with MediaWiki markup. *

* Throws an exception, if no page with the given title exists. - * - * */ public class T1a_HelloWorld implements WikiConstants { - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // Create a new German wikipedia. + Wikipedia wiki = new Wikipedia(dbConfig); - // Get the page with title "Hello world". - // May throw an exception, if the page does not exist. - Page page = wiki.getPage("Hello world"); - System.out.println(page.getText()); + // Get the page with title "Hello world". + // May throw an exception, if the page does not exist. + Page page = wiki.getPage("Hello world"); + System.out.println(page.getText()); - } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1b_HelloWorld.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1b_HelloWorld.java index 4803ad60..0d2b3726 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1b_HelloWorld.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1b_HelloWorld.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. 
The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,31 +30,28 @@ * The text will be formatted with MediaWiki markup. *

* If you do not care about exception handling, but want to avoid crashes on every page that does not exist. - * - * */ public class T1b_HelloWorld implements WikiConstants { - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // Create a new German wikipedia. + Wikipedia wiki = new Wikipedia(dbConfig); - String title = "Hello world"; - if (wiki.existsPage(title)) { - Page page = wiki.getPage(title); - System.out.println(page.getText()); - } - else { - System.out.println("Page " + title + " does not exist"); - } + String title = "Hello world"; + if (wiki.existsPage(title)) { + Page page = wiki.getPage(title); + System.out.println(page.getText()); + } else { + System.out.println("Page " + title + " does not exist"); } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1c_HelloWorld.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1c_HelloWorld.java index 81a34ff2..aa504b77 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1c_HelloWorld.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1c_HelloWorld.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,40 +31,38 @@ * The text will be formatted with MediaWiki markup. *

* Handle exceptions. - * - * */ public class T1c_HelloWorld implements WikiConstants { - public static void main(String[] args) { + public static void main(String[] args) { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia. - Wikipedia wiki = null; - try { - wiki = new Wikipedia(dbConfig); - } catch (WikiInitializationException e1) { - System.out.println("Could not initialize Wikipedia."); - e1.printStackTrace(); - System.exit(1); - } + // Create a new German wikipedia. + Wikipedia wiki = null; + try { + wiki = new Wikipedia(dbConfig); + } catch (WikiInitializationException e1) { + System.out.println("Could not initialize Wikipedia."); + e1.printStackTrace(); + System.exit(1); + } - // Get the page with title "Hello world". - String title = "Hello world"; - try { - Page page = wiki.getPage(title); - System.out.println(page.getText()); - } catch (WikiApiException e) { - System.out.println("Page " + title + " does not exist"); - e.printStackTrace(); - System.exit(1); - } + // Get the page with title "Hello world". 
+ String title = "Hello world"; + try { + Page page = wiki.getPage(title); + System.out.println(page.getText()); + } catch (WikiApiException e) { + System.out.println("Page " + title + " does not exist"); + e.printStackTrace(); + System.exit(1); } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T2_PageInfo.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T2_PageInfo.java index fb558c13..fdfb0aa0 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T2_PageInfo.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T2_PageInfo.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,51 +29,49 @@ * Tutorial 2 *

* A page provides a number of informative methods. - * - * */ public class T2_PageInfo implements WikiConstants { - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia - Wikipedia wiki = new Wikipedia(dbConfig); + // Create a new German wikipedia + Wikipedia wiki = new Wikipedia(dbConfig); - String title = "Hello world"; - Page page; - try { - page = wiki.getPage(title); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Page " + title + " does not exist"); - } + String title = "Hello world"; + Page page; + try { + page = wiki.getPage(title); + } catch (WikiPageNotFoundException e) { + throw new WikiApiException("Page " + title + " does not exist"); + } - // the title of the page - System.out.println("Queried string : " + title); - System.out.println("Title : " + page.getTitle()); + // the title of the page + System.out.println("Queried string : " + title); + System.out.println("Title : " + page.getTitle()); - // whether the page is a disambiguation page - System.out.println("IsDisambiguationPage : " + page.isDisambiguation()); + // whether the page is a disambiguation page + System.out.println("IsDisambiguationPage : " + page.isDisambiguation()); - // whether the page is a redirect - // If a page is a redirect, we can use it like a normal page. 
- // The other infos in this example are transparently served by the page that the redirect points to. - System.out.println("redirect page query : " + page.isRedirect()); + // whether the page is a redirect + // If a page is a redirect, we can use it like a normal page. + // The other infos in this example are transparently served by the page that the redirect points to. + System.out.println("redirect page query : " + page.isRedirect()); - // the number of links pointing to this page - System.out.println("# of ingoing links : " + page.getNumberOfInlinks()); + // the number of links pointing to this page + System.out.println("# of ingoing links : " + page.getNumberOfInlinks()); - // the number of links in this page pointing to other pages - System.out.println("# of outgoing links : " + page.getNumberOfOutlinks()); + // the number of links in this page pointing to other pages + System.out.println("# of outgoing links : " + page.getNumberOfOutlinks()); - // the number of categories that are assigned to this page - System.out.println("# of categories : " + page.getNumberOfCategories()); - } + // the number of categories that are assigned to this page + System.out.println("# of categories : " + page.getNumberOfCategories()); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T3_PageDetails.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T3_PageDetails.java index 81f5b805..6edd3624 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T3_PageDetails.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T3_PageDetails.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. 
The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,66 +30,64 @@ * Tutorial 3 *

* Even more things to do with a Wikipedia page. - * - * */ public class T3_PageDetails implements WikiConstants { - public static void main(String[] args) throws WikiApiException { - - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + public static void main(String[] args) throws WikiApiException { - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - String title = "Hello world"; - Page page; - try { - page = wiki.getPage(title); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Page " + title + " does not exist"); - } + // Create a new German wikipedia. 
+ Wikipedia wiki = new Wikipedia(dbConfig); - StringBuilder sb = new StringBuilder(); + String title = "Hello world"; + Page page; + try { + page = wiki.getPage(title); + } catch (WikiPageNotFoundException e) { + throw new WikiApiException("Page " + title + " does not exist"); + } - // the title of the page - sb.append("Queried string : " + title + LF); - sb.append("Title : " + page.getTitle() + LF); - sb.append(LF); + StringBuilder sb = new StringBuilder(); - // output the page's redirects - sb.append("Redirects" + LF); - for (String redirect : page.getRedirects()) { - sb.append(" " + new Title(redirect).getPlainTitle() + LF); - } - sb.append(LF); + // the title of the page + sb.append("Queried string : " + title + LF); + sb.append("Title : " + page.getTitle() + LF); + sb.append(LF); - // output the page's categories - sb.append("Categories" + LF); - for (Category category : page.getCategories()) { - sb.append(" " + category.getTitle() + LF); - } - sb.append(LF); + // output the page's redirects + sb.append("Redirects" + LF); + for (String redirect : page.getRedirects()) { + sb.append(" " + new Title(redirect).getPlainTitle() + LF); + } + sb.append(LF); - // output the ingoing links - sb.append("In-Links" + LF); - for (Page inLinkPage : page.getInlinks()) { - sb.append(" " + inLinkPage.getTitle() + LF); - } - sb.append(LF); + // output the page's categories + sb.append("Categories" + LF); + for (Category category : page.getCategories()) { + sb.append(" " + category.getTitle() + LF); + } + sb.append(LF); - // output the outgoing links - sb.append("Out-Links" + LF); - for (Page outLinkPage : page.getOutlinks()) { - sb.append(" " + outLinkPage.getTitle() + LF); - } + // output the ingoing links + sb.append("In-Links" + LF); + for (Page inLinkPage : page.getInlinks()) { + sb.append(" " + inLinkPage.getTitle() + LF); + } + sb.append(LF); - System.out.println(sb); + // output the outgoing links + sb.append("Out-Links" + LF); + for (Page outLinkPage : page.getOutlinks()) 
{ + sb.append(" " + outLinkPage.getTitle() + LF); } + + System.out.println(sb); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T4_Categories.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T4_Categories.java index dee66189..ca6c538c 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T4_Categories.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T4_Categories.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,65 +30,63 @@ *

* Wikipedia categories are used as a kind of semantic tag for pages. * They are organized in a thesaurus like structure. - * - * */ public class T4_Categories implements WikiConstants { - public static void main(String[] args) throws WikiApiException { - - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + public static void main(String[] args) throws WikiApiException { - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Get the category "Säugetiere" (mammals) - String title = "Säugetiere"; - Category cat; - try { - cat = wiki.getCategory(title); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Category " + title + " does not exist"); - } + // Create a new German wikipedia. 
+ Wikipedia wiki = new Wikipedia(dbConfig); - StringBuilder sb = new StringBuilder(); + // Get the category "Säugetiere" (mammals) + String title = "Säugetiere"; + Category cat; + try { + cat = wiki.getCategory(title); + } catch (WikiPageNotFoundException e) { + throw new WikiApiException("Category " + title + " does not exist"); + } - // the title of the category - sb.append("Title : " + cat.getTitle() + LF); - sb.append(LF); + StringBuilder sb = new StringBuilder(); - // the number of links pointing to this page (number of superordinate categories) - sb.append("# super categories : " + cat.getParents().size() + LF); - for (Category parent : cat.getParents()) { - sb.append(" " + parent.getTitle() + LF); - } - sb.append(LF); + // the title of the category + sb.append("Title : " + cat.getTitle() + LF); + sb.append(LF); - // the number of links in this page pointing to other pages (number of subordinate categories) - sb.append("# sub categories : " + cat.getChildren().size() + LF); - for (Category child : cat.getChildren()) { - sb.append(" " + child.getTitle() + LF); - } - sb.append(LF); + // the number of links pointing to this page (number of superordinate categories) + sb.append("# super categories : " + cat.getParents().size() + LF); + for (Category parent : cat.getParents()) { + sb.append(" " + parent.getTitle() + LF); + } + sb.append(LF); - // the number of pages that are categorized under this category - sb.append("# pages : " + cat.getArticles().size() + LF); - for (Page page : cat.getArticles()) { - sb.append(" " + page.getTitle() + LF); - } + // the number of links in this page pointing to other pages (number of subordinate categories) + sb.append("# sub categories : " + cat.getChildren().size() + LF); + for (Category child : cat.getChildren()) { + sb.append(" " + child.getTitle() + LF); + } + sb.append(LF); - // extract only the pageIDs of pages that are categorized under this category - sb.append("# pageIDs : " + cat.getArticleIds().size() + LF); - for 
(int pageID : cat.getArticleIds()) { - sb.append(" " + pageID + LF); - } + // the number of pages that are categorized under this category + sb.append("# pages : " + cat.getArticles().size() + LF); + for (Page page : cat.getArticles()) { + sb.append(" " + page.getTitle() + LF); + } - System.out.println(sb); + // extract only the pageIDs of pages that are categorized under this category + sb.append("# pageIDs : " + cat.getArticleIds().size() + LF); + for (int pageID : cat.getArticleIds()) { + sb.append(" " + pageID + LF); } + + System.out.println(sb); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T5_TownList.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T5_TownList.java index 8e795fac..33c8f7e7 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T5_TownList.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T5_TownList.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -36,52 +36,50 @@ * They are organized in a thesaurus like structure. *

* If we get all pages assigned to categories in the sub-tree under the category for "Towns in Germany", - * we can get a quite long list of towns in Germany. - * - * + * we can get a quite long list of towns in Germany. */ public class T5_TownList implements WikiConstants { - public static void main(String[] args) throws WikiApiException { - - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + public static void main(String[] args) throws WikiApiException { - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Get the category "Towns in Germany" - String title = "Towns in Germany"; - Category topCat; - try { - topCat = wiki.getCategory(title); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Category " + title + " does not exist"); - } + // Create a new German wikipedia. + Wikipedia wiki = new Wikipedia(dbConfig); - // Add the pages categorized under "Towns in Germany". - Set towns = new TreeSet<>(); - for (Page p : topCat.getArticles()) { - towns.add(p.getTitle().getPlainTitle()); - } + // Get the category "Towns in Germany" + String title = "Towns in Germany"; + Category topCat; + try { + topCat = wiki.getCategory(title); + } catch (WikiPageNotFoundException e) { + throw new WikiApiException("Category " + title + " does not exist"); + } - // Get the pages categorized under each subcategory of "Towns in Germany". 
- for (Category townCategory : topCat.getDescendants()) { - for (Page p : townCategory.getArticles()) { - towns.add(p.getTitle().getPlainTitle()); - } - System.out.println("Number of towns: " + towns.size()); - } + // Add the pages categorized under "Towns in Germany". + Set towns = new TreeSet<>(); + for (Page p : topCat.getArticles()) { + towns.add(p.getTitle().getPlainTitle()); + } - // Output the pages - for (String town : towns) { - System.out.println(town); - } + // Get the pages categorized under each subcategory of "Towns in Germany". + for (Category townCategory : topCat.getDescendants()) { + for (Page p : townCategory.getArticles()) { + towns.add(p.getTitle().getPlainTitle()); + } + System.out.println("Number of towns: " + towns.size()); + } + // Output the pages + for (String town : towns) { + System.out.println(town); } + + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T6_HelperMethods.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T6_HelperMethods.java index cd23db40..6103dc99 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T6_HelperMethods.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T6_HelperMethods.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,24 +28,24 @@ public class T6_HelperMethods { - public static Set getUniqueArticleTitles() throws WikiInitializationException { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); - - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); - - Set uniqueArticleTitles = new TreeSet<>(); - for (Title title : wiki.getTitles()) { - uniqueArticleTitles.add(title.getPlainTitle()); - } - - return uniqueArticleTitles; + public static Set getUniqueArticleTitles() throws WikiInitializationException { + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); + + // Create a new German wikipedia. 
+ Wikipedia wiki = new Wikipedia(dbConfig); + + Set uniqueArticleTitles = new TreeSet<>(); + for (Title title : wiki.getTitles()) { + uniqueArticleTitles.add(title.getPlainTitle()); } + return uniqueArticleTitles; + } + } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T1_SimpleParserDemo.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T1_SimpleParserDemo.java index d2b6eebf..f71cfa83 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T1_SimpleParserDemo.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T1_SimpleParserDemo.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,31 +26,30 @@ /** * Displays informations about the inner structure of a page. 
- * */ public class T1_SimpleParserDemo { - /** - * @param args - * @throws IOException - */ - public static void main(String[] args) throws IOException { + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws IOException { + + // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") + String documentText = TestFile.getFileText(); - // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") - String documentText = TestFile.getFileText(); + //get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(documentText); - //get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(documentText); - - //get the sections - for(Section section : pp.getSections()) { - System.out.println("section : " + section.getTitle()); - System.out.println(" nr of paragraphs : " + section.nrOfParagraphs()); - System.out.println(" nr of tables : " + section.nrOfTables()); - System.out.println(" nr of nested lists : " + section.nrOfNestedLists()); - System.out.println(" nr of definition lists: " + section.nrOfDefinitionLists()); - } - } + //get the sections + for (Section section : pp.getSections()) { + System.out.println("section : " + section.getTitle()); + System.out.println(" nr of paragraphs : " + section.nrOfParagraphs()); + System.out.println(" nr of tables : " + section.nrOfTables()); + System.out.println(" nr of nested lists : " + section.nrOfNestedLists()); + System.out.println(" nr of definition lists: " + section.nrOfDefinitionLists()); + } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T2_InternalLinks.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T2_InternalLinks.java index 01271990..f3ef1fb9 100644 --- 
a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T2_InternalLinks.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T2_InternalLinks.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,37 +28,37 @@ * This class shows how to get the internal links from a parsed page.
* Internal links point to other pages and categories in the current
*

Wikipedia
. - * */ public class T2_InternalLinks { - /** - * Prints the targets of the internal links found in the page Germany. - * @param args - * @throws WikiApiException - */ - public static void main(String[] args) throws WikiApiException { + /** + * Prints the targets of the internal links found in the page Germany. + * + * @param args + * @throws WikiApiException + */ + public static void main(String[] args) throws WikiApiException { - // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") - String documentText = TestFile.getFileText(); - - // get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(documentText); - - // only the links to other Wikipedia language editions - for (Link language : pp.getLanguages()) { - System.out.println(language.getTarget()); - } + // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") + String documentText = TestFile.getFileText(); + + // get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(documentText); + + // only the links to other Wikipedia language editions + for (Link language : pp.getLanguages()) { + System.out.println(language.getTarget()); + } - //get the internal links of each section - for (Section section : pp.getSections()){ - System.out.println("Section: " + section.getTitle()); + //get the internal links of each section + for (Section section : pp.getSections()) { + System.out.println("Section: " + section.getTitle()); - for (Link link : section.getLinks(Link.type.INTERNAL)) { - System.out.println(" " + link.getTarget()); - } - } + for (Link link : section.getLinks(Link.type.INTERNAL)) { + System.out.println(" " + link.getTarget()); + } } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T3_LinkContexts.java 
b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T3_LinkContexts.java index 97739c8b..81354eee 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T3_LinkContexts.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T3_LinkContexts.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,28 +25,27 @@ /** * This is a little demo, to show how the parsedpage and parsedpage.parser package * works. 
- * */ public class T3_LinkContexts { - public static void main(String[] args){ - - // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") - String documentText = TestFile.getFileText(); + public static void main(String[] args) { + + // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") + String documentText = TestFile.getFileText(); + + // get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(documentText); - // get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(documentText); - - // Link Context (return 1 token left, 2 token right of the link) - for (Link link : pp.getLinks()) { - System.out.println( - link.getContext(1, 0) + "<" + - link.getText().toString().toUpperCase() + ">" + - link.getContext(0, 2) - ); - } + // Link Context (return 1 token left, 2 token right of the link) + for (Link link : pp.getLinks()) { + System.out.println( + link.getContext(1, 0) + "<" + + link.getText().toString().toUpperCase() + ">" + + link.getContext(0, 2) + ); } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T4_InterfacingWithWikipedia.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T4_InterfacingWithWikipedia.java index 5afc84d6..25653b9c 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T4_InterfacingWithWikipedia.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T4_InterfacingWithWikipedia.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. 
The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,31 +34,31 @@ */ public class T4_InterfacingWithWikipedia { - public static void main(String[] args) throws WikiApiException { - //db connection settings - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setDatabase("DATABASE"); - dbConfig.setHost("HOST"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.english); - - //initialize a wiki - Wikipedia wiki = new Wikipedia(dbConfig); - - //get the page 'Dog' - Page p = wiki.getPage("Dog"); - - //get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(p.getText()); - - //get the sections of the page - List
sections = pp.getSections(); - - for(Section section : sections) { - System.out.println(section.getTitle()); - } - } + public static void main(String[] args) throws WikiApiException { + //db connection settings + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setDatabase("DATABASE"); + dbConfig.setHost("HOST"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.english); + + //initialize a wiki + Wikipedia wiki = new Wikipedia(dbConfig); + + //get the page 'Dog' + Page p = wiki.getPage("Dog"); + + //get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(p.getText()); + + //get the sections of the page + List
sections = pp.getSections(); + + for (Section section : sections) { + System.out.println(section.getTitle()); + } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T5_CleaningTemplateImage.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T5_CleaningTemplateImage.java index 069c629f..972773d8 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T5_CleaningTemplateImage.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T5_CleaningTemplateImage.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -29,41 +29,40 @@ /** * Shows how to clean an article text from "TEMPLATE" and "Image" elements - * */ public class T5_CleaningTemplateImage { - - public static void main(String[] args) throws WikiApiException { - //db connection settings - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setDatabase("DATABASE"); - dbConfig.setHost("HOST"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.english); + public static void main(String[] args) throws WikiApiException { + + //db connection settings + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setDatabase("DATABASE"); + dbConfig.setHost("HOST"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.english); + + //initialize a wiki + Wikipedia wiki = new Wikipedia(dbConfig); + + //get the page 'Dog' + Page p = wiki.getPage("Dog"); + + //get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + pf.setTemplateParserClass(FlushTemplates.class); // Filtering TEMPLATE-Elements + + String IMAGE = "Image"; // Replace it with the image template name in your Wiki language edition, + // e.g. "Image" in English + + // filtering Image-Elements + pf.getImageIdentifers().add(IMAGE); + + // parse page text + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(p.getText()); - //initialize a wiki - Wikipedia wiki = new Wikipedia(dbConfig); - - //get the page 'Dog' - Page p = wiki.getPage("Dog"); - - //get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - pf.setTemplateParserClass(FlushTemplates.class); // Filtering TEMPLATE-Elements - - String IMAGE = "Image"; // Replace it with the image template name in your Wiki language edition, - // e.g. 
"Image" in English - - // filtering Image-Elements - pf.getImageIdentifers().add(IMAGE); - - // parse page text - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(p.getText()); - - System.out.println(pp.getText()); - } + System.out.println(pp.getText()); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T6_NestedLists.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T6_NestedLists.java index be866c30..b257b220 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T6_NestedLists.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T6_NestedLists.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -30,67 +30,65 @@ /** * Displays all nested lists of a page. 
- * */ public class T6_NestedLists { - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException { - //db connection settings - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setDatabase("DATABASE"); - dbConfig.setHost("HOST"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.english); + //db connection settings + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setDatabase("DATABASE"); + dbConfig.setHost("HOST"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.english); - //initialize a wiki - Wikipedia wiki = new Wikipedia(dbConfig); + //initialize a wiki + Wikipedia wiki = new Wikipedia(dbConfig); - MediaWikiParserFactory pf = new MediaWikiParserFactory(Language.english); - MediaWikiParser parser = pf.createParser(); + MediaWikiParserFactory pf = new MediaWikiParserFactory(Language.english); + MediaWikiParser parser = pf.createParser(); - //get the page 'House_(disambiguation)' - ParsedPage pp = parser.parse(wiki.getPage("House_(disambiguation)").getText()); + //get the page 'House_(disambiguation)' + ParsedPage pp = parser.parse(wiki.getPage("House_(disambiguation)").getText()); - int i = 1; - // print out all nested lists of the page - for(NestedList nl : pp.getNestedLists()){ - System.out.println(i + ": \n" + outputNestedList(nl,0)); - i++; - } - } + int i = 1; + // print out all nested lists of the page + for (NestedList nl : pp.getNestedLists()) { + System.out.println(i + ": \n" + outputNestedList(nl, 0)); + i++; + } + } - /** - * Returns String with all elements of a NestedList - * @param nl NestedList - * @param depth Current depth of the Nestedlist - * @return - */ - public static String outputNestedList(NestedList nl, int depth){ - String result = ""; - if(nl == null) - { - return result; // If null return empty string - } + /** + * Returns 
String with all elements of a NestedList + * + * @param nl NestedList + * @param depth Current depth of the Nestedlist + * @return + */ + public static String outputNestedList(NestedList nl, int depth) { + String result = ""; + if (nl == null) { + return result; // If null return empty string + } - for(int i = 0; i * Mainly, you can create an HtmlFile of a {@link ParsedPage}. - * */ public class T7_HtmlFileDemo { - - public static void main( String[] argv ) { - - // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") - String documentText = TestFile.getFileText(); - - // set up an individually parametrized MediaWikiParser - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - pf.getImageIdentifers().add("Image"); - MediaWikiParser parser = pf.createParser(); - - ParsedPage pp = parser.parse( documentText ); - - String outFileName = "htmlFileDemo.html"; - HtmlWriter.writeFile(outFileName, "UTF8", HtmlWriter.parsedPageToHtml(pp)); - - System.out.println("Writing output to file: " + outFileName); - } + + public static void main(String[] argv) { + + // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") + String documentText = TestFile.getFileText(); + + // set up an individually parametrized MediaWikiParser + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + pf.getImageIdentifers().add("Image"); + MediaWikiParser parser = pf.createParser(); + + ParsedPage pp = parser.parse(documentText); + + String outFileName = "htmlFileDemo.html"; + HtmlWriter.writeFile(outFileName, "UTF8", HtmlWriter.parsedPageToHtml(pp)); + + System.out.println("Writing output to file: " + outFileName); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/TestFile.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/TestFile.java index 2c5ee458..93d13fca 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/TestFile.java +++ 
b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/TestFile.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,215 +19,215 @@ public class TestFile { - private static final String LF = "\n"; + private static final String LF = "\n"; - public static String getFileText() { - StringBuilder sb = new StringBuilder(); - sb.append("'''Darmstadt''' is a city in the [[States of Germany|Bundesland]] (federal state) of [[Hesse]]n in [[Germany]]. As of 2005, its population was 139,000. The city is located in the southern part of the [[Frankfurt Rhine Main Area|Rhine Main Metropolitan Area]]."); - sb.append(LF); - sb.append(LF); - sb.append("== History =="); - sb.append(LF); - sb.append("The name Darmstadt first appears towards the end of the [[11th century]], then ''Darmundestat''; Darmstadt was chartered as a city by the [[Holy Roman Emperor]] [[Louis IV, Holy Roman Emperor|Ludwig the Bavarian]] in 1330. The seat of the ruling [[Landgraf|landgraves]] (1567-1806) and thereafter (to 1918) to the [[Grand Duke of Hesse and by Rhine|Grand Dukes of Hesse]], the city grew in population during the [[19th century]] from little over 10,000 to 72,000 inhabitants. 
A polytechnical school, which later became a Technical University now known as [[Darmstadt University of Technology|TU Darmstadt]], was established in 1877. "); - sb.append(LF); - sb.append("In the beginning of the 20th Century Darmstadt was an important centre for the art movement of [[Art Nouveau|Jugendstil]], the German variant of [[Art Nouveau]]. Annual architectural competitions led to the building of many architectural treasures of this period. Also during this period, in [[1912]] the chemist [[Anton Kollisch]], working for the pharmaceutical company [[Merck]], first synthesised the chemical [[MDMA]] (ecstasy) in Darmstadt."); - sb.append(LF); - sb.append("Darmstadt's municipal area was extended in 1937 to include the neighbouring localities of Arheilgen [not Arheil''i''gen] and Eberstadt, and in 1938 the city was separated administratively from the surrounding district (''Kreis''). Its old city centre was largely destroyed in a [[Bombing of Darmstadt in World War II|British bombing raid]] of [[September 11]] [[1944]], which killed an estimated 12,300 inhabitants and rendered 66,000 homeless. Most of Darmstadt's 3000 [[Jew]]s were killed by the [[Nazism|Nazi]] regime between 1933 and 1945. "); - sb.append(LF); - sb.append("Darmstadt is home to many technology companies and research institutes, and has been promoting itself as a \"city of science\" since 1997. It is well known as the high-tech centre in the vicinity of [[Frankfurt International Airport|Frankfurt Airport]], with important activities in spacecraft operations, chemistry, pharmacy, information technology, biotechnology, telecommunications and mechatronics. 
The [[Darmstadt University of Technology|TU Darmstadt]] is one of the important technical institutes in Germany and is well known for its research and teaching in the Electrical, Mechanical and Civil Engineering disciplines."); - sb.append(LF); - sb.append(LF); - sb.append("== Institutions =="); - sb.append(LF); - sb.append("Darmstadt is the site of one of the leading German universities, the [[Darmstadt University of Technology]], renowned for its engineering departments and the [[Darmstadt University of Applied Sciences]]. Related institutes are the [[Gesellschaft für Schwerionenforschung]] (see also 'Trivia', below) and the four Institutes of the [[Fraunhofer Society]]. The European Space Operations Center ([[ESOC]]) of the [[European Space Agency]] is located in Darmstadt, as is [[EUMETSAT]], which operates [[meteorological]] [[satellite]]s. Darmstadt is a centre for the pharmaceutical and chemical industry, with [[Merck KGaA|Merck]], Röhm and Schenck RoTec (part of The Dürr Group) having their main plants and centres here."); - sb.append(LF); - sb.append("The [http://www.jazzinstitut.de Jazz-Institut Darmstadt] is Germany's largest publicly accessible [[Jazz]] archive."); - sb.append(LF); - sb.append("The [http://www.imd.darmstadt.de/ Internationales Musikinstitut Darmstadt], harboring one of the world's largest collections of [[post-war]] [[sheet music]], also hosts the biannual ''[[International Summer Courses for New Music|Internationale Ferienkurse für Neue Musik]]'', a summer school in [[contemporary classical music]] founded by [[Wolfgang Steinecke]]. 
A large number of avant-garde [[composer]]s have attended and given lectures there, including [[Olivier Messiaen]], [[Luciano Berio]], [[Milton Babbitt]], [[Pierre Boulez]], [[John Cage]], [[György Ligeti]], [[Iannis Xenakis]], [[Karlheinz Stockhausen]] and [[Mauricio Kagel]]."); - sb.append(LF); - sb.append("The [http://www.deutscheakademie.de/ Deutsche Akademie für Sprache und Dichtung] (German Academy for Language and Poetry) provides writers and scholars with a place to research the German language. The Academy's annual ''Georg-Büchner-Preis'', named in memory of [[Georg Büchner]], is considered the most renowned literary award for writers of German language."); - sb.append(LF); - sb.append(LF); - sb.append("== Sons and Daughters of the City =="); - sb.append(LF); - sb.append("* Justus von Liebig, Chemist "); - sb.append(LF); - sb.append("* Georg Büchner, German Poet"); - sb.append(LF); - sb.append("* [[Dr Walter Köbel]], German politician"); - sb.append(LF); - sb.append("* [[August Anton Ullrich]], German industrialist (1865-1919)"); - sb.append(LF); - sb.append("* [[Fabian Scheuermann]], World traveller"); - sb.append(LF); - sb.append("* [[Björn Phau]], Tennis player"); - sb.append(LF); - sb.append("* [[Friedrich August Kekulé von Stradonitz]], Organic Chemist"); - sb.append(LF); - sb.append(LF); - sb.append("== Military =="); - sb.append(LF); - sb.append("There are still [[U.S. Army]] personnel stationed in the Darmstadt area. Just outside the Darmstadt centre is the U.S. Army Garrison Darmstadt on Cambrai-Fritsch Kaserne. The barracks was originally built in the 1930s as two separate German Army barracks (Cambrai Kaserne and Freiherr von Fritsch Kaserne). "); - sb.append(LF); - sb.append("It is possible to listen to the military entertainment radio for the American troops in the region. 
The station is called [[American Forces Network|AFN Europe]] and broadcasts from Frankfurt on FM 98.7 or AM 873."); - sb.append(LF); - sb.append("The base has already started deactivation and will be closed around 2008-2010, at that time AFN Europe will be moved to Mannheim."); - sb.append(LF); - sb.append(LF); - sb.append("== Trivia =="); - sb.append(LF); - sb.append("Literally translated, the German name \"Darmstadt\" means \"City of the intestine\". But that is just a coincidence, as the name derives from the medieval name \"darmundestat\". The Darm(bach) is a small creek running through the city."); - sb.append(LF); - sb.append("The [[chemical element]] [[Darmstadtium]] ([[atomic number]]: 110), first discovered at the [[Gesellschaft für Schwerionenforschung]] was named after the city in 2003, making Darmstadt only the sixth city with an element named after it (the other five cities are [[Ytterby]] in [[Sweden]] (four elements); [[Strontian]] in [[Scotland]]; [[Copenhagen]] in [[Denmark]] (whose latin name gives [[Hafnium]]); [[Berkeley, California]]; and [[Dubna]] in [[Russia]]). 
[[Meitnerium]] ([[atomic number]]: 109) (1982), [[Hassium]] ([[atomic number]]: 108) (1984) and [[Roentgenium]] ([[atomic number]]: 111) (1994) and [[Ununbium]] ([[atomic number]]: 112) (1996) were also synthesized in this facility."); - sb.append(LF); - sb.append("Darmstadt also happens to be one of the small number of cities worldwide which do not lie close to a river or coast."); - sb.append(LF); - sb.append("Darmstadt is the home of [[Software AG]], a software company."); - sb.append(LF); - sb.append("Frankenstein Castle, ''[http://de.wikipedia.org/wiki/Burg_Frankenstein_%28Bergstrasse%29 Burg Frankenstein]'' (in German), possibly Mary Shelley's inspiration for the title of her famous 1818 novel ''[[Frankenstein | Frankenstein; or, The Modern Prometheus]]'', is located nearby."); - sb.append(LF); - sb.append(LF); - sb.append("== Twinning =="); - sb.append(LF); - sb.append("Darmstadt is [[twinned]] with:"); - sb.append(LF); - sb.append(LF); - sb.append("*{{flagicon|Netherlands}}[[Alkmaar]], [[Netherlands]]"); - sb.append(LF); - sb.append("*{{flagicon|Italy}}[[Brescia]], [[Italy]]"); - sb.append(LF); - sb.append("*{{flagicon|Turkey}}[[Bursa, Turkey|Bursa]], [[Turkey]]"); - sb.append(LF); - sb.append("*{{flagicon|United Kingdom}}[[Chesterfield]], [[United Kingdom|UK]]"); - sb.append(LF); - sb.append("*{{flagicon|Austria}}[[Graz]], [[Austria]]"); - sb.append(LF); - sb.append("*{{flagicon|Latvia}}[[Liepaja]], [[Latvia]]"); - sb.append(LF); - sb.append("*{{flagicon|Spain}}[[Logroño]], [[Spain]]"); - sb.append(LF); - sb.append("*{{flagicon|Poland}}[[Płock]], [[Poland]]"); - sb.append(LF); - sb.append("*{{flagicon|Hungary}}[[Szeged]], [[Hungary]]"); - sb.append(LF); - sb.append("*{{flagicon|Norway}}[[Trondheim]], [[Norway]]"); - sb.append(LF); - sb.append("*{{flagicon|France}}[[Troyes]], [[France]]"); - sb.append(LF); - sb.append("*{{flagicon|Ukraine}}[[Uzhhorod]], [[Ukraine]]"); - sb.append(LF); - sb.append("*{{flagicon|Switzerland}}[[Saanen]], [[Switzerland]]"); - 
sb.append(LF); - sb.append(LF); - sb.append("==External links=="); - sb.append(LF); - sb.append("{{commonscat|Darmstadt, Germany}}"); - sb.append(LF); - sb.append("*[http://www.darmstadt.de/ Official site of the city of Darmstadt] (German, English)"); - sb.append(LF); - sb.append("*[[wikitravel:Darmstadt|Darmstadt]] on [[wikitravel:Main Page|Wikitravel]]"); - sb.append(LF); - sb.append("*[http://www.mathildenhoehe.info Mathildenhoehe]"); - sb.append(LF); - sb.append("*[http://public-transport.net/bim/Darmstadt.htm Details of Trams and Buses used in Darmstadt]"); - sb.append(LF); - sb.append("*[http://www.rmv.de/ Public Transport in Darmstadt - Maps, Timetables, Fares]"); - sb.append(LF); - sb.append("*[http://sites-of-memory.de/main/location.html#darmstadt War memorials in Darmstadt]"); - sb.append(LF); - sb.append("*[http://www.darmstadt.army.mil/ Webpage of the U.S. army in Darmstadt]"); - sb.append(LF); - sb.append(LF); - sb.append("===Notable institutions==="); - sb.append(LF); - sb.append("* [http://www.tu-darmstadt.de/index.en.html Darmstadt University of Technology]"); - sb.append(LF); - sb.append("* [http://www.hochschule-darmstadt.de/engl/engl.htm University of Applied Sciences Darmstadt]"); - sb.append(LF); - sb.append("* [http://www.igd.fraunhofer.de/ Fraunhofer Institute for Computer Graphics]"); - sb.append(LF); - sb.append("* [http://www.sit.fraunhofer.de/ Fraunhofer Institute for Secure Information Technology]"); - sb.append(LF); - sb.append("* [http://www.ipsi.fraunhofer.de/ Fraunhofer Institute for Integrated Publication and Information Systems]"); - sb.append(LF); - sb.append("* [http://www.lbf.fhg.de/ Fraunhofer Institute for Structural Durability]"); - sb.append(LF); - sb.append("* [http://www.deutscheakademie.de/ Deutsche Akademie für Sprache und Dichtung]"); - sb.append(LF); - sb.append("* [http://www.gsi.de/ Gesellschaft für Schwerionenforschung]"); - sb.append(LF); - sb.append("* [http://www.esa.int/SPECIALS/ESOC/ European Space Operations 
Centre] (ESOC)"); - sb.append(LF); - sb.append("* [http://www.eumetsat.int/ European Organisation for the Exploitation of Meteorological Satellites (EUMETSAT)]"); - sb.append(LF); - sb.append(LF); - sb.append("[[Category:Cities in Hesse]]"); - sb.append(LF); - sb.append("[[Category:Merck]]"); - sb.append(LF); - sb.append("[[ar:دارمشتادت]]"); - sb.append(LF); - sb.append("[[an:Darmstadt]]"); - sb.append(LF); - sb.append("[[bg:Дармщат]]"); - sb.append(LF); - sb.append("[[ca:Darmstadt]]"); - sb.append(LF); - sb.append("[[cs:Darmstadt]]"); - sb.append(LF); - sb.append("[[da:Darmstadt]]"); - sb.append(LF); - sb.append("[[de:Darmstadt]]"); - sb.append(LF); - sb.append("[[et:Darmstadt]]"); - sb.append(LF); - sb.append("[[el:Ντάρμστατ]]"); - sb.append(LF); - sb.append("[[es:Darmstadt]]"); - sb.append(LF); - sb.append("[[eo:Darmstadt]]"); - sb.append(LF); - sb.append("[[fr:Darmstadt]]"); - sb.append(LF); - sb.append("[[ko:다름슈타트]]"); - sb.append(LF); - sb.append("[[id:Darmstadt]]"); - sb.append(LF); - sb.append("[[it:Darmstadt]]"); - sb.append(LF); - sb.append("[[la:Darmstadium]]"); - sb.append(LF); - sb.append("[[hu:Darmstadt]]"); - sb.append(LF); - sb.append("[[nl:Darmstadt]]"); - sb.append(LF); - sb.append("[[ja:ダルムシュタット]]"); - sb.append(LF); - sb.append("[[no:Darmstadt]]"); - sb.append(LF); - sb.append("[[nds:Darmstadt]]"); - sb.append(LF); - sb.append("[[pl:Darmstadt]]"); - sb.append(LF); - sb.append("[[pt:Darmstadt]]"); - sb.append(LF); - sb.append("[[ro:Darmstadt]]"); - sb.append(LF); - sb.append("[[ru:Дармштадт]]"); - sb.append(LF); - sb.append("[[simple:Darmstadt]]"); - sb.append(LF); - sb.append("[[fi:Darmstadt]]"); - sb.append(LF); - sb.append("[[sv:Darmstadt]]"); - sb.append(LF); - sb.append("[[tr:Darmstadt]]"); - sb.append(LF); - sb.append("[[vo:Darmstadt]]"); - sb.append(LF); - sb.append("[[zh:达姆施塔特]]"); - sb.append(LF); + public static String getFileText() { + StringBuilder sb = new StringBuilder(); + sb.append("'''Darmstadt''' is a city in the [[States of 
Germany|Bundesland]] (federal state) of [[Hesse]]n in [[Germany]]. As of 2005, its population was 139,000. The city is located in the southern part of the [[Frankfurt Rhine Main Area|Rhine Main Metropolitan Area]]."); + sb.append(LF); + sb.append(LF); + sb.append("== History =="); + sb.append(LF); + sb.append("The name Darmstadt first appears towards the end of the [[11th century]], then ''Darmundestat''; Darmstadt was chartered as a city by the [[Holy Roman Emperor]] [[Louis IV, Holy Roman Emperor|Ludwig the Bavarian]] in 1330. The seat of the ruling [[Landgraf|landgraves]] (1567-1806) and thereafter (to 1918) to the [[Grand Duke of Hesse and by Rhine|Grand Dukes of Hesse]], the city grew in population during the [[19th century]] from little over 10,000 to 72,000 inhabitants. A polytechnical school, which later became a Technical University now known as [[Darmstadt University of Technology|TU Darmstadt]], was established in 1877. "); + sb.append(LF); + sb.append("In the beginning of the 20th Century Darmstadt was an important centre for the art movement of [[Art Nouveau|Jugendstil]], the German variant of [[Art Nouveau]]. Annual architectural competitions led to the building of many architectural treasures of this period. Also during this period, in [[1912]] the chemist [[Anton Kollisch]], working for the pharmaceutical company [[Merck]], first synthesised the chemical [[MDMA]] (ecstasy) in Darmstadt."); + sb.append(LF); + sb.append("Darmstadt's municipal area was extended in 1937 to include the neighbouring localities of Arheilgen [not Arheil''i''gen] and Eberstadt, and in 1938 the city was separated administratively from the surrounding district (''Kreis''). Its old city centre was largely destroyed in a [[Bombing of Darmstadt in World War II|British bombing raid]] of [[September 11]] [[1944]], which killed an estimated 12,300 inhabitants and rendered 66,000 homeless. 
Most of Darmstadt's 3000 [[Jew]]s were killed by the [[Nazism|Nazi]] regime between 1933 and 1945. "); + sb.append(LF); + sb.append("Darmstadt is home to many technology companies and research institutes, and has been promoting itself as a \"city of science\" since 1997. It is well known as the high-tech centre in the vicinity of [[Frankfurt International Airport|Frankfurt Airport]], with important activities in spacecraft operations, chemistry, pharmacy, information technology, biotechnology, telecommunications and mechatronics. The [[Darmstadt University of Technology|TU Darmstadt]] is one of the important technical institutes in Germany and is well known for its research and teaching in the Electrical, Mechanical and Civil Engineering disciplines."); + sb.append(LF); + sb.append(LF); + sb.append("== Institutions =="); + sb.append(LF); + sb.append("Darmstadt is the site of one of the leading German universities, the [[Darmstadt University of Technology]], renowned for its engineering departments and the [[Darmstadt University of Applied Sciences]]. Related institutes are the [[Gesellschaft für Schwerionenforschung]] (see also 'Trivia', below) and the four Institutes of the [[Fraunhofer Society]]. The European Space Operations Center ([[ESOC]]) of the [[European Space Agency]] is located in Darmstadt, as is [[EUMETSAT]], which operates [[meteorological]] [[satellite]]s. 
Darmstadt is a centre for the pharmaceutical and chemical industry, with [[Merck KGaA|Merck]], Röhm and Schenck RoTec (part of The Dürr Group) having their main plants and centres here."); + sb.append(LF); + sb.append("The [http://www.jazzinstitut.de Jazz-Institut Darmstadt] is Germany's largest publicly accessible [[Jazz]] archive."); + sb.append(LF); + sb.append("The [http://www.imd.darmstadt.de/ Internationales Musikinstitut Darmstadt], harboring one of the world's largest collections of [[post-war]] [[sheet music]], also hosts the biannual ''[[International Summer Courses for New Music|Internationale Ferienkurse für Neue Musik]]'', a summer school in [[contemporary classical music]] founded by [[Wolfgang Steinecke]]. A large number of avant-garde [[composer]]s have attended and given lectures there, including [[Olivier Messiaen]], [[Luciano Berio]], [[Milton Babbitt]], [[Pierre Boulez]], [[John Cage]], [[György Ligeti]], [[Iannis Xenakis]], [[Karlheinz Stockhausen]] and [[Mauricio Kagel]]."); + sb.append(LF); + sb.append("The [http://www.deutscheakademie.de/ Deutsche Akademie für Sprache und Dichtung] (German Academy for Language and Poetry) provides writers and scholars with a place to research the German language. 
The Academy's annual ''Georg-Büchner-Preis'', named in memory of [[Georg Büchner]], is considered the most renowned literary award for writers of German language."); + sb.append(LF); + sb.append(LF); + sb.append("== Sons and Daughters of the City =="); + sb.append(LF); + sb.append("* Justus von Liebig, Chemist "); + sb.append(LF); + sb.append("* Georg Büchner, German Poet"); + sb.append(LF); + sb.append("* [[Dr Walter Köbel]], German politician"); + sb.append(LF); + sb.append("* [[August Anton Ullrich]], German industrialist (1865-1919)"); + sb.append(LF); + sb.append("* [[Fabian Scheuermann]], World traveller"); + sb.append(LF); + sb.append("* [[Björn Phau]], Tennis player"); + sb.append(LF); + sb.append("* [[Friedrich August Kekulé von Stradonitz]], Organic Chemist"); + sb.append(LF); + sb.append(LF); + sb.append("== Military =="); + sb.append(LF); + sb.append("There are still [[U.S. Army]] personnel stationed in the Darmstadt area. Just outside the Darmstadt centre is the U.S. Army Garrison Darmstadt on Cambrai-Fritsch Kaserne. The barracks was originally built in the 1930s as two separate German Army barracks (Cambrai Kaserne and Freiherr von Fritsch Kaserne). "); + sb.append(LF); + sb.append("It is possible to listen to the military entertainment radio for the American troops in the region. The station is called [[American Forces Network|AFN Europe]] and broadcasts from Frankfurt on FM 98.7 or AM 873."); + sb.append(LF); + sb.append("The base has already started deactivation and will be closed around 2008-2010, at that time AFN Europe will be moved to Mannheim."); + sb.append(LF); + sb.append(LF); + sb.append("== Trivia =="); + sb.append(LF); + sb.append("Literally translated, the German name \"Darmstadt\" means \"City of the intestine\". But that is just a coincidence, as the name derives from the medieval name \"darmundestat\". 
The Darm(bach) is a small creek running through the city."); + sb.append(LF); + sb.append("The [[chemical element]] [[Darmstadtium]] ([[atomic number]]: 110), first discovered at the [[Gesellschaft für Schwerionenforschung]] was named after the city in 2003, making Darmstadt only the sixth city with an element named after it (the other five cities are [[Ytterby]] in [[Sweden]] (four elements); [[Strontian]] in [[Scotland]]; [[Copenhagen]] in [[Denmark]] (whose latin name gives [[Hafnium]]); [[Berkeley, California]]; and [[Dubna]] in [[Russia]]). [[Meitnerium]] ([[atomic number]]: 109) (1982), [[Hassium]] ([[atomic number]]: 108) (1984) and [[Roentgenium]] ([[atomic number]]: 111) (1994) and [[Ununbium]] ([[atomic number]]: 112) (1996) were also synthesized in this facility."); + sb.append(LF); + sb.append("Darmstadt also happens to be one of the small number of cities worldwide which do not lie close to a river or coast."); + sb.append(LF); + sb.append("Darmstadt is the home of [[Software AG]], a software company."); + sb.append(LF); + sb.append("Frankenstein Castle, ''[http://de.wikipedia.org/wiki/Burg_Frankenstein_%28Bergstrasse%29 Burg Frankenstein]'' (in German), possibly Mary Shelley's inspiration for the title of her famous 1818 novel ''[[Frankenstein | Frankenstein; or, The Modern Prometheus]]'', is located nearby."); + sb.append(LF); + sb.append(LF); + sb.append("== Twinning =="); + sb.append(LF); + sb.append("Darmstadt is [[twinned]] with:"); + sb.append(LF); + sb.append(LF); + sb.append("*{{flagicon|Netherlands}}[[Alkmaar]], [[Netherlands]]"); + sb.append(LF); + sb.append("*{{flagicon|Italy}}[[Brescia]], [[Italy]]"); + sb.append(LF); + sb.append("*{{flagicon|Turkey}}[[Bursa, Turkey|Bursa]], [[Turkey]]"); + sb.append(LF); + sb.append("*{{flagicon|United Kingdom}}[[Chesterfield]], [[United Kingdom|UK]]"); + sb.append(LF); + sb.append("*{{flagicon|Austria}}[[Graz]], [[Austria]]"); + sb.append(LF); + sb.append("*{{flagicon|Latvia}}[[Liepaja]], [[Latvia]]"); + 
sb.append(LF); + sb.append("*{{flagicon|Spain}}[[Logroño]], [[Spain]]"); + sb.append(LF); + sb.append("*{{flagicon|Poland}}[[Płock]], [[Poland]]"); + sb.append(LF); + sb.append("*{{flagicon|Hungary}}[[Szeged]], [[Hungary]]"); + sb.append(LF); + sb.append("*{{flagicon|Norway}}[[Trondheim]], [[Norway]]"); + sb.append(LF); + sb.append("*{{flagicon|France}}[[Troyes]], [[France]]"); + sb.append(LF); + sb.append("*{{flagicon|Ukraine}}[[Uzhhorod]], [[Ukraine]]"); + sb.append(LF); + sb.append("*{{flagicon|Switzerland}}[[Saanen]], [[Switzerland]]"); + sb.append(LF); + sb.append(LF); + sb.append("==External links=="); + sb.append(LF); + sb.append("{{commonscat|Darmstadt, Germany}}"); + sb.append(LF); + sb.append("*[http://www.darmstadt.de/ Official site of the city of Darmstadt] (German, English)"); + sb.append(LF); + sb.append("*[[wikitravel:Darmstadt|Darmstadt]] on [[wikitravel:Main Page|Wikitravel]]"); + sb.append(LF); + sb.append("*[http://www.mathildenhoehe.info Mathildenhoehe]"); + sb.append(LF); + sb.append("*[http://public-transport.net/bim/Darmstadt.htm Details of Trams and Buses used in Darmstadt]"); + sb.append(LF); + sb.append("*[http://www.rmv.de/ Public Transport in Darmstadt - Maps, Timetables, Fares]"); + sb.append(LF); + sb.append("*[http://sites-of-memory.de/main/location.html#darmstadt War memorials in Darmstadt]"); + sb.append(LF); + sb.append("*[http://www.darmstadt.army.mil/ Webpage of the U.S. 
army in Darmstadt]"); + sb.append(LF); + sb.append(LF); + sb.append("===Notable institutions==="); + sb.append(LF); + sb.append("* [http://www.tu-darmstadt.de/index.en.html Darmstadt University of Technology]"); + sb.append(LF); + sb.append("* [http://www.hochschule-darmstadt.de/engl/engl.htm University of Applied Sciences Darmstadt]"); + sb.append(LF); + sb.append("* [http://www.igd.fraunhofer.de/ Fraunhofer Institute for Computer Graphics]"); + sb.append(LF); + sb.append("* [http://www.sit.fraunhofer.de/ Fraunhofer Institute for Secure Information Technology]"); + sb.append(LF); + sb.append("* [http://www.ipsi.fraunhofer.de/ Fraunhofer Institute for Integrated Publication and Information Systems]"); + sb.append(LF); + sb.append("* [http://www.lbf.fhg.de/ Fraunhofer Institute for Structural Durability]"); + sb.append(LF); + sb.append("* [http://www.deutscheakademie.de/ Deutsche Akademie für Sprache und Dichtung]"); + sb.append(LF); + sb.append("* [http://www.gsi.de/ Gesellschaft für Schwerionenforschung]"); + sb.append(LF); + sb.append("* [http://www.esa.int/SPECIALS/ESOC/ European Space Operations Centre] (ESOC)"); + sb.append(LF); + sb.append("* [http://www.eumetsat.int/ European Organisation for the Exploitation of Meteorological Satellites (EUMETSAT)]"); + sb.append(LF); + sb.append(LF); + sb.append("[[Category:Cities in Hesse]]"); + sb.append(LF); + sb.append("[[Category:Merck]]"); + sb.append(LF); + sb.append("[[ar:دارمشتادت]]"); + sb.append(LF); + sb.append("[[an:Darmstadt]]"); + sb.append(LF); + sb.append("[[bg:Дармщат]]"); + sb.append(LF); + sb.append("[[ca:Darmstadt]]"); + sb.append(LF); + sb.append("[[cs:Darmstadt]]"); + sb.append(LF); + sb.append("[[da:Darmstadt]]"); + sb.append(LF); + sb.append("[[de:Darmstadt]]"); + sb.append(LF); + sb.append("[[et:Darmstadt]]"); + sb.append(LF); + sb.append("[[el:Ντάρμστατ]]"); + sb.append(LF); + sb.append("[[es:Darmstadt]]"); + sb.append(LF); + sb.append("[[eo:Darmstadt]]"); + sb.append(LF); + 
sb.append("[[fr:Darmstadt]]"); + sb.append(LF); + sb.append("[[ko:다름슈타트]]"); + sb.append(LF); + sb.append("[[id:Darmstadt]]"); + sb.append(LF); + sb.append("[[it:Darmstadt]]"); + sb.append(LF); + sb.append("[[la:Darmstadium]]"); + sb.append(LF); + sb.append("[[hu:Darmstadt]]"); + sb.append(LF); + sb.append("[[nl:Darmstadt]]"); + sb.append(LF); + sb.append("[[ja:ダルムシュタット]]"); + sb.append(LF); + sb.append("[[no:Darmstadt]]"); + sb.append(LF); + sb.append("[[nds:Darmstadt]]"); + sb.append(LF); + sb.append("[[pl:Darmstadt]]"); + sb.append(LF); + sb.append("[[pt:Darmstadt]]"); + sb.append(LF); + sb.append("[[ro:Darmstadt]]"); + sb.append(LF); + sb.append("[[ru:Дармштадт]]"); + sb.append(LF); + sb.append("[[simple:Darmstadt]]"); + sb.append(LF); + sb.append("[[fi:Darmstadt]]"); + sb.append(LF); + sb.append("[[sv:Darmstadt]]"); + sb.append(LF); + sb.append("[[tr:Darmstadt]]"); + sb.append(LF); + sb.append("[[vo:Darmstadt]]"); + sb.append(LF); + sb.append("[[zh:达姆施塔特]]"); + sb.append(LF); - return sb.toString(); - } + return sb.toString(); + } } diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/revisions/RevisionUtils.java b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/revisions/RevisionUtils.java index 4fe42bae..9e0b5162 100644 --- a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/revisions/RevisionUtils.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/revisions/RevisionUtils.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
- * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -34,96 +34,94 @@ * Provides several revision-related utilities that should not be part of the RevisionMachine * package because of dependencies to the JWPL API. RevisionMachine should stay independent * from the RevisionMachine - * - * */ public class RevisionUtils { - RevisionApi revApi; - Wikipedia wiki; - - public RevisionUtils(DatabaseConfiguration conf) throws WikiApiException{ - wiki = new Wikipedia(conf); - revApi = new RevisionApi(conf); - } - - public RevisionUtils(Wikipedia wiki, RevisionApi revApi) throws WikiApiException{ - this.revApi=revApi; - this.wiki=wiki; - } - - /** - * For a given article revision, the method returns the revision of the article discussion - * page which was current at the time the revision was created. 
- * - * @param revisionId revision of the article for which the talk page revision should be retrieved - * @return the revision of the talk page that was current at the creation time of the given article revision - * @throws WikiApiException if any error occurred accessing the Wiki db - * @throws WikiPageNotFoundException if no discussion page was available at the time of the given article revision - */ - public Revision getDiscussionRevisionForArticleRevision(int revisionId) throws WikiApiException, WikiPageNotFoundException{ - //get article revision - Revision rev = revApi.getRevision(revisionId); - Timestamp revTime = rev.getTimeStamp(); - - //get corresponding discussion page - Page discussion = wiki.getDiscussionPage(rev.getArticleID()); - - /* - * find correct revision of discussion page - */ - List discussionTs = revApi.getRevisionTimestamps(discussion.getPageId()); - - // sort in reverse order - newest first - discussionTs.sort(Comparator.reverseOrder()); - - //find first timestamp equal to or before the article revision timestamp - for(Timestamp curDiscTime:discussionTs){ - if(curDiscTime==revTime||curDiscTime.before(revTime)){ - return revApi.getRevision(discussion.getPageId(), curDiscTime); - } - } - - throw new WikiPageNotFoundException("Not discussion page was available at the time of the given article revision"); - } - - - /** - * For a given article revision, the method returns the revisions of the archived article discussion - * pages which were available at the time of the article revision - * - * @param revisionId revision of the article for which the talk page archive revisions should be retrieved - * @return the revisions of the talk page archives that were available at the time of the article revision - */ - public List getDiscussionArchiveRevisionsForArticleRevision(int revisionId) throws WikiApiException, WikiPageNotFoundException{ - List result = new LinkedList<>(); - - //get article revision - Revision rev = revApi.getRevision(revisionId); 
- Timestamp revTime = rev.getTimeStamp(); - - //get corresponding discussion archives - Iterable discArchives = wiki.getDiscussionArchives(rev.getArticleID()); - - /* - * for each discussion archive, find correct revision of discussion page - */ - for(Page discArchive:discArchives){ - //get revision timestamps for the current discussion archive - List discussionTs = revApi.getRevisionTimestamps(discArchive.getPageId()); - - // sort in reverse order - newest first - discussionTs.sort(Comparator.reverseOrder()); - - //find first timestamp equal to or before the article revision timestamp - for(Timestamp curDiscTime:discussionTs){ - if(curDiscTime==revTime||curDiscTime.before(revTime)){ - result.add(revApi.getRevision(discArchive.getPageId(), curDiscTime)); - break; - } - } - } - - return result; - } + private RevisionApi revApi; + private Wikipedia wiki; + + public RevisionUtils(DatabaseConfiguration conf) throws WikiApiException { + wiki = new Wikipedia(conf); + revApi = new RevisionApi(conf); + } + + public RevisionUtils(Wikipedia wiki, RevisionApi revApi) throws WikiApiException { + this.revApi = revApi; + this.wiki = wiki; + } + + /** + * For a given article revision, the method returns the revision of the article discussion + * page which was current at the time the revision was created. 
+ * + * @param revisionId revision of the article for which the talk page revision should be retrieved + * @return the revision of the talk page that was current at the creation time of the given article revision + * @throws WikiApiException if any error occurred accessing the Wiki db + * @throws WikiPageNotFoundException if no discussion page was available at the time of the given article revision + */ + public Revision getDiscussionRevisionForArticleRevision(int revisionId) throws WikiApiException, WikiPageNotFoundException { + //get article revision + Revision rev = revApi.getRevision(revisionId); + Timestamp revTime = rev.getTimeStamp(); + + //get corresponding discussion page + Page discussion = wiki.getDiscussionPage(rev.getArticleID()); + + /* + * find correct revision of discussion page + */ + List discussionTs = revApi.getRevisionTimestamps(discussion.getPageId()); + + // sort in reverse order - newest first + discussionTs.sort(Comparator.reverseOrder()); + + //find first timestamp equal to or before the article revision timestamp + for (Timestamp curDiscTime : discussionTs) { + if (curDiscTime == revTime || curDiscTime.before(revTime)) { + return revApi.getRevision(discussion.getPageId(), curDiscTime); + } + } + + throw new WikiPageNotFoundException("Not discussion page was available at the time of the given article revision"); + } + + + /** + * For a given article revision, the method returns the revisions of the archived article discussion + * pages which were available at the time of the article revision + * + * @param revisionId revision of the article for which the talk page archive revisions should be retrieved + * @return the revisions of the talk page archives that were available at the time of the article revision + */ + public List getDiscussionArchiveRevisionsForArticleRevision(int revisionId) throws WikiApiException, WikiPageNotFoundException { + List result = new LinkedList<>(); + + //get article revision + Revision rev = 
revApi.getRevision(revisionId); + Timestamp revTime = rev.getTimeStamp(); + + //get corresponding discussion archives + Iterable discArchives = wiki.getDiscussionArchives(rev.getArticleID()); + + /* + * for each discussion archive, find correct revision of discussion page + */ + for (Page discArchive : discArchives) { + //get revision timestamps for the current discussion archive + List discussionTs = revApi.getRevisionTimestamps(discArchive.getPageId()); + + // sort in reverse order - newest first + discussionTs.sort(Comparator.reverseOrder()); + + //find first timestamp equal to or before the article revision timestamp + for (Timestamp curDiscTime : discussionTs) { + if (curDiscTime == revTime || curDiscTime.before(revTime)) { + result.add(revApi.getRevision(discArchive.getPageId(), curDiscTime)); + break; + } + } + } + + return result; + } } diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/RevisionPair.java b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/RevisionPair.java index 4cd3ecdb..6d7d6fae 100644 --- a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/RevisionPair.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/RevisionPair.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -29,158 +29,152 @@ /** * Represents a pair of (adjacent) revisions. In the second pair part (=after) a * template has been added or removed (depending on the mode). - * - * */ -public class RevisionPair implements Serializable{ - - private static final long serialVersionUID = -428550315195347191L; - - private final Revision before; - private final Revision after; - private final String template; - private final RevisionPairType revPairType; - - public RevisionPair(Revision before, Revision after, String template, - RevisionPairType revPairType) { - this.before = before; - this.after = after; - this.template = template; - this.revPairType = revPairType; - } - - /** - * @return revision before the template change - */ - public Revision getBeforeRevision() { - return before; - } - - /** - * @return revision after the template change - */ - public Revision getAfterRevision() { - return after; - } - - /** - * @return the template that has been added or removed - */ - public String getTemplate() { - return template; - } - - /** - * @return the type of template change - */ - public RevisionPairType getType() { - return revPairType; - } - - /** - * Returns the text "around the given template" and returns the corresponding - * text in the other pair part of the RevisionPair. - *

- * Currently, this is done section-based. On TextPairPart contains a section - * with a template and the other contains the corresponding section - * after the template has been deleted (in deleteTemplate mode) or before - * it has been added (in addTemplate mode). - *

- * Note that this only makes sense for inline- or section-templates. - *

- * The section-matching is currently done simply by matching section titles. - * If the title has changed, no match will be found. - * - * @param markTemplates sets whether to add an inline marker for the template - * - * @return a pair of strings corresponding to the before-revision and - * after-revision - */ - public List getInlineTextPairs(boolean markTemplates) { - List pairList = new ArrayList<>(); - - try { - //extract sections - List beforeSections; - List afterSections; - if(markTemplates){ - //add inline marker for the template - beforeSections = ParseUtils.getSections(before.getRevisionText(), before.getRevisionID() + "",before.getRevisionID(), Arrays.asList(new String[]{template})); - afterSections = ParseUtils.getSections(after.getRevisionText(), after.getRevisionID() + "", after.getRevisionID(), Arrays.asList(new String[]{template})); - }else{ - //no inline markers - beforeSections = ParseUtils.getSections(before.getRevisionText(), before.getRevisionID() + "",before.getRevisionID()); - afterSections = ParseUtils.getSections(after.getRevisionText(), after.getRevisionID() + "", after.getRevisionID()); - } - for (ExtractedSection tplSect : revPairType == RevisionPairType.deleteTemplate ? beforeSections : afterSections) { - // in DELETE-mode, the "before" revision contain the templates - // in ADD-mode, the "after" revision contains the templates - if (containsIgnoreCase(tplSect.getTemplates(), template)) { - // the current sect contains the template we're looking for - // now find the corresponding tpl in the other revisions - for (ExtractedSection nonTplSect : revPairType == RevisionPairType.deleteTemplate ? afterSections: beforeSections) { - // TODO how do we match the sections? 
- // currently only by title - we could do fuzzy matching - // of the section body - if (tplSect.getTitle()!=null&&nonTplSect.getTitle()!=null&&tplSect.getTitle().equalsIgnoreCase(nonTplSect.getTitle())) { - if (revPairType == RevisionPairType.deleteTemplate) { - pairList.add(new TextPair(tplSect.getBody(), nonTplSect.getBody())); - } else { - pairList.add(new TextPair(nonTplSect.getBody(), tplSect.getBody())); - } - } - } - } - } - } catch (Exception ex) { - //This happends if a (SWEBLE-)compiler exception occurs.S - //Sometimes, malformed xml items seem to cause class cast exceptions - //in the parser, which is not wrapped in a Compiler exception. - //Therefore, we should catch all exceptions here and return the - //TextPairs identified so far (if any) - System.err.println(ex.getMessage()); - //TODO use logger!! - } - return pairList; - } - - /** - * Checks if a list of string contains a String while ignoring case - * - * @param stringlist a list of string - * @param match the string to look for - * @return true, if the list contains the string, false else - */ - private boolean containsIgnoreCase(List stringlist, String match) { - for (String s : stringlist) { - if (s.equalsIgnoreCase(match)) { - return true; - } - } - return false; - } - - public enum RevisionPairType { - deleteTemplate, addTemplate - } - - - @Override - public boolean equals(Object anObject) { - if(!(anObject instanceof RevisionPair)){ - return false; - }else{ - RevisionPair otherPair = (RevisionPair)anObject; - if (this.getBeforeRevision().getRevisionID() == - otherPair.getBeforeRevision().getRevisionID() - && this.getAfterRevision().getRevisionID() == - otherPair.getAfterRevision().getRevisionID() - && this.getTemplate().equals(otherPair.getTemplate())&& - this.getType()==otherPair.getType()) { - return true; - }else{ - return false; - } - } +public class RevisionPair implements Serializable { + + private static final long serialVersionUID = -1958556838478438963L; + private final Revision 
before; + private final Revision after; + private final String template; + private final RevisionPairType revPairType; + + public RevisionPair(Revision before, Revision after, String template, + RevisionPairType revPairType) { + this.before = before; + this.after = after; + this.template = template; + this.revPairType = revPairType; + } + + /** + * @return revision before the template change + */ + public Revision getBeforeRevision() { + return before; + } + + /** + * @return revision after the template change + */ + public Revision getAfterRevision() { + return after; + } + + /** + * @return the template that has been added or removed + */ + public String getTemplate() { + return template; + } + + /** + * @return the type of template change + */ + public RevisionPairType getType() { + return revPairType; + } + + /** + * Returns the text "around the given template" and returns the corresponding + * text in the other pair part of the RevisionPair. + *

+ * Currently, this is done section-based. On TextPairPart contains a section + * with a template and the other contains the corresponding section + * after the template has been deleted (in deleteTemplate mode) or before + * it has been added (in addTemplate mode). + *

+ * Note that this only makes sense for inline- or section-templates. + *

+ * The section-matching is currently done simply by matching section titles. + * If the title has changed, no match will be found. + * + * @param markTemplates sets whether to add an inline marker for the template + * @return a pair of strings corresponding to the before-revision and + * after-revision + */ + public List getInlineTextPairs(boolean markTemplates) { + List pairList = new ArrayList<>(); + + try { + //extract sections + List beforeSections; + List afterSections; + if (markTemplates) { + //add inline marker for the template + beforeSections = ParseUtils.getSections(before.getRevisionText(), before.getRevisionID() + "", before.getRevisionID(), Arrays.asList(new String[]{template})); + afterSections = ParseUtils.getSections(after.getRevisionText(), after.getRevisionID() + "", after.getRevisionID(), Arrays.asList(new String[]{template})); + } else { + //no inline markers + beforeSections = ParseUtils.getSections(before.getRevisionText(), before.getRevisionID() + "", before.getRevisionID()); + afterSections = ParseUtils.getSections(after.getRevisionText(), after.getRevisionID() + "", after.getRevisionID()); + } + for (ExtractedSection tplSect : revPairType == RevisionPairType.deleteTemplate ? beforeSections : afterSections) { + // in DELETE-mode, the "before" revision contain the templates + // in ADD-mode, the "after" revision contains the templates + if (containsIgnoreCase(tplSect.getTemplates(), template)) { + // the current sect contains the template we're looking for + // now find the corresponding tpl in the other revisions + for (ExtractedSection nonTplSect : revPairType == RevisionPairType.deleteTemplate ? afterSections : beforeSections) { + // TODO how do we match the sections? 
+ // currently only by title - we could do fuzzy matching + // of the section body + if (tplSect.getTitle() != null && nonTplSect.getTitle() != null && tplSect.getTitle().equalsIgnoreCase(nonTplSect.getTitle())) { + if (revPairType == RevisionPairType.deleteTemplate) { + pairList.add(new TextPair(tplSect.getBody(), nonTplSect.getBody())); + } else { + pairList.add(new TextPair(nonTplSect.getBody(), tplSect.getBody())); + } + } + } + } + } + } catch (Exception ex) { + //This happends if a (SWEBLE-)compiler exception occurs.S + //Sometimes, malformed xml items seem to cause class cast exceptions + //in the parser, which is not wrapped in a Compiler exception. + //Therefore, we should catch all exceptions here and return the + //TextPairs identified so far (if any) + System.err.println(ex.getMessage()); + //TODO use logger!! + } + return pairList; + } + + /** + * Checks if a list of string contains a String while ignoring case + * + * @param stringlist a list of string + * @param match the string to look for + * @return true, if the list contains the string, false else + */ + private boolean containsIgnoreCase(List stringlist, String match) { + for (String s : stringlist) { + if (s.equalsIgnoreCase(match)) { + return true; + } + } + return false; + } + + public enum RevisionPairType { + deleteTemplate, addTemplate + } + + + @Override + public boolean equals(Object anObject) { + if (!(anObject instanceof RevisionPair)) { + return false; + } else { + RevisionPair otherPair = (RevisionPair) anObject; + if (this.getBeforeRevision().getRevisionID() == otherPair.getBeforeRevision().getRevisionID() && + this.getAfterRevision().getRevisionID() == otherPair.getAfterRevision().getRevisionID() && + this.getTemplate().equals(otherPair.getTemplate()) && + this.getType() == otherPair.getType()) { + return true; + } else { + return false; + } } + } } diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/TextPair.java 
b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/TextPair.java index 8b98a544..ea162c76 100644 --- a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/TextPair.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/TextPair.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -35,277 +35,269 @@ /** * Represents a pair of Strings. Usually corresponding to a RevisionPair. 
- * - * */ public class TextPair { - private String beforeText; - private String afterText; - /** - * Holds arbitrary String-MetaData - */ - private Map metaData; - - public TextPair(String before, String after) { - this.beforeText = normalize(before); - this.afterText = normalize(after); - setMetaData(new HashMap<>()); - } - - public String getBeforeText() { - return beforeText; - } - - public List getBeforeLines() { - return sentenceSplit(beforeText); - } - - public void setBeforeText(String beforeText) { - this.beforeText = normalize(beforeText); - } - - public String getAfterText() { - return afterText; - } - - public List getAfterLines() { - return sentenceSplit(afterText); - } - - public void setAfterText(String afterText) { - this.afterText = normalize(afterText); - } - - public Map getMetaData() - { - return metaData; - } - - public void setMetaData(Map metaData) - { - this.metaData = metaData; - } - - public void addMetaData(String key, String value) - { - metaData.put(key, value); - } - - public String getMetaDataValue(String key) - { - return metaData.get(key); - } - - - - - /** - * Returns the patch object that contains all diffs between - * the beforeText and the afterText - * - * @return Patch object with all diffs - */ - public Patch getPatch() { - return DiffUtils.diff(sentenceSplit(beforeText), sentenceSplit(afterText)); - } - - public List getDiffRows(boolean markChangesInline){ - DiffRowGenerator generator = new DiffRowGenerator.Builder() - .showInlineDiffs(markChangesInline) - .columnWidth(Integer.MAX_VALUE) // do not wrap - .build(); - - return generator.generateDiffRows(sentenceSplit(beforeText),sentenceSplit(afterText)); - } - - public String getInlineDiffString() { - StringBuilder diffString = new StringBuilder(); - for(DiffRow row:getDiffRows(true)){ - diffString.append(row.toString()); - diffString.append(System.getProperty("line.separator")); - } - return diffString.toString(); - } - - - /** - * Returns the deltas between beforeText and 
afterText as a line separated String - * using delta.toString() - * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() - * - * @return diffs as line-separated String using delta.toString() - */ - public String getSimpleDiffString() { - StringBuilder deltas = new StringBuilder(); - for(Delta delta:getPatch().getDeltas()){ - deltas.append(delta.toString()); - deltas.append(System.getProperty("line.separator")); - } - return deltas.toString(); - } - - /** - * Returns the deltas between beforeText and afterText as a line separated String - * using delta.toString() - * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() - * - * @param difftype defines the type of diffs to include in the String - * @return diffs as line-separated String using delta.toString() - */ - public String getSimpleDiffString(TYPE difftype) { - StringBuilder deltas = new StringBuilder(); - for(Delta delta:getPatch().getDeltas()){ - if(delta.getType()==difftype){ - deltas.append(delta); - deltas.append(System.getProperty("line.separator")); - } - } - return deltas.toString(); - } - - /** - * Returns the deltas between beforeText and afterText as a line separated String. 
- * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() - * - * @return diffs as line-separated String - */ - public String getLongDiffString() { - StringBuilder deltas = new StringBuilder(); - for(Delta delta:getPatch().getDeltas()){ - deltas.append("DeltaType: "+delta.getType().toString()); - deltas.append(System.getProperty("line.separator")); - deltas.append("Original (Non-Neutral):"); - deltas.append(System.getProperty("line.separator")); - deltas.append(delta.getOriginal()); - deltas.append(System.getProperty("line.separator")); - deltas.append(System.getProperty("line.separator")); - deltas.append("Revised (Neutral):"); - deltas.append(System.getProperty("line.separator")); - deltas.append(delta.getRevised()); - deltas.append(System.getProperty("line.separator")); - } - return deltas.toString(); - } - - /** - * Returns the deltas between beforeText and afterText as a line separated String. - * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() - * - * @param diffType defines the type of diffs to include in the String - * @return diffs as line-separated String - */ - public String getLongDiffString(TYPE diffType) { - StringBuilder deltas = new StringBuilder(); - for(Delta delta:getPatch().getDeltas()){ - if(delta.getType()==diffType){ - deltas.append("Original (Non-Neutral):"); - deltas.append(System.getProperty("line.separator")); - deltas.append(delta.getOriginal()); - deltas.append(System.getProperty("line.separator")); - deltas.append(System.getProperty("line.separator")); - deltas.append("Revised (Neutral):"); - deltas.append(System.getProperty("line.separator")); - deltas.append(delta.getRevised()); - deltas.append(System.getProperty("line.separator")); - deltas.append("*********************************************"); - deltas.append(System.getProperty("line.separator")); - } - } - return deltas.toString(); - } - - - /** - * Returns the unified diff between "Before" and "After" - * containing one sentence per String. 
- * contextSize defines a window of lines/sentences around each change - * to display - * - * @param contextSize numer of lines/sentences around a change to display - * @return diffs as line-separated String - */ - public List getUnifiedDiffStrings(int contextSize) { - return DiffUtils.generateUnifiedDiff("Before", "After", sentenceSplit(beforeText), getPatch(), contextSize); - } - - /** - * Returns the unified diff between "Before" and "After" as a single - * line-separated String - * - * @param contextSize numer of characters around a change to display - * @return diffs as line-separated String - */ - public String getUnifiedDiffString(int contextSize) { - return listToString(getUnifiedDiffStrings(contextSize)); - } - - - /** - * Splits a String into sentences using the BreakIterator with - * US locale - * - * @param str a String with (multiple) sentences - * @return a list of Strings - one sentences per String - */ - private List sentenceSplit(String str) { - BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); - iterator.setText(str); - int start = iterator.first(); - List sentences = new ArrayList<>(); - for (int end = iterator.next(); - end != BreakIterator.DONE; - start = end, end = iterator.next()) { - sentences.add(str.substring(start, end).trim()); - } - return sentences; - } - - /** - * Concatenates a list of Strings to one line-separated String - * - * @param stringList a list of Strings - * @return a single line-separated String containing all Strings from the list - */ - private String listToString(List stringList){ - StringBuilder concat = new StringBuilder(); - for(String str:stringList){ - concat.append(str); - concat.append(System.getProperty("line.separator")); - } - return concat.toString(); - } - - /** - * Normalizes the Strings in the TextPair. - * This mainly deals with whitespace-issues. - * Other normalizations can be included. 
- * - * @param str - * @return - */ - private String normalize(String str){ - str = StringUtils.trimToEmpty(str); - str = StringUtils.normalizeSpace(str); - - // remove whitespace before punctuation. not using \p{Punct}, - // because it includes to many special characters. - str = str.replaceAll("\\s+(?=[.!,\\?;:])", ""); - - return str; - } - - @Override - public boolean equals(Object anObject) { - if(!(anObject instanceof TextPair)){ - return false; - }else{ - TextPair otherPair = (TextPair)anObject; - if (this.getBeforeText().equals(otherPair.getBeforeText())&&this.getAfterText().equals(otherPair.getAfterText())) { - return true; - }else{ - return false; - } - } + private String beforeText; + private String afterText; + /** + * Holds arbitrary String-MetaData + */ + private Map metaData; + + public TextPair(String before, String after) { + this.beforeText = normalize(before); + this.afterText = normalize(after); + setMetaData(new HashMap<>()); + } + + public String getBeforeText() { + return beforeText; + } + + public List getBeforeLines() { + return sentenceSplit(beforeText); + } + + public void setBeforeText(String beforeText) { + this.beforeText = normalize(beforeText); + } + + public String getAfterText() { + return afterText; + } + + public List getAfterLines() { + return sentenceSplit(afterText); + } + + public void setAfterText(String afterText) { + this.afterText = normalize(afterText); + } + + public Map getMetaData() { + return metaData; + } + + public void setMetaData(Map metaData) { + this.metaData = metaData; + } + + public void addMetaData(String key, String value) { + metaData.put(key, value); + } + + public String getMetaDataValue(String key) { + return metaData.get(key); + } + + + /** + * Returns the patch object that contains all diffs between + * the beforeText and the afterText + * + * @return Patch object with all diffs + */ + public Patch getPatch() { + return DiffUtils.diff(sentenceSplit(beforeText), sentenceSplit(afterText)); + } + + 
public List getDiffRows(boolean markChangesInline) { + DiffRowGenerator generator = new DiffRowGenerator.Builder() + .showInlineDiffs(markChangesInline) + .columnWidth(Integer.MAX_VALUE) // do not wrap + .build(); + + return generator.generateDiffRows(sentenceSplit(beforeText), sentenceSplit(afterText)); + } + + public String getInlineDiffString() { + StringBuilder diffString = new StringBuilder(); + for (DiffRow row : getDiffRows(true)) { + diffString.append(row.toString()); + diffString.append(System.getProperty("line.separator")); + } + return diffString.toString(); + } + + + /** + * Returns the deltas between beforeText and afterText as a line separated String + * using delta.toString() + * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() + * + * @return diffs as line-separated String using delta.toString() + */ + public String getSimpleDiffString() { + StringBuilder deltas = new StringBuilder(); + for (Delta delta : getPatch().getDeltas()) { + deltas.append(delta.toString()); + deltas.append(System.getProperty("line.separator")); + } + return deltas.toString(); + } + + /** + * Returns the deltas between beforeText and afterText as a line separated String + * using delta.toString() + * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() + * + * @param difftype defines the type of diffs to include in the String + * @return diffs as line-separated String using delta.toString() + */ + public String getSimpleDiffString(TYPE difftype) { + StringBuilder deltas = new StringBuilder(); + for (Delta delta : getPatch().getDeltas()) { + if (delta.getType() == difftype) { + deltas.append(delta); + deltas.append(System.getProperty("line.separator")); + } + } + return deltas.toString(); + } + + /** + * Returns the deltas between beforeText and afterText as a line separated String. 
+ * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() + * + * @return diffs as line-separated String + */ + public String getLongDiffString() { + StringBuilder deltas = new StringBuilder(); + for (Delta delta : getPatch().getDeltas()) { + deltas.append("DeltaType: " + delta.getType().toString()); + deltas.append(System.getProperty("line.separator")); + deltas.append("Original (Non-Neutral):"); + deltas.append(System.getProperty("line.separator")); + deltas.append(delta.getOriginal()); + deltas.append(System.getProperty("line.separator")); + deltas.append(System.getProperty("line.separator")); + deltas.append("Revised (Neutral):"); + deltas.append(System.getProperty("line.separator")); + deltas.append(delta.getRevised()); + deltas.append(System.getProperty("line.separator")); + } + return deltas.toString(); + } + + /** + * Returns the deltas between beforeText and afterText as a line separated String. + * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() + * + * @param diffType defines the type of diffs to include in the String + * @return diffs as line-separated String + */ + public String getLongDiffString(TYPE diffType) { + StringBuilder deltas = new StringBuilder(); + for (Delta delta : getPatch().getDeltas()) { + if (delta.getType() == diffType) { + deltas.append("Original (Non-Neutral):"); + deltas.append(System.getProperty("line.separator")); + deltas.append(delta.getOriginal()); + deltas.append(System.getProperty("line.separator")); + deltas.append(System.getProperty("line.separator")); + deltas.append("Revised (Neutral):"); + deltas.append(System.getProperty("line.separator")); + deltas.append(delta.getRevised()); + deltas.append(System.getProperty("line.separator")); + deltas.append("*********************************************"); + deltas.append(System.getProperty("line.separator")); + } + } + return deltas.toString(); + } + + + /** + * Returns the unified diff between "Before" and "After" + * containing one sentence per 
String. + * contextSize defines a window of lines/sentences around each change + * to display + * + * @param contextSize numer of lines/sentences around a change to display + * @return diffs as line-separated String + */ + public List getUnifiedDiffStrings(int contextSize) { + return DiffUtils.generateUnifiedDiff("Before", "After", sentenceSplit(beforeText), getPatch(), contextSize); + } + + /** + * Returns the unified diff between "Before" and "After" as a single + * line-separated String + * + * @param contextSize numer of characters around a change to display + * @return diffs as line-separated String + */ + public String getUnifiedDiffString(int contextSize) { + return listToString(getUnifiedDiffStrings(contextSize)); + } + + + /** + * Splits a String into sentences using the BreakIterator with + * US locale + * + * @param str a String with (multiple) sentences + * @return a list of Strings - one sentences per String + */ + private List sentenceSplit(String str) { + BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); + iterator.setText(str); + int start = iterator.first(); + List sentences = new ArrayList<>(); + for (int end = iterator.next(); + end != BreakIterator.DONE; + start = end, end = iterator.next()) { + sentences.add(str.substring(start, end).trim()); + } + return sentences; + } + + /** + * Concatenates a list of Strings to one line-separated String + * + * @param stringList a list of Strings + * @return a single line-separated String containing all Strings from the list + */ + private String listToString(List stringList) { + StringBuilder concat = new StringBuilder(); + for (String str : stringList) { + concat.append(str); + concat.append(System.getProperty("line.separator")); + } + return concat.toString(); + } + + /** + * Normalizes the Strings in the TextPair. + * This mainly deals with whitespace-issues. + * Other normalizations can be included. 
+ * + * @param str + * @return + */ + private String normalize(String str) { + str = StringUtils.trimToEmpty(str); + str = StringUtils.normalizeSpace(str); + + // remove whitespace before punctuation. not using \p{Punct}, + // because it includes to many special characters. + str = str.replaceAll("\\s+(?=[.!,\\?;:])", ""); + + return str; + } + + @Override + public boolean equals(Object anObject) { + if (!(anObject instanceof TextPair)) { + return false; + } else { + TextPair otherPair = (TextPair) anObject; + if (this.getBeforeText().equals(otherPair.getBeforeText()) && this.getAfterText().equals(otherPair.getAfterText())) { + return true; + } else { + return false; + } } + } } \ No newline at end of file diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/WikipediaTemplateInfo.java b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/WikipediaTemplateInfo.java index 8f11ca36..e48e2ea9 100644 --- a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/WikipediaTemplateInfo.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/WikipediaTemplateInfo.java @@ -2,13 +2,13 @@ * Licensed to the Technische Universität Darmstadt under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt + * regarding copyright ownership. The Technische Universität Darmstadt * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -57,1536 +57,1377 @@ */ public class WikipediaTemplateInfo { - private final Wikipedia wiki; - private RevisionApi revApi=null; - private MediaWikiParser parser=null; + private final Wikipedia wiki; + private RevisionApi revApi = null; + private MediaWikiParser parser = null; - private Connection connection; + private Connection connection; - /** - */ - public WikipediaTemplateInfo(Wikipedia pWiki) throws SQLException, WikiApiException{ - this.wiki = pWiki; - this.connection=getConnection(wiki); + /** + * + */ + public WikipediaTemplateInfo(Wikipedia pWiki) throws SQLException, WikiApiException { + this.wiki = pWiki; + this.connection = getConnection(wiki); - if (!tableExists(GeneratorConstants.TABLE_TPLID_TPLNAME)) { - System.err.println("No Template Database could be found. You can only use methods that work without a template index"); - } + if (!tableExists(GeneratorConstants.TABLE_TPLID_TPLNAME)) { + System.err.println("No Template Database could be found. You can only use methods that work without a template index"); } + } + + /** + * Returns the number of all pages that contain a template the name + * of which starts with any of the the given Strings. 
+ * + * @param templateFragments a list Strings containing the beginnings of the desired templates + * @param whitelist whether to return pages containing these templates (true) or return pages + * NOT containing these templates (false) + * @return the number of pages that contain any template starting with templateFragment + * @throws WikiApiException If there was any error retrieving the page object (most likely if the template templates are corrupted) + */ + private Integer countFragmentFilteredPages(List templateFragments, boolean whitelist) throws WikiApiException { + try { + int count = 0; + PreparedStatement statement = null; + ResultSet result = null; + + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append("SELECT distinct(count(*)) FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " as tpl, " + + GeneratorConstants.TABLE_TPLID_PAGEID + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); + for (@SuppressWarnings("unused") String fragment : templateFragments) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName LIKE ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); - /** - * Returns the number of all pages that contain a template the name - * of which starts with any of the the given Strings. 
- * - * @param templateFragments a list Strings containing the beginnings of the desired templates - * @param whitelist - * whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return the number of pages that contain any template starting with templateFragment - * @throws WikiApiException If there was any error retrieving the page object (most likely if the template templates are corrupted) - */ - private Integer countFragmentFilteredPages(List templateFragments, boolean whitelist) throws WikiApiException { - try { - int count = 0; - PreparedStatement statement = null; - ResultSet result = null; - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT distinct(count(*)) FROM "+ GeneratorConstants.TABLE_TPLID_TPLNAME+ " as tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID+ " AS p WHERE tpl.templateId = p.templateId "+(whitelist?"AND":"AND NOT")+" ("); - for(@SuppressWarnings("unused") String fragment:templateFragments){ - if(subconditions.length()!=0){ - subconditions.append("OR "); - } - subconditions.append("tpl.templateName LIKE ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - - statement = connection.prepareStatement(sqlString.toString()); - - int curIdx=1; - for(String fragment:templateFragments){ - fragment=fragment.toLowerCase(); - fragment=fragment.trim(); - fragment=fragment.replaceAll(" ", "_"); - statement.setString(curIdx++, fragment + "%"); - } - - result = execute(statement); - - if (result == null) { - return 0; - } - - if (result.next()) { - count = result.getInt(1); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return count; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns the number of all pages that contain a template the name - * of which starts with any of the the 
given Strings. - * - * @param templateFragments a list Strings containing the beginnings of the desired templates - * @return the number of pages that contain any template starting with templateFragment - * @throws WikiApiException If there was any error retrieving the page object (most likely if the template templates are corrupted) - */ - public Integer countPagesContainingTemplateFragments(List templateFragments) throws WikiApiException{ - return countFragmentFilteredPages(templateFragments, true); - } - - /** - * Returns the number of all pages that contain a template the name - * of which starts with any of the the given Strings. - * - * @param templateFragments a list Strings containing the beginnings of the desired templates - * @return the number of pages that contain any template starting with templateFragment - * @throws WikiApiException If there was any error retrieving the page object (most likely if the template templates are corrupted) - */ - public Integer countPagesNotContainingTemplateFragments(List templateFragments) throws WikiApiException{ - return countFragmentFilteredPages(templateFragments, false); - } - - - /** - * Returns the number of all pages that contain a template the name of which - * equals the given String. 
- * - * @param templateNames - * a list of String containing the beginnings of the templates that have to be matched - * @param whitelist - * whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return the number of pages that contain a template starting with - * any templateFragment - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private Integer countFilteredPages(List templateNames, boolean whitelist) - throws WikiApiException{ - - try { - int count = 0; - PreparedStatement statement = null; - ResultSet result = null; - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString - .append("SELECT distinct(count(*)) FROM "+ GeneratorConstants.TABLE_TPLID_TPLNAME+ " as tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID+ " AS p WHERE tpl.templateId = p.templateId "+(whitelist?"AND":"AND NOT")+" ("); - - for(@SuppressWarnings("unused") String name:templateNames){ - if(subconditions.length()!=0){ - subconditions.append("OR "); - } - subconditions.append("tpl.templateName = ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - - statement = connection.prepareStatement(sqlString.toString()); - - int curIdx=1; - for(String name:templateNames){ - name=name.toLowerCase().trim(); - name=name.replaceAll(" ", "_"); - statement.setString(curIdx++, name); - } - - result = execute(statement); - - if (result == null) { - return 0; - } - - if (result.next()) { - count = result.getInt(1); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return count; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns the number of all pages that contain a template the name of which - * equals the given String. 
- * - * @param templateNames - * a list of String containing the beginnings of the templates that have to be matched - * @return the number of pages that contain a template starting with - * any templateFragment - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Integer countPagesContainingTemplateNames(List templateNames) throws WikiApiException{ - return countFilteredPages(templateNames, true); - } - - /** - * Returns the number of all pages that do not contain a template the name of which - * equals the given String. - * - * @param templateNames - * a list of String containing the beginnings of the templates that have to be matched - * @return the number of pages that do not contain a template starting with - * any templateFragment - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Integer countPagesNotContainingTemplateNames(List templateNames) throws WikiApiException{ - return countFilteredPages(templateNames, false); - } - - /** - * Return an iterable containing all pages that contain a template the name - * of which starts with any of the given Strings. 
- * - * @param templateFragments - * the beginning of the templates that have to be matched - * @param whitelist - * whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return An iterable with the page objects that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private Iterable getFragmentFilteredPages(List templateFragments, boolean whitelist) throws WikiApiException{ - - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT p.pageId FROM "+ - GeneratorConstants.TABLE_TPLID_TPLNAME+ " AS tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID - + " AS p WHERE tpl.templateId = p.templateId "+(whitelist?"AND":"AND NOT")+" ("); - - for(@SuppressWarnings("unused") String fragment:templateFragments){ - if(subconditions.length()!=0){ - subconditions.append("OR "); - } - subconditions.append("tpl.templateName LIKE ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - - statement = connection.prepareStatement(sqlString.toString()); - - int curIdx=1; - for(String fragment:templateFragments){ - fragment=fragment.toLowerCase().trim(); - fragment=fragment.replaceAll(" ", "_"); - statement.setString(curIdx++, fragment + "%"); - } - - result = execute(statement); - - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } - - while (result.next()) { - int pageID = result.getInt(1); - matchedPages.add(wiki.getPage(pageID)); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return matchedPages; - } - catch (Exception e) { - throw new 
WikiApiException(e); - } - } - - public int checkTemplateId(String templateName) throws WikiApiException{ - try { - PreparedStatement statement = null; - ResultSet result = null; - - try { - StringBuffer sqlString = new StringBuffer(); - - - sqlString.append("SELECT tpl.templateId FROM "+GeneratorConstants.TABLE_TPLID_TPLNAME+" AS tpl WHERE tpl.templateName='"+templateName.trim().replaceAll(" ","_")+"'"); - - statement = connection.prepareStatement(sqlString.toString()); - - result = execute(statement); - - if (result == null) { - return -1; - } - - if (result.next()) { - return result.getInt(1); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - - } - - return -1; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } + statement = connection.prepareStatement(sqlString.toString()); + int curIdx = 1; + for (String fragment : templateFragments) { + fragment = fragment.toLowerCase(); + fragment = fragment.trim(); + fragment = fragment.replaceAll(" ", "_"); + statement.setString(curIdx++, fragment + "%"); + } - /** - * Return an iterable containing all pages that contain a template the name - * of which starts with any of the given Strings. - * - * @param templateFragments - * the beginning of the templates that have to be matched - * @return An iterable with the page objects that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Iterable getPagesContainingTemplateFragments(List templateFragments) throws WikiApiException{ - return getFragmentFilteredPages(templateFragments, true); - } + result = execute(statement); - /** - * Return an iterable containing all pages that contain a template the name - * of which starts with any of the given Strings. 
- * - * @param templateFragments - * the beginning of the templates that have to be matched - * @return An iterable with the page objects that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Iterable getPagesNotContainingTemplateFragments(List templateFragments) throws WikiApiException{ - return getFragmentFilteredPages(templateFragments, false); - } + if (result == null) { + return 0; + } + if (result.next()) { + count = result.getInt(1); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - /** - * Return an iterable containing all pages that contain a template the name - * of which starts with any of the given Strings. - * - * @param templateNames - * the names of the template that we want to match - * @param whitelist - * whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return An iterable with the page objects that contain any of the the - * specified templates - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private Iterable getFilteredPages(List templateNames, boolean whitelist) throws WikiApiException{ - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT p.pageId FROM "+ GeneratorConstants.TABLE_TPLID_TPLNAME+ " AS tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID+ " AS p WHERE tpl.templateId = p.templateId "+(whitelist?"AND":"AND NOT")+" ("); - - for(@SuppressWarnings("unused") String name:templateNames){ - if(subconditions.length()!=0){ - subconditions.append("OR "); - } - 
subconditions.append("tpl.templateName = ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - - statement = connection.prepareStatement(sqlString.toString()); - - int curIdx=1; - for(String name:templateNames){ - name=name.toLowerCase().trim(); - name=name.replaceAll(" ","_"); - statement.setString(curIdx++, name); - } - - result = execute(statement); - - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } - - while (result.next()) { - int pageID = result.getInt(1); - matchedPages.add(wiki.getPage(pageID)); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return matchedPages; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Return an iterable containing all pages that contain a template the name - * of which equals any of the given Strings. - * - * @param templateNames - * the names of the template that we want to match - * @return An iterable with the page objects that contain any of the the - * specified templates - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Iterable getPagesContainingTemplateNames(List templateNames) throws WikiApiException{ - return getFilteredPages(templateNames, true); + return count; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the number of all pages that contain a template the name + * of which starts with any of the the given Strings. 
+ * + * @param templateFragments a list Strings containing the beginnings of the desired templates + * @return the number of pages that contain any template starting with templateFragment + * @throws WikiApiException If there was any error retrieving the page object (most likely if the template templates are corrupted) + */ + public Integer countPagesContainingTemplateFragments(List templateFragments) throws WikiApiException { + return countFragmentFilteredPages(templateFragments, true); + } + + /** + * Returns the number of all pages that contain a template the name + * of which starts with any of the the given Strings. + * + * @param templateFragments a list Strings containing the beginnings of the desired templates + * @return the number of pages that contain any template starting with templateFragment + * @throws WikiApiException If there was any error retrieving the page object (most likely if the template templates are corrupted) + */ + public Integer countPagesNotContainingTemplateFragments(List templateFragments) throws WikiApiException { + return countFragmentFilteredPages(templateFragments, false); + } + + + /** + * Returns the number of all pages that contain a template the name of which + * equals the given String. 
+ * + * @param templateNames a list of String containing the beginnings of the templates that have to be matched + * @param whitelist whether to return pages containing these templates (true) or return pages NOT containing these templates (false) + * @return the number of pages that contain a template starting with + * any templateFragment + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + private Integer countFilteredPages(List templateNames, boolean whitelist) + throws WikiApiException { + + try { + int count = 0; + PreparedStatement statement = null; + ResultSet result = null; + + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString + .append("SELECT distinct(count(*)) FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " as tpl, " + + GeneratorConstants.TABLE_TPLID_PAGEID + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); + + for (@SuppressWarnings("unused") String name : templateNames) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName = ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); - /** - * Return an iterable containing all pages that do NOT contain a template - * the name of which equals of the given Strings. 
- * - * @param templateNames - * the names of the template that we want to match - * @return An iterable with the page objects that do NOT contain any of the - * the specified templates - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Iterable getPagesNotContainingTemplateNames(List templateNames) throws WikiApiException{ - return getFilteredPages(templateNames, false); - } + statement = connection.prepareStatement(sqlString.toString()); + int curIdx = 1; + for (String name : templateNames) { + name = name.toLowerCase().trim(); + name = name.replaceAll(" ", "_"); + statement.setString(curIdx++, name); + } - /** - * This method first creates a list of pages containing templates that equal - * any of the provided Strings. - * It then returns a list of revision ids of the revisions in which the - * respective templates first appeared. - * - * @param templateName - * the template names that have to be matched - * @return An list with the revision ids of the first appearance of the template - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getRevisionsWithFirstTemplateAppearance(String templateName) throws WikiApiException{ - /* - * Note: This method does not use any revision-template-index. Each revision has to be parsed until the first revision is found that does not contain a certain template. - * TODO also create version using revision-template index - */ - System.err.println("Note: This function call demands parsing several revision for each page. 
A method using the revision-template index is currently under construction."); - - templateName=templateName.trim().replaceAll(" ", "_"); - - List revisionIds = new LinkedList<>(); - List pageIds = getPageIdsContainingTemplateNames(Arrays.asList(new String[]{templateName})); - if(pageIds.size()==0){ - return revisionIds; - } - if(revApi==null){ - revApi = new RevisionApi(wiki.getDatabaseConfiguration()); - } - if(parser==null){ - //TODO switch to SWEBLE - MediaWikiParserFactory pf = new MediaWikiParserFactory( - wiki.getDatabaseConfiguration().getLanguage()); - pf.setTemplateParserClass(ShowTemplateNamesAndParameters.class); - parser = pf.createParser(); - } - - for(int id:pageIds){ - //get timestamps of all revisions - List tsList = revApi.getRevisionTimestamps(id); - - // sort in reverse order - newest first - tsList.sort(Comparator.reverseOrder()); - - Revision prevRev=null; - tsloop:for(Timestamp ts:tsList){ - - Revision rev = revApi.getRevision(id, ts); - - //initialize previous revision - if(prevRev==null){ - prevRev=rev; - } - - //Parse templates and check if the revision contains the template - ParsedPage pp = parser.parse(rev.getRevisionText()); - boolean containsTpl = false; - tplLoop:for(Template tpl:pp.getTemplates()){ - if(tpl.getName().equalsIgnoreCase(templateName)){ - containsTpl=true; - break tplLoop; - } - } - - //if the revision does not contain the template, we have found - //what we were looking for. add id of previous revision - if(!containsTpl){ - revisionIds.add(prevRev.getRevisionID()); - break tsloop; - } - prevRev=rev; - } - } - - return revisionIds; - } + result = execute(statement); + if (result == null) { + return 0; + } - ////////// - - - /** - * Returns a list containing the ids of all pages that contain a - * template the name of which starts with any of the given Strings. 
- * - * @param templateFragments - * the beginning of the templates that have to be matched - * @param whitelist - * whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return An list with the ids of the pages that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private List getFragmentFilteredPageIds(List templateFragments, boolean whitelist) throws WikiApiException{ - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString - .append("SELECT p.pageId FROM "+ GeneratorConstants.TABLE_TPLID_TPLNAME+ " AS tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID+ " AS p WHERE tpl.templateId = p.templateId "+(whitelist?"AND":"AND NOT")+" ("); - for(@SuppressWarnings("unused") String fragment:templateFragments){ - if(subconditions.length()!=0){ - subconditions.append("OR "); - } - subconditions.append("tpl.templateName LIKE ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - - statement = connection.prepareStatement(sqlString.toString()); - - int curIdx=1; - for(String fragment:templateFragments){ - fragment=fragment.toLowerCase().trim(); - fragment=fragment.replaceAll(" ","_"); - statement.setString(curIdx++, fragment + "%"); - } - - result = execute(statement); - - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } - - while (result.next()) { - matchedPages.add(result.getInt(1)); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return matchedPages; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns a list 
containing the ids of all pages that contain a - * template the name of which starts with any of the given Strings. - * - * @param templateFragments - * the beginning of the templates that have to be matched - * @return An list with the ids of the pages that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getPageIdsContainingTemplateFragments(List templateFragments) throws WikiApiException{ - return getFragmentFilteredPageIds(templateFragments,true); - } + if (result.next()) { + count = result.getInt(1); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - /** - * Returns a list containing the ids of all pages that contain a - * template the name of which starts with any of the given Strings. - * - * @param templateFragments - * the beginning of the templates that have to be matched - * @return An list with the ids of the pages that do not contain templates - * beginning with any String in templateFragments - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getPageIdsNotContainingTemplateFragments(List templateFragments) throws WikiApiException{ - return getFragmentFilteredPageIds(templateFragments,false); + return count; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns the number of all pages that contain a template the name of which + * equals the given String. 
+ * + * @param templateNames a list of String containing the beginnings of the templates that have to be matched + * @return the number of pages that contain a template starting with + * any templateFragment + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + public Integer countPagesContainingTemplateNames(List templateNames) throws WikiApiException { + return countFilteredPages(templateNames, true); + } + + /** + * Returns the number of all pages that do not contain a template the name of which + * equals the given String. + * + * @param templateNames a list of String containing the beginnings of the templates that have to be matched + * @return the number of pages that do not contain a template starting with + * any templateFragment + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + public Integer countPagesNotContainingTemplateNames(List templateNames) throws WikiApiException { + return countFilteredPages(templateNames, false); + } + + /** + * Return an iterable containing all pages that contain a template the name + * of which starts with any of the given Strings. 
+ * + * @param templateFragments the beginning of the templates that have to be matched + * @param whitelist whether to return pages containing these templates (true) or return pages NOT containing these templates (false) + * @return An iterable with the page objects that contain templates + * beginning with any String in templateFragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + private Iterable getFragmentFilteredPages(List templateFragments, boolean whitelist) throws WikiApiException { + + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); + + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append("SELECT p.pageId FROM " + + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " + + GeneratorConstants.TABLE_TPLID_PAGEID + + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); + + for (@SuppressWarnings("unused") String fragment : templateFragments) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName LIKE ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); - /////////////////// - - /** - * Returns a list containing the ids of all revisions that contain a - * template the name of which starts with any of the given Strings. 
- * - * @param templateFragments - * the beginning of the templates that have to be matched - * @param whitelist - * whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return An list with the ids of the revisions that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private List getFragmentFilteredRevisionIds(List templateFragments, boolean whitelist) throws WikiApiException{ - - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString - .append("SELECT r.revisionId FROM "+ GeneratorConstants.TABLE_TPLID_TPLNAME+ " AS tpl, " - + GeneratorConstants.TABLE_TPLID_REVISIONID+ " AS r WHERE tpl.templateId = r.templateId "+(whitelist?"AND":"AND NOT")+" ("); - for(@SuppressWarnings("unused") String fragment:templateFragments){ - if(subconditions.length()!=0){ - subconditions.append("OR "); - } - subconditions.append("tpl.templateName LIKE ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - - statement = connection.prepareStatement(sqlString.toString()); - - int curIdx=1; - for(String fragment:templateFragments){ - fragment=fragment.toLowerCase().trim(); - fragment=fragment.replaceAll(" ","_"); - statement.setString(curIdx++, fragment + "%"); - } - - result = execute(statement); - - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } - - while (result.next()) { - matchedPages.add(result.getInt(1)); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return matchedPages; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - - - /** - * 
Returns a list containing the ids of all revisions that contain a - * template the name of which starts with any of the given Strings. - * - * @param templateFragments - * the beginning of the templates that have to be matched - * @return An list with the ids of the revisions that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getRevisionIdsContainingTemplateFragments(List templateFragments) throws WikiApiException{ - return getFragmentFilteredRevisionIds(templateFragments,true); - } + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String fragment : templateFragments) { + fragment = fragment.toLowerCase().trim(); + fragment = fragment.replaceAll(" ", "_"); + statement.setString(curIdx++, fragment + "%"); + } + + result = execute(statement); + + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } - /** - * Returns a list containing the ids of all revisions that contain a - * template the name of which starts with any of the given Strings. 
- * - * @param templateFragments - * the beginning of the templates that have to be matched - * @return An list with the ids of the revisions that do not contain templates - * beginning with any String in templateFragments - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getRevisionIdsNotContainingTemplateFragments(List templateFragments) throws WikiApiException{ - return getFragmentFilteredRevisionIds(templateFragments,false); + while (result.next()) { + int pageID = result.getInt(1); + matchedPages.add(wiki.getPage(pageID)); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return matchedPages; + } catch (Exception e) { + throw new WikiApiException(e); } + } + public int checkTemplateId(String templateName) throws WikiApiException { + try { + PreparedStatement statement = null; + ResultSet result = null; - /////////////////// + try { + StringBuffer sqlString = new StringBuffer(); - /** - * Returns the ids of all pages that ever contained any of the given template names in the history of their existence. 
- * - * @param templateNames template names to look for - * @return list of page ids of the pages that once contained any of the given template names - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getIdsOfPagesThatEverContainedTemplateNames(List templateNames) throws WikiApiException{ - if(revApi==null){ - revApi = new RevisionApi(wiki.getDatabaseConfiguration()); - } - Set pageIdSet = new HashSet<>(); - - //TODO instead of getting rev ids and then getting page ids, do one query and make the join in the db directly - List revsWithTemplate = getRevisionIdsContainingTemplateNames(templateNames); - for(int revId:revsWithTemplate){ - pageIdSet.add(revApi.getPageIdForRevisionId(revId)); - } - - return new LinkedList<>(pageIdSet); + sqlString.append("SELECT tpl.templateId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl WHERE tpl.templateName='" + templateName.trim().replaceAll(" ", "_") + "'"); + + statement = connection.prepareStatement(sqlString.toString()); + + result = execute(statement); + + if (result == null) { + return -1; + } + + if (result.next()) { + return result.getInt(1); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + + } + + return -1; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + + /** + * Return an iterable containing all pages that contain a template the name + * of which starts with any of the given Strings. 
+ * + * @param templateFragments the beginning of the templates that have to be matched + * @return An iterable with the page objects that contain templates + * beginning with any String in templateFragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + public Iterable getPagesContainingTemplateFragments(List templateFragments) throws WikiApiException { + return getFragmentFilteredPages(templateFragments, true); + } + + /** + * Return an iterable containing all pages that contain a template the name + * of which starts with any of the given Strings. + * + * @param templateFragments the beginning of the templates that have to be matched + * @return An iterable with the page objects that contain templates + * beginning with any String in templateFragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + public Iterable getPagesNotContainingTemplateFragments(List templateFragments) throws WikiApiException { + return getFragmentFilteredPages(templateFragments, false); + } + + + /** + * Return an iterable containing all pages that contain a template the name + * of which starts with any of the given Strings. 
+ * + * @param templateNames the names of the template that we want to match + * @param whitelist whether to return pages containing these templates (true) or return pages + * NOT containing these templates (false) + * @return An iterable with the page objects that contain any of the the + * specified templates + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + private Iterable getFilteredPages(List templateNames, boolean whitelist) throws WikiApiException { + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); + + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append("SELECT p.pageId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " + + GeneratorConstants.TABLE_TPLID_PAGEID + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); + + for (@SuppressWarnings("unused") String name : templateNames) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName = ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); - /** - * Returns the ids of all pages that ever contained any template that started with any of the given template fragments - * - * @param templateFragments template-fragments to look for - * @return list of page ids of the pages that once contained any template that started with any of the given template fragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getIdsOfPagesThatEverContainedTemplateFragments(List templateFragments) throws WikiApiException{ - if(revApi==null){ - revApi = new RevisionApi(wiki.getDatabaseConfiguration()); - } - Set pageIdSet = new HashSet<>(); - - //TODO instead of getting rev ids and then getting 
page ids, do one query and make the join in the db directly - List revsWithTemplate = getRevisionIdsContainingTemplateFragments(templateFragments); - for(int revId:revsWithTemplate){ - pageIdSet.add(revApi.getPageIdForRevisionId(revId)); - } - - List pageIds = new LinkedList<>(); - pageIds.addAll(pageIdSet); - - return pageIds; + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String name : templateNames) { + name = name.toLowerCase().trim(); + name = name.replaceAll(" ", "_"); + statement.setString(curIdx++, name); + } + + result = execute(statement); + + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } + + while (result.next()) { + int pageID = result.getInt(1); + matchedPages.add(wiki.getPage(pageID)); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return matchedPages; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Return an iterable containing all pages that contain a template the name + * of which equals any of the given Strings. + * + * @param templateNames the names of the template that we want to match + * @return An iterable with the page objects that contain any of the the + * specified templates + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + public Iterable getPagesContainingTemplateNames(List templateNames) throws WikiApiException { + return getFilteredPages(templateNames, true); + } + + /** + * Return an iterable containing all pages that do NOT contain a template + * the name of which equals of the given Strings. 
+ * + * @param templateNames the names of the template that we want to match + * @return An iterable with the page objects that do NOT contain any of the + * the specified templates + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + public Iterable getPagesNotContainingTemplateNames(List templateNames) throws WikiApiException { + return getFilteredPages(templateNames, false); + } + + + /** + * This method first creates a list of pages containing templates that equal + * any of the provided Strings. + * It then returns a list of revision ids of the revisions in which the + * respective templates first appeared. + * + * @param templateName the template names that have to be matched + * @return An list with the revision ids of the first appearance of the template + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + public List getRevisionsWithFirstTemplateAppearance(String templateName) throws WikiApiException { + /* + * Note: + * This method does not use any revision-template-index. Each revision has to be parsed + * until the first revision is found that does not contain a certain template. + * TODO also create version using revision-template index + */ + System.err.println("Note: This function call demands parsing several revision for each page. " + + "A method using the revision-template index is currently under construction."); + + templateName = templateName.trim().replaceAll(" ", "_"); - /////////////////// - - - /** - * Returns a list containing the ids of all pages that contain a template - * the name of which equals any of the given Strings. 
- * - * @param templateNames - * the names of the template that we want to match - * @param whitelist - * whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return A list with the ids of all pages that contain any of the the - * specified templates - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private List getFilteredPageIds(List templateNames, boolean whitelist) throws WikiApiException{ - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT p.pageId FROM "+ GeneratorConstants.TABLE_TPLID_TPLNAME+ " AS tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID+ " AS p WHERE tpl.templateId = p.templateId "+(whitelist?"AND":"AND NOT")+" ("); - - for(@SuppressWarnings("unused") String name:templateNames){ - if(subconditions.length()!=0){ - subconditions.append("OR "); - } - subconditions.append("tpl.templateName = ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - - statement = connection.prepareStatement(sqlString.toString()); - - int curIdx=1; - for(String name:templateNames){ - name=name.toLowerCase().trim(); - name=name.replaceAll(" ", "_"); - statement.setString(curIdx++, name); - } - - result = execute(statement); - - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } - - while (result.next()) { - matchedPages.add(result.getInt(1)); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return matchedPages; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - - - /** - * Returns a list containing the ids of all pages that contain a template - * the name of which equals any of the 
given Strings. - * - * @param templateNames - * the names of the template that we want to match - * @return A list with the ids of all pages that contain any of the the - * specified templates - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getPageIdsContainingTemplateNames(List templateNames) throws WikiApiException{ - return getFilteredPageIds(templateNames, true); + List revisionIds = new LinkedList<>(); + List pageIds = getPageIdsContainingTemplateNames(Arrays.asList(new String[]{templateName})); + if (pageIds.size() == 0) { + return revisionIds; } - /** - * Returns a list containing the ids of all pages that do not contain a template - * the name of which equals any of the given Strings. - * - * @param templateNames - * the names of the template that we want to match - * @return A list with the ids of all pages that do not contain any of the the - * specified templates - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getPageIdsNotContainingTemplateNames(List templateNames) throws WikiApiException{ - return getFilteredPageIds(templateNames, false); + if (revApi == null) { + revApi = new RevisionApi(wiki.getDatabaseConfiguration()); + } + if (parser == null) { + //TODO switch to SWEBLE + MediaWikiParserFactory pf = new MediaWikiParserFactory( + wiki.getDatabaseConfiguration().getLanguage()); + pf.setTemplateParserClass(ShowTemplateNamesAndParameters.class); + parser = pf.createParser(); } + for (int id : pageIds) { + //get timestamps of all revisions + List tsList = revApi.getRevisionTimestamps(id); + // sort in reverse order - newest first + tsList.sort(Comparator.reverseOrder()); + Revision prevRev = null; + tsloop: + for (Timestamp ts : tsList) { - /** - * Returns a list containing the ids of all revisions that contain a template - * the name of which equals 
any of the given Strings. - * - * @param templateNames - * the names of the template that we want to match - * @param whitelist - * whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return A list with the ids of all revisions that contain any of the the - * specified templates - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private List getFilteredRevisionIds(List templateNames, boolean whitelist) throws WikiApiException{ - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT r.revisionId FROM "+ GeneratorConstants.TABLE_TPLID_TPLNAME+ " AS tpl, " - + GeneratorConstants.TABLE_TPLID_REVISIONID+ " AS r WHERE tpl.templateId = r.templateId "+(whitelist?"AND":"AND NOT")+" ("); - - for(@SuppressWarnings("unused") String name:templateNames){ - if(subconditions.length()!=0){ - subconditions.append("OR "); - } - subconditions.append("tpl.templateName = ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - - statement = connection.prepareStatement(sqlString.toString()); - - int curIdx=1; - for(String name:templateNames){ - name=name.toLowerCase().trim(); - name=name.replaceAll(" ", "_"); - statement.setString(curIdx++, name); - } - - result = execute(statement); - - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } - - while (result.next()) { - matchedPages.add(result.getInt(1)); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return matchedPages; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - - /** - * Returns a list containing the ids of all revisions that contain a 
template - * the name of which equals any of the given Strings. - * - * @param templateNames - * the names of the template that we want to match - * @return A list with the ids of all revisions that contain any of the the - * specified templates - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getRevisionIdsContainingTemplateNames(List templateNames) throws WikiApiException{ - return getFilteredRevisionIds(templateNames, true); - } + Revision rev = revApi.getRevision(id, ts); - /** - * Returns a list containing the ids of all revisions that do not contain a template - * the name of which equals any of the given Strings. - * - * @param templateNames - * the names of the template that we want to match - * @return A list with the ids of all revisions that do not contain any of the the - * specified templates - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getRevisionIdsNotContainingTemplateNames(List templateNames) throws WikiApiException{ - return getFilteredRevisionIds(templateNames, false); + //initialize previous revision + if (prevRev == null) { + prevRev = rev; + } + + //Parse templates and check if the revision contains the template + ParsedPage pp = parser.parse(rev.getRevisionText()); + boolean containsTpl = false; + tplLoop: + for (Template tpl : pp.getTemplates()) { + if (tpl.getName().equalsIgnoreCase(templateName)) { + containsTpl = true; + break tplLoop; + } + } + + //if the revision does not contain the template, we have found + //what we were looking for. 
add id of previous revision + if (!containsTpl) { + revisionIds.add(prevRev.getRevisionID()); + break tsloop; + } + prevRev = rev; + } } + return revisionIds; + } + + + ////////// + + + /** + * Returns a list containing the ids of all pages that contain a + * template the name of which starts with any of the given Strings. + * + * @param templateFragments the beginning of the templates that have to be matched + * @param whitelist whether to return pages containing these templates (true) or return pages NOT containing these templates (false) + * @return An list with the ids of the pages that contain templates + * beginning with any String in templateFragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + private List getFragmentFilteredPageIds(List templateFragments, boolean whitelist) + throws WikiApiException { + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); + + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append("SELECT p.pageId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " + + GeneratorConstants.TABLE_TPLID_PAGEID + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); + for (@SuppressWarnings("unused") String fragment : templateFragments) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName LIKE ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + statement = connection.prepareStatement(sqlString.toString()); + int curIdx = 1; + for (String fragment : templateFragments) { + fragment = fragment.toLowerCase().trim(); + fragment = fragment.replaceAll(" ", "_"); + statement.setString(curIdx++, fragment + "%"); + } - /** - * Returns the names of all templates contained in the specified page. 
- * - * @param page - * the page object for which the templates should be retrieved - * @return A List with the names of the templates contained in the specified - * page - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getTemplateNamesFromPage(Page page) throws WikiApiException{ - return getTemplateNamesFromPage(page.getPageId()); - } + result = execute(statement); - /** - * Returns the names of all templates contained in the specified page. - * - * @param pageTitle - * the title of the page for which the templates should be - * retrieved - * @return A List with the names of the templates contained in the specified - * page - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getTemplateNamesFromPage(String pageTitle) throws WikiApiException{ - Page p; - try{ - p = wiki.getPage(pageTitle); - }catch (WikiApiException e) { - return new ArrayList<>(); - } - return getTemplateNamesFromPage(p); + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } + + while (result.next()) { + matchedPages.add(result.getInt(1)); + } + } finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return matchedPages; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + /** + * Returns a list containing the ids of all pages that contain a + * template the name of which starts with any of the given Strings. 
+ * + * @param templateFragments the beginning of the templates that have to be matched + * @return An list with the ids of the pages that contain templates + * beginning with any String in templateFragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the template templates are corrupted) + */ + public List getPageIdsContainingTemplateFragments(List templateFragments) throws WikiApiException { + return getFragmentFilteredPageIds(templateFragments, true); + } + + /** + * Returns a list containing the ids of all pages that contain a + * template the name of which starts with any of the given Strings. + * + * @param templateFragments the beginning of the templates that have to be matched + * @return An list with the ids of the pages that do not contain templates + * beginning with any String in templateFragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the template templates are corrupted) + */ + public List getPageIdsNotContainingTemplateFragments(List templateFragments) throws WikiApiException { + return getFragmentFilteredPageIds(templateFragments, false); + } + + /////////////////// + + /** + * Returns a list containing the ids of all revisions that contain a + * template the name of which starts with any of the given Strings. 
+ * + * @param templateFragments the beginning of the templates that have to be matched + * @param whitelist whether to return pages containing these templates (true) or return pages NOT containing these templates (false) + * @return An list with the ids of the revisions that contain templates + * beginning with any String in templateFragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + private List getFragmentFilteredRevisionIds(List templateFragments, boolean whitelist) throws WikiApiException { + + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); + + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append("SELECT r.revisionId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " + + GeneratorConstants.TABLE_TPLID_REVISIONID + " AS r WHERE tpl.templateId = r.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); + for (@SuppressWarnings("unused") String fragment : templateFragments) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName LIKE ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + statement = connection.prepareStatement(sqlString.toString()); - /** - * Returns the names of all templates contained in the specified page. 
- * - * @param pageId - * the id of the Wiki page - * @return A List with the names of the templates contained in the specified - * page - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getTemplateNamesFromPage(int pageId) throws WikiApiException{ - if(pageId<1){ - throw new WikiApiException("Page ID must be > 0"); - } - try { - PreparedStatement statement = null; - ResultSet result = null; - List templateNames = new LinkedList<>(); - - try { - statement = connection.prepareStatement("SELECT tpl.templateName FROM "+ GeneratorConstants.TABLE_TPLID_TPLNAME+ " AS tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID+ " AS p WHERE tpl.templateId = p.templateId AND p.pageId = ?"); - statement.setInt(1, pageId); - - result = execute(statement); - - if (result == null) { - return templateNames; - } - - while (result.next()) { - templateNames.add(result.getString(1).toLowerCase()); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return templateNames; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - - /** - * Returns the names of all templates contained in the specified revision. 
- * - * @param revid - * the revision id - * @return A List with the names of the templates contained in the specified - * revision - * @throws WikiApiException - * If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getTemplateNamesFromRevision(int revid) throws WikiApiException{ - if(revid<1){ - throw new WikiApiException("Revision ID must be > 0"); - } - try { - PreparedStatement statement = null; - ResultSet result = null; - List templateNames = new LinkedList<>(); - - try { - statement = connection.prepareStatement("SELECT tpl.templateName FROM "+ GeneratorConstants.TABLE_TPLID_TPLNAME+ " AS tpl, " - + GeneratorConstants.TABLE_TPLID_REVISIONID+ " AS p WHERE tpl.templateId = p.templateId AND p.revisionId = ?"); - statement.setInt(1, revid); - - result = execute(statement); - - if (result == null) { - return templateNames; - } - - while (result.next()) { - templateNames.add(result.getString(1).toLowerCase()); - } - } - finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return templateNames; - } - catch (Exception e) { - throw new WikiApiException(e); - } - } - - - /** - * Determines whether a given revision contains a given template name - * - * @param revId - * @param templateName a template name - * @return - * @throws WikiApiException - */ - public boolean revisionContainsTemplateName(int revId, String templateName) throws WikiApiException{ - return revisionContainsTemplateNames(revId, Arrays.asList(new String[]{templateName})); + int curIdx = 1; + for (String fragment : templateFragments) { + fragment = fragment.toLowerCase().trim(); + fragment = fragment.replaceAll(" ", "_"); + statement.setString(curIdx++, fragment + "%"); + } + + result = execute(statement); + + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } + + while (result.next()) { + matchedPages.add(result.getInt(1)); + } + } finally 
{ + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return matchedPages; + } catch (Exception e) { + throw new WikiApiException(e); } + } + + + /** + * Returns a list containing the ids of all revisions that contain a + * template the name of which starts with any of the given Strings. + * + * @param templateFragments the beginning of the templates that have to be matched + * @return A list with the ids of the revisions that contain templates + * beginning with any String in templateFragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + public List getRevisionIdsContainingTemplateFragments(List templateFragments) throws WikiApiException { + return getFragmentFilteredRevisionIds(templateFragments, true); + } + + /** + * Returns a list containing the ids of all revisions that do not contain a + * template the name of which starts with any of the given Strings. + * + * @param templateFragments the beginning of the templates that have to be matched + * @return A list with the ids of the revisions that do not contain templates + * beginning with any String in templateFragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the templates are corrupted) + */ + public List getRevisionIdsNotContainingTemplateFragments(List templateFragments) throws WikiApiException { + return getFragmentFilteredRevisionIds(templateFragments, false); + } + + + /////////////////// + + + /** + * Returns the ids of all pages that ever contained any of the given template names in the history of their existence. 
+ * + * @param templateNames template names to look for + * @return list of page ids of the pages that once contained any of the given template names + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the template templates are corrupted) + */ + public List getIdsOfPagesThatEverContainedTemplateNames(List templateNames) throws WikiApiException { + if (revApi == null) { + revApi = new RevisionApi(wiki.getDatabaseConfiguration()); + } + Set pageIdSet = new HashSet<>(); - /** - * Determines whether a given revision contains a given template name - * - * @param revId - * @param templateNames a list of template names - * @return - * @throws WikiApiException - */ - public boolean revisionContainsTemplateNames(int revId, List templateNames) throws WikiApiException{ - List tplList = getTemplateNamesFromRevision(revId); - for(String tpl:tplList){ - for(String templateName:templateNames){ - if(tpl.equalsIgnoreCase(templateName)){ - return true; - } - } - } - return false; + //TODO instead of getting rev ids and then getting page ids, do one query and make the join in the db directly + List revsWithTemplate = getRevisionIdsContainingTemplateNames(templateNames); + for (int revId : revsWithTemplate) { + pageIdSet.add(revApi.getPageIdForRevisionId(revId)); } - /** - * Determines whether a given revision contains a template starting witht the given fragment - * - * @param revId - * @param templateFragment - * @return - * @throws WikiApiException - */ - public boolean revisionContainsTemplateFragment(int revId, String templateFragment) throws WikiApiException{ - List tplList = getTemplateNamesFromRevision(revId); - for(String tpl:tplList){ - if(tpl.toLowerCase().startsWith(templateFragment.toLowerCase())){ - return true; - } - } - return false; + return new LinkedList<>(pageIdSet); + } + + /** + * Returns the ids of all pages that ever contained any template that started with any of the given template fragments + * + * @param 
templateFragments template-fragments to look for + * @return list of page ids of the pages that once contained any template that started with any of the given template fragments + * @throws WikiApiException If there was any error retrieving the page object (most + * likely if the template templates are corrupted) + */ + public List getIdsOfPagesThatEverContainedTemplateFragments(List templateFragments) throws WikiApiException { + if (revApi == null) { + revApi = new RevisionApi(wiki.getDatabaseConfiguration()); } + Set pageIdSet = new HashSet<>(); - /** - * Does the same as revisionContainsTemplateName() without using a template index - * - * @param revId - * @param templateName - * @return - * @throws WikiApiException - */ - public boolean revisionContainsTemplateNameWithoutIndex(int revId, String templateName) throws WikiApiException{ - if(revApi==null){ - revApi = new RevisionApi(wiki.getDatabaseConfiguration()); - } - if(parser==null){ - //TODO switch to SWEBLE - MediaWikiParserFactory pf = new MediaWikiParserFactory( - wiki.getDatabaseConfiguration().getLanguage()); - pf.setTemplateParserClass(ShowTemplateNamesAndParameters.class); - parser = pf.createParser(); - } - - List