diff --git a/README.md b/README.md index ff13052..7f7fc51 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ Use: from materials_entity_recognition import MatRecognition model = MatRecognition() - all_materials, precursors, targets, other_materials = model.mat_recognize(input_para_text) + result = model.mat_recognize(input_para_text) Parameters: Input: plain text of a paragraph - Output: 4 list objects, which are all materials, precursors, targets, other materials, respectively. + Output: a list of dict objects, containing all materials, precursors, targets, and other materials for each sentence in the input paragraph. diff --git a/materials_entity_recognition/__init__.py b/materials_entity_recognition/__init__.py index 8f5c328..0f092ee 100644 --- a/materials_entity_recognition/__init__.py +++ b/materials_entity_recognition/__init__.py @@ -1,6 +1,9 @@ from .scripts import MatIdentification from .scripts import MatRecognition from .scripts import MatTPIdentification +from .scripts import MatIdentificationBagging +from .scripts import MatRecognitionBagging +from .scripts import MatTPIdentificationBagging __author__ = 'Tanjin He' __maintainer__ = 'Tanjin He' diff --git a/materials_entity_recognition/data/test_paras.json b/materials_entity_recognition/data/test_paras.json index 419c40e..760fd73 100644 --- a/materials_entity_recognition/data/test_paras.json +++ b/materials_entity_recognition/data/test_paras.json @@ -1,3 +1,28 @@ [ - "Samples of (Zn1-xAlxO)mIn2O3 (m=integer; x = 0, 0.004, 0.008, 0.012, 0.016, and 0.02) were prepared from fine high-purity powders of ZnO, In2O3, and Al2O3 by a solid-state reaction. A mixture of the ZnO, In2O3, and Al2O3 powders and ethyl alcohol was milled for 4h using a planetary mill (Fritsch pulverisette 6) and a ZrO2 ball as a grinding media." + "Transparent bulk silicate undoped and (1%) Eu3+ xerogels were prepared by using the sol\u2013gel method (Aldrich reagents) according to the method described in [20]\u00a0and\u00a0[21]. 
In the first step tetraethoxysilane (TEOS) was hydrolyzed under constant stirring with a mixed solution of ethanol and water and using glacial acetic acid as catalyst; molar ratio was 1:4:10:0.5. Then another solution of Eu(CH3COO)3, Y(CH3COO)3, Li(CH3COO) and CF3COOH with the molar ratio for Eu:Y:Li:F of 1:5:20:255 was prepared by and added to the first solution. For other molar ratio of Y to Li (i.e. smaller than four) we have obtained glass\u2013ceramic containing only YF3 phase (i.e. for 1 to 1 molar ratio) or a mixture of YF3 and LiYF4 as was reported in Ref. [17]. After an additional vigorous stirring for 1\u00a0h at room temperature, the mixed solution was aged at room temperature for several days in a sealed container. Then the wet-gel obtained was dried up to 120\u00a0\u00b0C during 1 week to form the xerogel. Glass ceramization was obtained after subsequently thermal treatments in air at 530\u00a0\u00b0C for 30\u00a0min. in air. Using the same procedure we have prepared an Eu-doped xerogel and a silica glass.", + "For the comparison of the optical properties we have prepared LiYF4:Eu3+ (1\u00a0mol.%) polycrystalline by using conventional solid state route: stoichiometric mixture of YF3, EuF3 and LiF (0.99:0.01:1) was thoroughly ground (down to tens on microns size) and dried at 50\u00a0\u00b0C. Then the powder was pressed (as pellets) and sintered at 750\u00a0\u00b0C for 2\u00a0h in dry nitrogen atmosphere.", + "Samples of (Zn1-xAlxO)mIn2O3 (m=integer; x = 0, 0.004, 0.008, 0.012, 0.016, and 0.02) were prepared from fine high-purity powders of ZnO, In2O3, and Al2O3 by a solid-state reaction. A mixture of the ZnO, In2O3, and Al2O3 powders and ethyl alcohol was milled for 4h using a planetary mill (Fritsch pulverisette 6) and a ZrO2 ball as a grinding media. The obtained slurries were dried at 353K in an oven for 24h. The mixed powders were calcined in a mullite crucible at 1423K for 6h. 
The calcined powders were then ground carefully in mortar and passed through a 200-mesh sieve. Subsequently, the sieved powders were pressed at a pressure of 48MPa to prepare pellets of 3mm-thick and 30mm in diameter. The green compacts were heated at 1823K for 2h in air, and then furnace cooled.", + "PZT piezoelectric ceramic powders were prepared from approximate amounts of reagent-grade PbO (Cerac Incorporated, 325mesh, 99.9%, Japan), ZrO2 (Cerac Incorporated, 325mesh, 99.7% Japan), and TiO2 (Cerac Incorporated, 325mesh, 99.9% Japan). The columbite method was used to synthesize the ceramic powders in order to eliminate the formation of a pyrochlore-type phase. The PZT powder, with the addition of 5% Pb(MgNb)O3, could be completely sintered by maintaining it at 1100\u00b0C for 2h.", + "Phase pure polycrystalline sample of Li2VPO6 was synthesized on the 10g scale through standard solid-state reaction techniques. Stoichiometric amounts of Li2CO3 (Hayashi, 98%), V2O5 (Aldrich, 99%), and NH4H2PO4 (Aldrich, 98%) were thoroughly mixed with an agate mortar and pestle and pressed into pellets. The pellets were transferred to an alumina crucible and were gradually heated to 580\u00b0C for 36h with an intermediate regrinding. The pellets were cooled at a rate of 10\u00b0Ch-1 to room temperature.", + "Large 3.5\u00a0g samples of Li6ALa2Nb2O12 (A\u00a0=\u00a0Ca, Sr) were prepared by standard solid state synthesis from stoichiometric amounts of SrCO3, CaCO3, La2O3, Nb2O5, and a 10% excess of isotopically enriched Li2CO3 (>\u00a099% 7Li: so as to reduce problems with the high neutron absorption cross-section of 6Li). The intimately mixed powders were first heated at 700\u00a0\u00b0C for 14\u00a0h, before regrinding and pressing as pellets (2\u00a0cm diameter). The pellets were then heated at 900\u00a0\u00b0C for 14\u00a0h. Phase purity was determined using X-ray powder diffraction (Panalytical X'pert pro system with an X'celerator detector). 
Conductivity measurements were performed using AC impedance spectroscopy (Hewlett Packard 4182A Impedance Analyser). Samples for measurement were prepared by pressing into 13\u00a0mm diameter pellets and firing for 2\u00a0h at 925\u00a0\u00b0C. Au electrodes were then affixed to the pellets using Au paste and the pellet fired again, at 700\u00a0\u00b0C for 30\u00a0min, to give good electrical contact between sample and electrode, and the sample was cooled to room temperature in the furnace.", + "The phosphors of Zn3Ga2Ge2-xSixO10:2.5mol% Cr3+ were synthesized via conventional high temperature solid-state reaction method. According to our previous experiments, the most suitable sintering temperature is 1300\u00b0C and the chromium doping amount is 2.5% . Firstly, ZnO (AR), SiO2 (AR), Cr2O3 (AR), Ga2O3 (99.999%) and GeO2 (99.999%) raw powders were weighed accurately according to the stoichiometric composition of samples. Then the materials were put into an agate mortar and ground thoroughly to form homogeneous fine powder. The mixed powder was pressed into cylindrical compact discs under a uniaxial pressure of 20MPa after pre-sintering at 800\u00b0C for 1.5h in air. And finally the discs were put into alumina crucibles and sintered in an oven at 1300\u00b0C in air for 3h to form a solid bulk sample.", + "Samples of Srn+1FenO3n+1 with n=1, 2, 3 and \u221e were synthesized from stoichiometric amounts of Sr and Fe precursors, starting with a combination of either SrCO3 and Fe2O3 (n=1, 3, \u221e) or Sr(NO3)2 and Fe(NO3)3\u00b79H2O (n=2). For each case the synthesis conditions employed were carefully optimized on the basis of thermal stability of the phase concerned. 
The n=2 and \u221e members of the series are stable at high temperatures (above 1000\u00a0\u00b0C), whereas the n=1 and 3 members are stable only at low temperatures and therefore rather difficult to be obtained in single-phase form [23], [24]\u00a0and\u00a0[25].", + "The Sr2FeO4 (n=1) sample was synthesized through a nitrate precipitation route, by dissolving the starting materials in concentrated nitric acid and then letting the excess solvent evaporate on a hot plate, after which the nitrate residue thus obtained was calcined in air at 600\u00a0\u00b0C for 12\u00a0h. The final heat treatment was carried out in O2 gas flow at 750\u00a0\u00b0C for ca. 70\u00a0h. The Sr3Fe2O7 (n=2) sample was synthesized through another wet-chemical route, in which the metal cations are chelated by ethylenediaminetetraacetic acid (EDTA) from an aqueous nitrate solution [22]. The excess solution was evaporated in a hot water bath and the resultant dry gel was calcined in air at 450\u00a0\u00b0C for 4\u00a0h. The final heat treatment was carried out in air at 1000\u00a0\u00b0C with several intermediate grindings for ca. 110\u00a0h in total. The Sr4Fe3O10 (n=3) and SrFeO3 (n=\u221e) members were obtained through solid-state synthesis routes. The well-ground precursor mixtures were heat treated in air at 850\u00a0\u00b0C (for ca. 70\u00a0h) and 1100\u00a0\u00b0C (for 24\u00a0h), respectively. For all the four phases, the sample was quenched to room temperature after the final heat treatment, followed by immediate characterization of the as-synthesized sample for phase purity and lattice parameters by means of X-ray powder diffraction (XRD; Rikagu RINT2000 equipped with a rotating Cu anode). 
After the characterization, the as-synthesized samples were exposed to ambient air (temperature 20\u00b12\u00a0\u00b0C, relative humidity 45\u00b110%) or dipped in distilled water in order to study their tendencies to form water derivatives.", + "Ba2LnFeNb4O15 (Ln\u00a0=\u00a0La, Pr, Nd, Sm, Eu, Gd) compounds have been obtained by conventional solid state route, from stoichiometric mixtures of BaCO3, Fe2O3, Nb2O5 and Ln2O3, all reagents with 99.9% minimal purity grade. Ceramics were sintered at 1300\u00a0\u00b0C and exhibited densities ranging from 90 to 96% of the nominal density of the products.", + "Samples of Li2MnSiO4 were synthesized by a solid-state route. Stoichiometric quantities of LiOH (Sigma\u2013Aldrich, >98%), MnCO3 (Sigma\u2013Aldrich, >99.9%), SiO2 (fumed, Sigma\u2013Aldrich, 0.007\u00a0\u03bcm), together with 20\u00a0mol% adipic acid, were milled with dry hexane in a vibratory ball mill for 1\u00a0h. The mixed powders were heated at 1\u00a0\u00b0C\u00a0per\u00a0min to 450\u00a0\u00b0C for 10\u00a0h under dynamic vacuum to decompose the precursors. The resulting fine, dark brown powder was ground in a mortar and pestle and then heated to 700\u00a0\u00b0C for 10\u00a0h in argon in a tube furnace to prevent oxidation of the Mn2+. To complete the reaction, the sample was heated to 900\u00a0\u00b0C for a further 10\u00a0h in argon and allowed to cool to room temperature in the furnace. Sample powders were stored in an argon glovebox.", + "The mixed oxide solid-state method was used to prepare BaZr0.15Ti0.85O3. The stoichiometric composition, hereafter will be designated as BZT, was prepared from TiO2, BaCO3, (Riedel-deHa\u00ebn 99.9%), and ZrO2 (UniChem 99%). High-purity reagents were weighed in accordance with the stoichiometric compositions and mixed by ball milling for 5h in Teflon pots using zirconia balls and with ethanol as a solvent. The powders were dried at 230\u00b0C for 2h before weighing and mixing. 
Calcination was carried out at 1150\u00b0C for 2h. 0.5 and 2.0 wt% of ZnO (DUKSAN 99.9%) were then added to the calcined powder by wet mixing. Uniaxial pressing of the calcined powders was done at 170MPa and the pellets were formed with the dimensions of 10mm in diameter and 1.5mm in thickness, which were then sintered at 1300\u00b0C for 4h.", + "The ternary compounds MTe2O6 (s) (M: Th, Ce) were prepared by solid state reaction of MO2 (s) and TeO2 (s) in the mole ratio 1:2. The powders were homogenized by mixing them intimately. The homogenized powders were cold compacted into cylindrical pellets and then heated at 923K for 24h. The pellets were withdrawn intermittently during the heat treatment from the furnace, powdered, re-compacted and then heat treated again. This cycle was repeated twice in order to ensure complete formation of the desired ternary compound.", + "Undoped and 0.5\u00a0at.% Co-doped TiO2 powders were synthesized by standard solid-state reaction method. High purity of commercial TiO2 anatase and CoCO3\u00b7H2O powders were mixed thoroughly and sintered at 1000\u00a0\u00b0C for 10\u00a0h in air. The resulting powder was grinded carefully using an agate mortar to avoid any magnetic impurity contamination. The cationic composition of the powder was checked by means of energy dispersive X-ray spectroscopy (EDS). Additional annealing procedures were made at 450\u00a0\u00b0C for 10\u00a0h under a mixture of N2/H2 (5% H2) flow in order to create an oxygen vacancy in the samples.", + "The compounds of nominal composition Li5La3Nb2-xYxO12-\u03b4 (0\u2264x\u22641) were prepared by a solid-state reaction using appropriate amount of the precursors of high purity LiNO3 (99%, Alfa Aesar) La2O3 (99.99%, Alfa Aesar, pre-heated at 800\u00b0C for 24h), Nb2O5 (99.5%, Alfa Aesar) and Y(NO3)3 (99.9%, Alfa Aesar). LiNO3 was added in 10wt% excess to account for the loss of Li2O during annealing. 
The mixtures were ball milled in Pulverisette, Fritsch, Germany ball mill at 200rpm for 12h in 2-propanol using zirconia balls before and after heat treatment at 700\u00b0C for 6h. The powders obtained were pressed into pellets using isostatic press, (P.O. Weber, Germany) employing a pressure of 300kN for 5min. The pellets were sintered at 900\u00b0C for 24h and 1000-1100\u00b0C for 6h covered with the same powder to suppress potential volatilization of Li2O during the preparation. For the chemical stability test, a spinel-type cathode material Li2FeMn3O8 was prepared by the conventional solid state method using the high purity precursors such as Li2CO3, FeC2O4\u00b72H2O, and MnCO3 at 700\u00b0C for 24h in air.", + "The conventional solid-state method was employed to prepare YIG ceramics with various amounts of Ce2O3. The starting raw materials were Y2O3 (yttria, 99.9\u00a0% reagent grade, Aldrich) and Fe2O3 (ferric, 99.0\u00a0% reagent grade, Aldrich) and CeO2 (ceric, 99.0\u00a0% reagent grade, Aldrich). The raw materials were weighted and mixed according to the Y3\u2212xCexFe5O12 system with x\u00a0=\u00a00.0, 0.1, 0.3, 0.5, 1.0, 1.1, 1.2, 1.3, 1.4 and 1.5, respectively. Then, the combination of raw materials was added with ethanol and zirconia ball in the weight ratio of 1:5 (powder mixtures to ethanol) and 1:10 (powder mixtures to ball) of weight ratio.", + "The mixing was done with Heidoph Overhead Mixer (Reax 2) for 8\u00a0h at 25\u00a0rpm. The mixtures or slurries were dried at 80\u00a0\u00b0C (Memmert oven-UM 400) for 24\u00a0h to form dried cakes. The cake was ground using an agate mortar before being put into an alumina crucible and calcined at 1,100\u00a0\u00b0C for 6\u00a0h. The calcined powders were then ground before compacting into 16\u00a0mm cylindrical pellets using hardened steel die (150\u00a0MPa). The pellets were then sintered at 1,420\u00a0\u00b0C for 6\u00a0h.", + "Undoped yttrium aluminate powder was prepared by the citrate gel route. 
The starting materials taken were Y(NO3)3\u22c55H2O (Aldrich, 99.99% pure) and Al(NO3)3\u22c59H2O (Aldrich, 99.99+% pure). The precursor solution was prepared by dissolving the required amount of metal nitrates in distilled water. A calculated amount of citric acid was added to the solution: 1\u00a0mol of trivalent cations needed 1\u00a0mol of citric acid. Twice the amount of citric acid to metal cations in distilled water was added dropwise to the prepared aqueous solution to chelate Y3+ and Al3+ cations in the solution. The solution was continuously stirred for about 3\u20134\u00a0h at 80\u00a0\u00b0C until a brown gel was obtained. The gel was dried in an oven at about 130\u00a0\u00b0C for 4\u00a0h. The dried resin was charred in a furnace at 400\u00a0\u00b0C for 1\u00a0h to obtain the dried precursor. The precursor powder was calcined at 1,100\u00a0\u00b0C for 2\u00a0h. A subsequent 24\u00a0h ball milling in polyethylene jar containing zirconia balls in ethanol was performed to break the soft agglomerates and mix the powders to homogeneity. The calcined powders were then uniaxially cold-pressed at 30\u00a0kg/cm2 for 60\u00a0s into a 10\u00a0mm diameter pellet. The pressed pellets were sintered at 1,575\u00a0\u00b0C for 12\u00a0h. Heating rate was 5\u00a0\u00b0C/min, and the sample was cooled at 5\u00a0\u00b0C/min until 800\u00a0\u00b0C and then furnace-cooled to room temperature.", + "\\( {{\\hbox{Y}}_{1 - x}}{\\hbox{C}}{{\\hbox{a}}_x}{\\hbox{Al}}{{\\hbox{O}}_{3 - \\delta }}\\left( {x = 0.03 - 0.25} \\right) \\) was prepared by the conventional solid-state synthesis route. The starting materials taken were Y2O3 (Alfa Aesar, 99.99% pure), Al2O3 (Aldrich, 99.99+% pure), and CaCO3 (Aldrich, 99.5% pure). Y2O3 was pre-calcined in the furnace for about 5\u00a0h at 1,000\u00a0\u00b0C to ensure the correct stoichiometry. 
Powders were weighed and mixed to homogeneity in polyethylene jar containing zirconia balls in ethanol for about 48\u00a0h to obtain the desired composition. A 1:2 weight ratio of powder to zirconia balls was used. The mixed powders were filtered, dried, and calcined at 1,200\u00a0\u00b0C for about 18\u00a0h. The calcined powders were subjected to ball milling again with zirconia balls in ethanol for 48\u00a0h to break the agglomerates. A 1:4 weight ratio of powder to zirconia balls was used this time to achieve a smaller particle size. These powders were uniaxially cold-pressed at 30\u00a0kg/cm2 for 60\u00a0s into pellets of diameter 10\u00a0mm. Compositions with x between 0.03 and 0.12 were sintered for 12\u00a0h at 1575\u00a0\u00b0C. Compositions with x between 0.18 and 0.25 were sintered at 1,525\u00a0\u00b0C for 12\u00a0h. Heating rate for all cases was 5\u00a0\u00b0C/min, and all samples were furnace-cooled to room temperature. Density was calculated for the pellets from their geometry, and the relative density with respect to the theoretical density was estimated.", + "A homogenous powder mixture of 2\u00a0g of Degussa P25 TiO2 and 12\u00a0g of NaOH micro-pellets in a Ni crucible was annealed at 600\u00a0\u00b0C for 5\u00a0min in a preheated furnace, after which the cooled melt was dispersed in water and subsequently aged at room temperature for 0, 1, 5 or 14\u00a0days. The total volume of the dispersion obtained was 120\u00a0ml and its pH was >14 due to the high excess of NaOH. After each aging process, a portion of the dispersion was centrifuged at 7000\u00a0rpm and the sediment was redispersed in water. This washing procedure was repeated several times to remove the NaOH excess from the dispersion. The washed sediment was divided two equal portions, one of which was dried at 50\u00a0\u00b0C, while the other was treated with HCl solution. 
The dried samples without acid treatment were denoted Na-TNT_1, Na-TNT_5 and Na-TNT_14, the numbers in the designations of the samples indicating the aging time in days. The as-prepared sample (without aging) was denoted Na-TNT_0.", + "The acid treatment of the water-washed sediments was performed by stirring the sample for 12\u00a0h in 100\u00a0ml of 0.1\u00a0M HCl solution at room temperature. The white dispersions were centrifuged and the sediments obtained were washed with water and finally, dried at 50\u00a0\u00b0C. These samples were denoted H-TNT_0, H-TNT_1, H-TNT_5 and H-TNT_14, again including the applied aging time.", + "The samples involved in this study were synthesized by conventional solid-state method using high-purity MgO(\u226599.0%), TiO2(\u226599.0%), CaCO3(\u226599.0%), La2O3(\u226599.9%), Al2O3(\u226599.0%). MgO was additionally fired at 700\u00a0\u00b0C to remove water and CO2 due to its hygroscopic property. 0.95MgTiO3\u20130.05CaTiO3 (defined as 95MCT) and LaAlO3 powders were prepared by mixing all the raw materials according to the desired stoichiometry. Mixtures were milled in distilled water for 12\u00a0h with agate balls and then dried at 100\u2013150\u00a0\u00b0C to obtain homogenized powders. The preliminary heat treatment (calcination) of the powders was carried out at 1100\u00a0\u00b0C/3\u00a0h and 1250\u00a0\u00b0C/3\u00a0h in air in alumina crucibles respectively. After calcination, the powders 95MCT and LaAlO3 were mixed according to the molar fraction 100:1 and then re-milled for 12\u00a0h. The fine powders granulated by sieving through an 80 mesh together with the organic binder were pressed into pellets with 10 mm in diameter and 4\u20135\u00a0mm in thickness. All the pellets were sintered at 1175\u20131275\u00a0\u00b0C for 4\u00a0h in air.", + "LiCoPO4 was prepared using Co(NO3)2\u00b76H2O, NH4H2PO4 and LiOH\u00b7H2O as the raw materials. 
Stoichiometric Co(NO3)2\u00b76H2O (4.365\u00a0g), NH4H2PO4 (1.726\u00a0g), LiOH\u00b7H2O (0.630\u00a0g) and 6.3\u00a0g of citric acid (Co:Li:citric acid\u00a0=\u00a01:1:2) were initially mixed roughly in an agate mortar. Then, about 2\u00a0ml distilled water were added to form rheological phase mixture which reacted in an oven at 150\u00a0\u00b0C for 6\u00a0h. The resulting precursors were calcined at 750\u00a0\u00b0C for 10\u00a0h.", + "LiCoPO4 was also synthesized by a conventional solid-state method. The raw materials and proportion were the same as those of the above method. To obtain a pure olivine phase, the full mixed compounds was pre-calcined at 350\u00a0\u00b0C for 9\u00a0h [7], the compound was cooled down and ground again. Finally, the obtained precursor was calcined at 750\u00a0\u00b0C for 10\u00a0h.", + "Six compounds, La8.65Sr1.35(Si6O24)O2.32, La8.65Sr1.35(Ge6O24)O2.32, La9Sr1(Si5.5Al0.5O24)O2.25, La9.67\u25a10.33(Si5.5Al0.5O24)O2.25, La8.5Sr1.5(Si5.5Al0.5O24)O2 and La9.5\u25a10.5(Si5.5Al0.5O24)O2 have been prepared by the ceramic method in Pt crucibles using high purity oxides: La2O3 (Alfa, 99.999%), GeO2 (Aldrich, 99.998%), SiO2 (ABCR, quartz powder 99.31%), \u03b3-Al2O3 (Alfa, 99.997%) and SrCO3 (Alfa 99.99%). Lanthanum oxide was precalcined at 1273K for 2h in order to achieve decarbonation.", + "The ceramics of (0.70Bi1.05FeO3\u20130.30BaTiO3): xmol additives (x\u2009=\u20090.5 and 1.0%, additives including Bi2O3,MnO2, La2O3, Sm2O3, SnO2, and Nb2O5) were fabricated by the conventional solid-state method with a quenching process. Raw materials include Bi2O3 (99%), Fe2O3(99%), BaCO3 (99%), TiO2 (98%), MnO2 (99.5%), La2O3 (99.9%), Sm2O3 (99.9%), SnO2 (99.5%), and Nb2O5 (99.5%). In this work, two-step methods were adopted to prepare the ceramic samples. The raw materials including Bi2O3 (99%), Fe2O3(99%), BaCO3 (99%), and TiO2 (98%) were weighted and ball milled for 24\u00a0h with alcohol. 
Those mixing slurries were dried and calcined at 700\u2009\u00b0C for 2\u00a0h. After that, the calcined powders and those additives were weighted according to mole ratio and ball milled again. The pellets with 10\u00a0mm diameter and 0.6\u00a0mm thickness were pressed under a pressure of 10\u00a0MPa using 6\u20138\u00a0wt% polyvinyl alcohol (PVA) as a binder. After burning off PVA at 500\u2009\u00b0C for 3\u00a0h, the pellets were sintered at 960\u20131000\u2009\u00b0C for 3\u00a0h in air and were then quenched in water, and the sintering temperatures were dependent on the compositions. For electrical measurement, both sides of the sintered samples were pasted on silver slurry and then fired at 600\u2009\u00b0C for 10\u00a0min. The samples were poled at 120\u2009\u00b0C in a silicon oil bath under a dc field of 5\u00a0kV/mm." ] \ No newline at end of file diff --git a/materials_entity_recognition/scripts/__init__.py b/materials_entity_recognition/scripts/__init__.py index 70f3703..733d4c1 100644 --- a/materials_entity_recognition/scripts/__init__.py +++ b/materials_entity_recognition/scripts/__init__.py @@ -1,6 +1,9 @@ from .mat_models import MatIdentification from .mat_models import MatRecognition from .mat_models import MatTPIdentification +from .mat_models import MatIdentificationBagging +from .mat_models import MatRecognitionBagging +from .mat_models import MatTPIdentificationBagging __author__ = 'Tanjin He' __maintainer__ = 'Tanjin He' diff --git a/materials_entity_recognition/scripts/loader.py b/materials_entity_recognition/scripts/loader.py index 6549684..a6b9c06 100644 --- a/materials_entity_recognition/scripts/loader.py +++ b/materials_entity_recognition/scripts/loader.py @@ -140,10 +140,15 @@ def cap_feature(s): return 3 -def prepare_sentence(str_words, word_to_id, char_to_id, lower=False, - use_key_word=False, use_topic=False, - use_CHO=False, use_eleNum=False, input_tokens=[], - original_para_text=''): +def prepare_sentence(str_words, + word_to_id, + 
char_to_id, + lower=False, + use_CHO=False, + use_eleNum=False, + input_tokens=[], + original_para_text='', + ): """ Prepare a sentence for evaluation. diff --git a/materials_entity_recognition/scripts/mat_models.py b/materials_entity_recognition/scripts/mat_models.py index ba8785d..9eb5c6d 100644 --- a/materials_entity_recognition/scripts/mat_models.py +++ b/materials_entity_recognition/scripts/mat_models.py @@ -62,14 +62,18 @@ def mat_identify_sent(self, input_sent): input = create_input(sentence, self.parameters, False) # Prediction if self.parameters['crf']: - y_preds = np.array(self.f_eval(*input))[1:-1] + tags_scores, output = self.f_eval(*input) + y_preds = np.array(output)[1:-1] else: y_preds = self.f_eval(*input).argmax(axis=1) y_preds = [self.model.id_to_tag[y_pred] for y_pred in y_preds] y_preds = iobes_iob(y_preds) mat_begin = False for tmp_index, y_pred in enumerate(y_preds): - if y_pred == 'B-Mat': + if ( + y_pred == 'B-Mat' + # or (y_pred == 'I-Mat' and mat_begin == False) + ): materials.append(input_sent[tmp_index].copy()) materials[-1]['token_ids'] = [tmp_index, ] mat_begin = True @@ -101,9 +105,9 @@ def mat_identify(self, input_para, pre_tokens=None): # prepare input sentences for LSTM input_sent = [ { - 'text': tmp_token.text, - 'start': tmp_token.start, - 'end': tmp_token.end, + 'text': tmp_token.text, + 'start': tmp_token.start, + 'end': tmp_token.end, } for tmp_token in tmp_sent.tokens ] all_sents.append(input_sent) @@ -205,7 +209,8 @@ def matTP_identify_sent(self, input_sent): input = create_input(sentence, self.parameters, False) # Prediction if self.parameters['crf']: - y_preds = np.array(self.f_eval(*input))[1:-1] + tags_scores, output = self.f_eval(*input) + y_preds = np.array(output)[1:-1] else: y_preds = self.f_eval(*input).argmax(axis=1) y_preds = [self.model.id_to_tag[y_pred] for y_pred in y_preds] @@ -294,7 +299,7 @@ def matTP_identify(self, input_para, pre_tokens=None): return all_materials, precursors, targets, other_materials 
-class MatRecognition(): +class MatRecognition(object): """ Use LSTM for materials recognition """ @@ -357,40 +362,27 @@ def mat_recognize_sent(self, input_sent, ori_para_text=''): recognitionResult = {'precursors': [], 'targets': [], 'other_materials': []} # Prepare input words = [tmp_token['text'] for tmp_token in input_sent] - if self.parameters['keyword_dim'] != 0: - sentence = prepare_sentence(words, self.word_to_id, self.char_to_id, \ - lower=self.parameters['lower'], use_key_word=True) - elif self.parameters['topic_dim'] != 0: - sentence = prepare_sentence(words, self.word_to_id, self.char_to_id, \ - lower=self.parameters['lower'], use_topic=True) - elif self.parameters['has_CHO'] or self.parameters['ele_num']: - sentence = prepare_sentence(words, self.word_to_id, self.char_to_id, \ - lower=self.parameters['lower'], \ - use_key_word=False, use_topic=False, \ - use_CHO=self.parameters['has_CHO'], use_eleNum=self.parameters['ele_num'], \ - input_tokens=input_sent, original_para_text=ori_para_text) + if self.parameters['has_CHO'] or self.parameters['ele_num']: + sentence = prepare_sentence(words, + self.word_to_id, + self.char_to_id, + lower=self.parameters['lower'], + use_CHO=self.parameters['has_CHO'], + use_eleNum=self.parameters['ele_num'], + input_tokens=input_sent, + original_para_text=ori_para_text) else: - sentence = prepare_sentence(words, self.word_to_id, self.char_to_id, \ - lower=self.parameters['lower'], \ - use_key_word=False, use_topic=False, use_CHO=False, use_eleNum=False) - # use_key_word = False - # use_topic = False - # use_CHO - # use_eleNum - # usePos - # if self.parameters.get('keyword_dim', 0) != 0: - # use_key_word = True - - - # sentence = prepare_sentence(words, self.word_to_id, self.char_to_id, - # lower=self.parameters['lower'], - # use_key_word=(self.parameters['keyword_dim'] != 0), - # use_topic=(self.parameters['topic_dim'] != 0), - # ) + sentence = prepare_sentence(words, + self.word_to_id, + self.char_to_id, + 
lower=self.parameters['lower'], + use_CHO=False, + use_eleNum=False) input = create_input(sentence, self.parameters, False) # Prediction if self.parameters['crf']: - y_preds = np.array(self.f_eval(*input))[1:-1] + tags_scores, output = self.f_eval(*input) + y_preds = np.array(output)[1:-1] else: y_preds = self.f_eval(*input).argmax(axis=1) y_preds = [self.model.id_to_tag[y_pred] for y_pred in y_preds] @@ -561,3 +553,368 @@ def mat_recognize(self, input_para, materials=None, pre_tokens=None): return result + +class MatIdentificationBagging(MatIdentification): + """ + Use LSTM for materials identification + """ + + def __init__(self, model_path=None, bagging=[]): + """ + :param model_path: path to the model for materials recognition. If None input, default initialize. + """ + self.identify_models = [] + if bagging: + for tmp_path in bagging: + self.identify_models.append( + MatIdentification( + model_path=tmp_path, + ) + ) + else: + self.identify_models.append( + MatIdentification( + model_path=model_path, + ) + ) + + def mat_identify_sent(self, input_sent): + """ + Identify materials in a sentence, which is a list of tokens. 
+ + :param input_sent: list of tokens representing a sentence + :return materials: list of materials from LSTM + """ + # goal + materials = [] + + all_y_preds = [] + all_tags_scores = [] + + standard_id_to_tag = self.identify_models[0].model.id_to_tag + standard_tag_to_id = {v:k for (k, v) in standard_id_to_tag.items()} + + words = [tmp_token['text'] for tmp_token in input_sent] + for tmp_model in self.identify_models: + # Prepare input + sentence = prepare_sentence(words, tmp_model.word_to_id, tmp_model.char_to_id, + lower=tmp_model.parameters['lower']) + input = create_input(sentence, tmp_model.parameters, False) + # Prediction + if tmp_model.parameters['crf']: + tags_scores, output = tmp_model.f_eval(*input) + tags_scores_exp = np.exp(tags_scores) + tags_scores_normalized = tags_scores_exp / np.sum(tags_scores_exp, axis=1)[:,None] + old_ids = sorted(tmp_model.model.id_to_tag.keys()) + new_ids = [ + standard_tag_to_id[ + tmp_model.model.id_to_tag[ + tmp_old_id + ] + ] + for tmp_old_id in old_ids + ] + tags_scores_normalized[:, new_ids] = tags_scores_normalized[:, old_ids] + all_tags_scores.append(tags_scores_normalized) + y_preds = np.array(output)[1:-1] + # y_preds = np.array(tmp_model.f_eval(*input))[1:-1] + else: + y_preds = tmp_model.f_eval(*input).argmax(axis=1) + + y_preds_score = np.zeros((len(y_preds), len(tmp_model.model.id_to_tag))) + for i, y_pred in enumerate(y_preds): + y_preds_score[ + i, + standard_tag_to_id[ + tmp_model.model.id_to_tag[y_pred] + ] + ] = 1.0 + + all_y_preds.append(y_preds_score) + + # bagging + bagged_ids = [] + all_y_preds = sum(all_y_preds) + if all_tags_scores: + all_tags_scores = sum(all_tags_scores) + sequence_len = len(all_y_preds) + for i in range(sequence_len): + sorted_y_preds = sorted(all_y_preds[i], reverse=True) + if (sorted_y_preds[0] == sorted_y_preds[1]) \ + and (len(all_tags_scores) > 0): + bagged_ids.append(np.argmax(all_tags_scores[i])) + else: + bagged_ids.append(np.argmax(all_y_preds[i])) + y_preds = 
class MatRecognitionBagging(MatRecognition):
    """
    Use a bag (ensemble) of LSTM models for materials recognition.

    Every model in the bag tags the sentence independently; the final tag of
    each token is decided by majority vote. Ties are broken with the summed,
    softmax-normalized tag scores of the CRF models (when there are any).
    """

    def __init__(self, model_path=None,
                 mat_identify_model_path=None,
                 parse_dependency=False, use_topic=False,
                 bagging=None, mat_identify_bagging=None):
        """
        :param model_path: path to the model for materials recognition. If None input, default initialize.
                        Only used when ``bagging`` is empty.
        :param mat_identify_model_path: path to the model for materials identification. If None input, default initialize.
        :param parse_dependency: parse dependency or not. If True, the parsed dependency will be used as the key word feature.
        :param use_topic: forwarded unchanged to every MatRecognition in the bag.
        :param bagging: iterable of model paths; one MatRecognition is loaded per path.
                        If empty or None, a single model is loaded from ``model_path``.
        :param mat_identify_bagging: iterable of model paths forwarded to MatIdentificationBagging
                        as its ``bagging`` argument. None is treated as an empty list.
        """
        # NOTE: MatRecognition.__init__ is not called here (unchanged from the
        # previous implementation); only the attributes assigned below are set
        # on this object.
        # None replaces the former mutable ``[]`` defaults; both mean "no bag".
        model_paths = list(bagging) if bagging else [model_path]
        self.recognition_models = [
            MatRecognition(
                model_path=tmp_path,
                mat_identify_model_path=mat_identify_model_path,
                parse_dependency=parse_dependency,
                use_topic=use_topic,
            )
            for tmp_path in model_paths
        ]

        # Expose only the feature switches of the first model in the bag.
        valid_keys = {'has_CHO', 'ele_num'}
        self.parameters = {
            tmp_key: tmp_value
            for (tmp_key, tmp_value) in self.recognition_models[0].parameters.items()
            if tmp_key in valid_keys
        }

        self.identify_model = MatIdentificationBagging(
            model_path=mat_identify_model_path,
            bagging=mat_identify_bagging if mat_identify_bagging else [],
        )

    def mat_recognize_sent(self, input_sent, ori_para_text=''):
        """
        Recognize target/precursor in a sentence, which is a list of tokens.

        :param input_sent: list of tokens representing a sentence; each token
                        must provide a 'text' entry
        :param ori_para_text: text of the original paragraph, forwarded to
                        prepare_sentence when a model uses the has_CHO or
                        ele_num feature
        :return recognitionResult: dict containing keys of precursors, targets, and other materials,
                        the value of each one is a list of index of token in the sentence
        """
        # goal
        recognitionResult = {'precursors': [], 'targets': [], 'other_materials': []}
        all_y_preds = []
        all_tags_scores = []

        # The tag ids of the first model define the common id space that the
        # predictions of all models are mapped into before voting.
        standard_id_to_tag = self.recognition_models[0].model.id_to_tag
        standard_tag_to_id = {v: k for (k, v) in standard_id_to_tag.items()}

        words = [tmp_token['text'] for tmp_token in input_sent]
        for tmp_model in self.recognition_models:
            # Prepare input
            if tmp_model.parameters['has_CHO'] or tmp_model.parameters['ele_num']:
                sentence = prepare_sentence(words,
                                            tmp_model.word_to_id,
                                            tmp_model.char_to_id,
                                            lower=tmp_model.parameters['lower'],
                                            use_CHO=tmp_model.parameters['has_CHO'],
                                            use_eleNum=tmp_model.parameters['ele_num'],
                                            input_tokens=input_sent,
                                            original_para_text=ori_para_text)
            else:
                sentence = prepare_sentence(words,
                                            tmp_model.word_to_id,
                                            tmp_model.char_to_id,
                                            lower=tmp_model.parameters['lower'],
                                            use_CHO=False,
                                            use_eleNum=False)

            model_input = create_input(sentence, tmp_model.parameters, False)
            # Prediction
            if tmp_model.parameters['crf']:
                tags_scores, output = tmp_model.f_eval(*model_input)
                # Row-wise softmax. Subtracting the row maximum leaves the
                # result mathematically unchanged but keeps np.exp from
                # overflowing to inf (which would turn the scores into NaN).
                tags_scores_exp = np.exp(
                    tags_scores - np.max(tags_scores, axis=1, keepdims=True)
                )
                tags_scores_normalized = tags_scores_exp / np.sum(
                    tags_scores_exp, axis=1, keepdims=True
                )
                # Reorder the columns from this model's tag ids to the
                # standard tag ids. The fancy-indexed right-hand side is a
                # copy, so assigning onto the same array is safe.
                old_ids = sorted(tmp_model.model.id_to_tag.keys())
                new_ids = [
                    standard_tag_to_id[tmp_model.model.id_to_tag[tmp_old_id]]
                    for tmp_old_id in old_ids
                ]
                tags_scores_normalized[:, new_ids] = tags_scores_normalized[:, old_ids]
                all_tags_scores.append(tags_scores_normalized)
                # Drop the first and last element of the decoded sequence
                # (presumably the CRF begin/end padding - confirm in model.py).
                y_preds = np.array(output)[1:-1]
            else:
                y_preds = tmp_model.f_eval(*model_input).argmax(axis=1)

            # One-hot vote of this model for every token, in standard tag ids.
            y_preds_score = np.zeros((len(y_preds), len(tmp_model.model.id_to_tag)))
            for i, y_pred in enumerate(y_preds):
                y_preds_score[
                    i,
                    standard_tag_to_id[tmp_model.model.id_to_tag[y_pred]]
                ] = 1.0

            all_y_preds.append(y_preds_score)

        # bagging
        vote_counts = sum(all_y_preds)
        summed_scores = sum(all_tags_scores) if all_tags_scores else None
        bagged_ids = []
        for i, token_votes in enumerate(vote_counts):
            ranked_votes = sorted(token_votes, reverse=True)
            is_tie = len(ranked_votes) > 1 and ranked_votes[0] == ranked_votes[1]
            if is_tie and summed_scores is not None:
                # Tie between the two most voted tags: fall back to scores.
                bagged_ids.append(np.argmax(summed_scores[i]))
            else:
                bagged_ids.append(np.argmax(token_votes))
        y_preds = [standard_id_to_tag[tmp_id] for tmp_id in bagged_ids]

        # result
        y_preds = iobes_iob(y_preds)
        for tmp_index, y_pred in enumerate(y_preds):
            # Only the beginning tag of each entity is recorded.
            if y_pred == 'B-Pre':
                recognitionResult['precursors'].append(tmp_index)
            elif y_pred == 'B-Tar':
                recognitionResult['targets'].append(tmp_index)
            elif y_pred == 'B-Mat':
                recognitionResult['other_materials'].append(tmp_index)
        return recognitionResult
class MatTPIdentificationBagging(MatTPIdentification):
    """
    Use a bag (ensemble) of LSTM models for materials identification.

    Every model in the bag tags the sentence independently; the final tag of
    each token is decided by majority vote. Ties are broken with the summed,
    softmax-normalized tag scores of the CRF models (when there are any).
    """

    def __init__(self, model_path=None, bagging=None):
        """
        :param model_path: path to the model for materials recognition. If None input, default initialize.
                        Only used when ``bagging`` is empty.
        :param bagging: iterable of model paths; one MatTPIdentification is loaded per path.
                        If empty or None, a single model is loaded from ``model_path``.
        """
        # NOTE: MatTPIdentification.__init__ is not called here (unchanged
        # from the previous implementation).
        # None replaces the former mutable ``[]`` default; both mean "no bag".
        model_paths = list(bagging) if bagging else [model_path]
        self.matTP_identify_models = [
            MatTPIdentification(
                model_path=tmp_path,
            )
            for tmp_path in model_paths
        ]

    def matTP_identify_sent(self, input_sent):
        """
        Identify materials in a sentence, which is a list of tokens.

        :param input_sent: list of tokens representing a sentence; each token
                        must provide 'text' and 'end' entries. The tokens are
                        not modified.
        :return recognitionResult: dict with keys all_materials, precursors,
                        targets and other_materials. Each value is a list of
                        token copies, one per material, whose 'text' and 'end'
                        are extended over all tokens of that material.
        """
        import copy

        # goal
        recognitionResult = {'all_materials': [], 'precursors': [], 'targets': [], 'other_materials': []}
        type_to_abbr = {'precursors': 'Pre', 'targets': 'Tar', 'other_materials': 'Mat'}
        abbr_to_type = {v: k for (k, v) in type_to_abbr.items()}

        all_y_preds = []
        all_tags_scores = []

        # The tag ids of the first model define the common id space that the
        # predictions of all models are mapped into before voting.
        standard_id_to_tag = self.matTP_identify_models[0].model.id_to_tag
        standard_tag_to_id = {v: k for (k, v) in standard_id_to_tag.items()}

        words = [tmp_token['text'] for tmp_token in input_sent]
        for tmp_model in self.matTP_identify_models:
            # Prepare input
            sentence = prepare_sentence(words, tmp_model.word_to_id, tmp_model.char_to_id,
                                        lower=tmp_model.parameters['lower'])
            model_input = create_input(sentence, tmp_model.parameters, False)
            # Prediction
            if tmp_model.parameters['crf']:
                tags_scores, output = tmp_model.f_eval(*model_input)
                # Row-wise softmax. Subtracting the row maximum leaves the
                # result mathematically unchanged but keeps np.exp from
                # overflowing to inf (which would turn the scores into NaN).
                tags_scores_exp = np.exp(
                    tags_scores - np.max(tags_scores, axis=1, keepdims=True)
                )
                tags_scores_normalized = tags_scores_exp / np.sum(
                    tags_scores_exp, axis=1, keepdims=True
                )
                # Reorder the columns from this model's tag ids to the
                # standard tag ids. The fancy-indexed right-hand side is a
                # copy, so assigning onto the same array is safe.
                old_ids = sorted(tmp_model.model.id_to_tag.keys())
                new_ids = [
                    standard_tag_to_id[tmp_model.model.id_to_tag[tmp_old_id]]
                    for tmp_old_id in old_ids
                ]
                tags_scores_normalized[:, new_ids] = tags_scores_normalized[:, old_ids]
                all_tags_scores.append(tags_scores_normalized)
                # Drop the first and last element of the decoded sequence
                # (presumably the CRF begin/end padding - confirm in model.py).
                y_preds = np.array(output)[1:-1]
            else:
                y_preds = tmp_model.f_eval(*model_input).argmax(axis=1)

            # One-hot vote of this model for every token, in standard tag ids.
            y_preds_score = np.zeros((len(y_preds), len(tmp_model.model.id_to_tag)))
            for i, y_pred in enumerate(y_preds):
                y_preds_score[
                    i,
                    standard_tag_to_id[tmp_model.model.id_to_tag[y_pred]]
                ] = 1.0

            all_y_preds.append(y_preds_score)

        # bagging
        vote_counts = sum(all_y_preds)
        summed_scores = sum(all_tags_scores) if all_tags_scores else None
        bagged_ids = []
        for i, token_votes in enumerate(vote_counts):
            ranked_votes = sorted(token_votes, reverse=True)
            is_tie = len(ranked_votes) > 1 and ranked_votes[0] == ranked_votes[1]
            if is_tie and summed_scores is not None:
                # Tie between the two most voted tags: fall back to scores.
                bagged_ids.append(np.argmax(summed_scores[i]))
            else:
                bagged_ids.append(np.argmax(token_votes))
        y_preds = [standard_id_to_tag[tmp_id] for tmp_id in bagged_ids]

        # result
        y_preds = iobes_iob(y_preds)
        mat_begin = None
        for tmp_index, y_pred in enumerate(y_preds):
            tmp_token = input_sent[tmp_index]
            if y_pred.startswith('B-'):
                mat_begin = y_pred[2:]
                # Each list gets its OWN copy of the token. Sharing a single
                # object between the two lists made every continuation token
                # get appended twice to the same 'text' and also rewrote the
                # caller's input tokens.
                recognitionResult['all_materials'].append(copy.copy(tmp_token))
                recognitionResult[abbr_to_type[mat_begin]].append(copy.copy(tmp_token))
            elif y_pred.startswith('I-') and mat_begin == y_pred[2:]:
                # Continuation of the current material: extend both records.
                for tmp_material in (
                    recognitionResult['all_materials'][-1],
                    recognitionResult[abbr_to_type[mat_begin]][-1],
                ):
                    tmp_material['end'] = tmp_token['end']
                    tmp_material['text'] += ' ' + tmp_token['text']
            else:
                mat_begin = None

        return recognitionResult
build(self, word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings - if pre_emb: + if pre_emb and training: new_weights = word_layer.embeddings.get_value() print('Loading pretrained embeddings from %s...' % pre_emb) pretrained = {} @@ -372,7 +372,7 @@ def build(self, # Network parameters params = [] - if word_dim and (not pre_emb): + if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: @@ -457,8 +457,10 @@ def build(self, else: f_eval = theano.function( inputs=eval_inputs, - outputs=forward(observations, transitions, viterbi=True, - return_alpha=False, return_best_sequence=True), + outputs= [tags_scores, forward(observations, transitions, viterbi=True, + return_alpha=False, return_best_sequence=True),], + # outputs=forward(observations, transitions, viterbi=True, + # return_alpha=False, return_best_sequence=True), givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) diff --git a/materials_entity_recognition/scripts/sent_ele_func.py b/materials_entity_recognition/scripts/sent_ele_func.py index 2a1b8d9..b96ad8d 100644 --- a/materials_entity_recognition/scripts/sent_ele_func.py +++ b/materials_entity_recognition/scripts/sent_ele_func.py @@ -6,7 +6,7 @@ __email__ = 'tanjin_he@berkeley.edu, rongzq08@gmail.com' # constant -mp = MaterialParser(pubchem_lookup=True) +mp = MaterialParser(pubchem_lookup=False) allNonMetalElements = set(['C', 'H', 'O', 'N', 'Cl', 'F', 'P', 'S', 'Br', 'I', 'Se'] + ['He', 'Ne', 'Ar', 'Kr', 'Xe', 'Rn']) # element table by symbol of elements elementTable = { @@ -36,7 +36,7 @@ 'Lv': [116, 'livermorium'], 'Ts': [117, 'tennessine'], 'Og': [118, 'oganesson'], } allElements = list(elementTable.keys()) - +allElements = sorted(allElements, key=lambda ele: len(ele), reverse=True) def parse_material(material_text, para_text): # goal @@ -55,7 +55,7 @@ def parse_material(material_text, para_text): if len(dopants) > 0: parsed_material['dopants'] = dopants try: - # 
material parser version 6.0.3 + # material parser version 6.1.0 list_of_materials = mp.split_materials_list(new_material2) list_of_materials = list_of_materials if list_of_materials != [] else [(new_material2, '')] tmp_structure = [] @@ -67,8 +67,8 @@ def parse_material(material_text, para_text): else: # print('unresolved') pass - except Exception as e: - print('Error!', e) + except: + # print('unresolved') pass return parsed_material @@ -82,10 +82,6 @@ def merge_struct_comp(struct_list): # get all compositions from struct_list for tmp_struct in struct_list: - if (set(tmp_struct['elements'].keys()) == {'H', 'O'} - and len(struct_list) > 1): - # not take H2O into account - continue if tmp_struct.get('amount', '1.0') != '1.0': # multiply by coefficient if amount is not 1 tmp_comp = {} @@ -117,7 +113,8 @@ def count_metal_ele(material_text, para_text): # get ele feature for one material def get_ele_feature(material_text, para_text): metal_ele_num = 0 - only_CHO = False + only_CHO = 0 + material_text = material_text.strip() parsed_material = parse_material(material_text, para_text) if parsed_material['composition']: ele_set = set(parsed_material['composition'].keys()) diff --git a/materials_entity_recognition/test/pre_tokens.py b/materials_entity_recognition/test/pre_tokens.py index 9950dbe..7f1e8fe 100644 --- a/materials_entity_recognition/test/pre_tokens.py +++ b/materials_entity_recognition/test/pre_tokens.py @@ -30,5 +30,5 @@ for tmp_para in paras: CDE_para = CDE.doc.Paragraph(tmp_para) pre_tokens = [tmp_sent.tokens for tmp_sent in CDE_para] - all_materials, precursors, targets, other_materials = model_new.mat_recognize(tmp_para, pre_tokens=pre_tokens) + result = model_new.mat_recognize(tmp_para, pre_tokens=pre_tokens)