Spark-NLP

export JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64/"
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.types import StructType, StructField
from sparknlp import DocumentAssembler
from sparknlp.annotator import SentenceDetector, Tokenizer, WordEmbeddingsModel, PerceptronModel, NerCrfModel
# Build (or reuse) the Spark session used for the whole example.
spark = (
    SparkSession.builder
    .appName("ex1")
    # Run locally, using all available cores.
    .master("local[*]")
    .config("spark.driver.memory", "8G")
    .config("spark.driver.maxResultSize", "2G")
    # Have Spark fetch the Spark NLP 2.3.1 package when the session starts.
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.3.1")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)
# Load every snappy-compressed parquet part file under data/ into one DataFrame.
# The pipeline below reads its input from a "text" column of this frame.
parts = spark.read.parquet("data/part*.snappy.parquet")
  • DocumentAssembler
  • SentenceDetector
  • Tokenizer
  • WordEmbeddingsModel (Word Embeddings, Glove)
  • PerceptronModel (Part of Speech)
  • NerCrfModel (Named Entity Recognition)
# Stage 1: wrap the raw "text" column into a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into sentences.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: split each sentence into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: pretrained word embeddings, looked up per token.
wordEmbeddingsModel = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", "token"])
    .setOutputCol("word_embeddings")
)

# Stage 5: pretrained part-of-speech tagger.
perceptronModel = (
    PerceptronModel.pretrained()
    .setInputCols(["token", "document"])
    .setOutputCol("pos")
)

# Stage 6: pretrained CRF named-entity recognizer; it consumes the tokens,
# their POS tags and their embeddings produced by the stages above.
nerCrfModel = (
    NerCrfModel.pretrained()
    .setInputCols(["document", "token", "pos", "word_embeddings"])
    .setOutputCol("ne")
)

# Chain the stages in dependency order.
pipeline = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    wordEmbeddingsModel,
    perceptronModel,
    nerCrfModel,
])

# Fit the pipeline on the input frame and annotate that same frame.
df = pipeline.fit(parts).transform(parts)

# Keep only the annotation strings: per input row, one array of tokens,
# one array of NER tags and one array of POS tags.
tok_ne_pos_df = (
    df
    .select("token.result", "ne.result", "pos.result")
    .toDF("token_result_arr", "ne_result_arr", "pos_result_arr")
)

tok_ne_pos_df.show()

# Flatten the three arrays into one row per token so the tags can be filtered
# and grouped as plain string columns (token_result, ne_result, pos_result).
# The arrays are assumed to be aligned one entry per token; zip() stops at the
# shortest array if they are not.
zip_annotations = F.udf(
    lambda tokens, ne_tags, pos_tags: list(zip(tokens, ne_tags, pos_tags)),
    ArrayType(StructType([
        StructField("token_result", StringType()),
        StructField("ne_result", StringType()),
        StructField("pos_result", StringType()),
    ])),
)

tok_ne_pos_df = (
    tok_ne_pos_df
    .select(
        F.explode(
            zip_annotations("token_result_arr", "ne_result_arr", "pos_result_arr")
        ).alias("annotation")
    )
    .select(
        "annotation.token_result",
        "annotation.ne_result",
        "annotation.pos_result",
    )
)

# Exclude tokens that are not part of any named entity (tagged 'O').
tok_ne_pos_df = tok_ne_pos_df.filter(F.col("ne_result") != 'O')

# Count how often each (entity tag, POS tag) pair occurs, most frequent first.
tok_ne_pos_df.groupBy("ne_result", "pos_result").count().sort(F.col("count").desc()).show()
+---------+----------+-----+
|ne_result|pos_result|count|
+---------+----------+-----+
|    I-PER|       NNP| 1445|
|    I-LOC|       NNP| 1022|
|    I-ORG|       NNP|  962|
|   I-MISC|       NNP|  390|
|   I-MISC|        JJ|  157|
|    I-ORG|        IN|   23|
|    I-ORG|        JJ|   15|
|    I-ORG|        CC|   12|
|    I-ORG|      NNPS|   10|
|   I-MISC|      NNPS|   10|
|    I-PER|        NN|    7|
|    I-LOC|      NNPS|    6|
|   I-MISC|        CD|    6|
|    I-ORG|        NN|    5|
|    I-ORG|        CD|    4|
|    I-ORG|        DT|    3|
|   I-MISC|        NN|    3|
|    I-ORG|       NNS|    3|
|    I-ORG|         (|    2|
|    I-ORG|         )|    2|
+---------+----------+-----+
only showing top 20 rows

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store