Getting started with distributed computing
PySpark is the Python API for Apache Spark, an open-source, distributed computing system used for big data processing and analytics.
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session for this application.
# getOrCreate() returns the already-active session if one exists,
# so running this cell/script twice does not start a second JVM.
spark = SparkSession.builder.appName("FirstPySparkApp").getOrCreate()

try:
    # A tiny in-memory dataset of (technology, score) rows.
    data = [("Python", 100), ("Spark", 200), ("Hadoop", 150)]

    # createDataFrame infers column types (str, int) from the tuples;
    # the second argument supplies the column names.
    df = spark.createDataFrame(data, ["Technology", "Score"])

    # Print the DataFrame contents as an ASCII table to stdout.
    df.show()
finally:
    # Always release the driver/executor resources held by the session,
    # even if DataFrame construction or show() raises.
    spark.stop()