{"id":1157,"date":"2024-01-31T10:32:08","date_gmt":"2024-01-31T10:32:08","guid":{"rendered":"https:\/\/sparkexample.com\/?p=1157"},"modified":"2024-01-31T17:32:58","modified_gmt":"2024-01-31T17:32:58","slug":"spark-join-and-types","status":"publish","type":"post","link":"https:\/\/sparkexample.com\/spark-join-and-types\/","title":{"rendered":"Spark join and types"},"content":{"rendered":"\n
Spark supports different join types as given below.<\/p>\n\n\n\n
By default the join type is inner if we do not provide the join type.<\/p>\n\n\n\n
Join in spark is a narrow transformation if it is broadcast join otherwise it is wide transformation(shuffling of the data happened)<\/p>\n\n\n\n
Create the student dataframe<\/p>\n\n\n\n
student_list = [(1, 'smith', "banglore"),\n (2, 'mark', "hyderabad"),\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0(3, 'holder', "mumbai")]\n\nstudent_schema = StructType([\n\u00a0\u00a0\u00a0StructField('id', IntegerType()),\n\u00a0\u00a0\u00a0StructField('name', StringType()),\n\u00a0\u00a0\u00a0StructField('address', StringType())\n])\nstudent_df = sparkSession.createDataFrame(student_list, student_schema)<\/code><\/pre>\n\n\n\nCreate the marks dataframe<\/p>\n\n\n\n
marks_list = [(1, 'math', 80),\n (1, 'science', 85),\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0(1, 'computer', 90),\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0(2, 'math', 78),\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0(2, 'science', 92),\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0(2, 'computer', 85),\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0(4, 'math', 71),\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0(4, 'science', 75),\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0(4, 'computer', 97)]\n\nmarks_schema = StructType([\n\u00a0\u00a0\u00a0StructField('id', IntegerType()),\n\u00a0\u00a0\u00a0StructField('subject', StringType()),\n\u00a0\u00a0\u00a0StructField('marks', StringType())\n])\n\nmarks_df = sparkSession.createDataFrame(marks_list, marks_schema)<\/code><\/pre>\n\n\n\nJoin student dataframe with marks dataframe:<\/p>\n\n\n\n
join_df = student_df.join(marks_df, ['id'], 'inner')\njoin_df.show()<\/code><\/pre>\n\n\n\nInner join output: <\/p>\n\n\n\n
+---+-----+---------+--------+-----+\n| id| name|\u00a0 address| subject|marks|\n+---+-----+---------+--------+-----+\n|\u00a0 1|smith| banglore|\u00a0 \u00a0 math| \u00a0 80|\n|\u00a0 1|smith| banglore| science| \u00a0 85|\n|\u00a0 1|smith| banglore|computer| \u00a0 90|\n|\u00a0 2| mark|hyderabad|\u00a0 \u00a0 math| \u00a0 80|\n|\u00a0 2| mark|hyderabad| science| \u00a0 85|\n|\u00a0 2| mark|hyderabad|computer| \u00a0 90|\n+---+-----+---------+--------+-----+<\/code><\/pre>\n\n\n\njoin_df = student_df.join(marks_df, ['id'], 'left')\njoin_df.show()<\/code><\/pre>\n\n\n\nLeft join output: <\/p>\n\n\n\n
+---+------+---------+--------+-----+\n| id|\u00a0 name|\u00a0 address| subject|marks|\n+---+------+---------+--------+-----+\n|\u00a0 1| smith| banglore|computer| \u00a0 90|\n|\u00a0 1| smith| banglore| science| \u00a0 85|\n|\u00a0 1| smith| banglore|\u00a0 \u00a0 math| \u00a0 80|\n|\u00a0 2|\u00a0 mark|hyderabad|computer| \u00a0 85|\n|\u00a0 2|\u00a0 mark|hyderabad| science| \u00a0 92|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 \u00a0 math| \u00a0 78|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 \u00a0 NULL| NULL|\n+---+------+---------+--------+-----+<\/code><\/pre>\n\n\n\njoin_df = student_df.join(marks_df, ['id'], 'right')\njoin_df.show()<\/code><\/pre>\n\n\n\nRight join output:<\/p>\n\n\n\n
+---+-----+---------+--------+-----+\n| id| name|\u00a0 address| subject|marks|\n+---+-----+---------+--------+-----+\n|\u00a0 1|smith| banglore|\u00a0 \u00a0 math| \u00a0 80|\n|\u00a0 1|smith| banglore| science| \u00a0 85|\n|\u00a0 1|smith| banglore|computer| \u00a0 90|\n|\u00a0 2| mark|hyderabad|\u00a0 \u00a0 math| \u00a0 78|\n|\u00a0 2| mark|hyderabad| science| \u00a0 92|\n|\u00a0 2| mark|hyderabad|computer| \u00a0 85|\n|\u00a0 4| NULL| \u00a0 \u00a0 NULL|\u00a0 \u00a0 math| \u00a0 71|\n|\u00a0 4| NULL| \u00a0 \u00a0 NULL| science| \u00a0 75|\n|\u00a0 4| NULL| \u00a0 \u00a0 NULL|computer| \u00a0 97|\n+---+-----+---------+--------+-----+<\/code><\/pre>\n\n\n\ncrossJoinDf = student_df.crossJoin(marks_df)\ncrossJoinDf.show()<\/code><\/pre>\n\n\n\nOutput of cross join:<\/p>\n\n\n\n
+---+------+---------+---+--------+-----+\n| id|\u00a0 name|\u00a0 address| id| subject|marks|\n+---+------+---------+---+--------+-----+\n|\u00a0 1| smith| banglore|\u00a0 1|\u00a0 \u00a0 math| \u00a0 80|\n|\u00a0 1| smith| banglore|\u00a0 1| science| \u00a0 85|\n|\u00a0 1| smith| banglore|\u00a0 1|computer| \u00a0 90|\n|\u00a0 1| smith| banglore|\u00a0 2|\u00a0 \u00a0 math| \u00a0 78|\n|\u00a0 1| smith| banglore|\u00a0 2| science| \u00a0 92|\n|\u00a0 1| smith| banglore|\u00a0 2|computer| \u00a0 85|\n|\u00a0 1| smith| banglore|\u00a0 4|\u00a0 \u00a0 math| \u00a0 71|\n|\u00a0 1| smith| banglore|\u00a0 4| science| \u00a0 75|\n|\u00a0 1| smith| banglore|\u00a0 4|computer| \u00a0 97|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 1|\u00a0 \u00a0 math| \u00a0 80|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 1| science| \u00a0 85|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 1|computer| \u00a0 90|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 2|\u00a0 \u00a0 math| \u00a0 78|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 2| science| \u00a0 92|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 2|computer| \u00a0 85|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 4|\u00a0 \u00a0 math| \u00a0 71|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 4| science| \u00a0 75|\n|\u00a0 2|\u00a0 mark|hyderabad|\u00a0 4|computer| \u00a0 97|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 1|\u00a0 \u00a0 math| \u00a0 80|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 1| science| \u00a0 85|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 1|computer| \u00a0 90|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 2|\u00a0 \u00a0 math| \u00a0 78|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 2| science| \u00a0 92|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 2|computer| \u00a0 85|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 4|\u00a0 \u00a0 math| \u00a0 71|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 4| science| \u00a0 75|\n|\u00a0 3|holder| \u00a0 mumbai|\u00a0 4|computer| \u00a0 97|\n+---+------+---------+---+--------+-----+<\/code><\/pre>\n\n\n\nSpark Join Strategies<\/strong><\/h1>\n\n\n\nBroadcast hash Join: <\/strong><\/p>\n\n\n\nIn the join, if one dataset is large and another is smaller enough that fits into memory,then spark optimizes the join and uses the broadcast join,By using broadcast join the shuffle operation does not happen.<\/p>\n\n\n\n
By default the value of spark.sql.autoBroadcastJoinThreshold=10485760 (10 MB).<\/strong>We can increase or decrease the value based on our requirement .<\/strong>If we do not want to use broadcast join, set spark.sql.autoBroadcastJoinThreshold= -1.<\/strong> In this case spark does not use broadcast join,spark either uses shuffle hash join or Shuffle sort-merge join.<\/p>\n\n\n\nShuffle Hash join: <\/strong><\/p>\n\n\n\nIn Shuffle Hash join, spark join two large dataset with creating the hash value of keys in both the dataset and put the same key in one partition on a single node.This join does not need sorting of the data. This join is more expensive than Broadcast hash join.<\/p>\n\n\n\n
spark.sql.join.preferSortMergeJoin=false (Default value is true)<\/strong><\/p>\n\n\n\nspark.sql.autoBroadcastJoinThreshold= -1<\/strong><\/p>\n\n\n\nShuffle Sort Merge join: <\/strong><\/p>\n\n\n\nIn this join, Both the dataset are going to be sorted and sent to the same key in a single node.<\/p>\n","protected":false},"excerpt":{"rendered":"
Spark supports different join types as given below. By default the join type is inner if we do not provide the join type. Join in spark is a narrow transformation if it is broadcast join otherwise it is wide transformation(shuffling of the data happened) Create the student dataframe Create the marks dataframe Join student dataframe […]<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"ocean_post_layout":"","ocean_both_sidebars_style":"","ocean_both_sidebars_content_width":0,"ocean_both_sidebars_sidebars_width":0,"ocean_sidebar":"","ocean_second_sidebar":"","ocean_disable_margins":"enable","ocean_add_body_class":"","ocean_shortcode_before_top_bar":"","ocean_shortcode_after_top_bar":"","ocean_shortcode_before_header":"","ocean_shortcode_after_header":"","ocean_has_shortcode":"","ocean_shortcode_after_title":"","ocean_shortcode_before_footer_widgets":"","ocean_shortcode_after_footer_widgets":"","ocean_shortcode_before_footer_bottom":"","ocean_shortcode_after_footer_bottom":"","ocean_display_top_bar":"default","ocean_display_header":"default","ocean_header_style":"","ocean_center_header_left_menu":"","ocean_custom_header_template":"","ocean_custom_logo":0,"ocean_custom_retina_logo":0,"ocean_custom_logo_max_width":0,"ocean_custom_logo_tablet_max_width":0,"ocean_custom_logo_mobile_max_width":0,"ocean_custom_logo_max_height":0,"ocean_custom_logo_tablet_max_height":0,"ocean_custom_logo_mobile_max_height":0,"ocean_header_custom_menu":"","ocean_menu_typo_font_family":"","ocean_menu_typo_font_subset":"","ocean_menu_typo_font_size":0,"ocean_menu_typo_font_size_tablet":0,"ocean_menu_typo_font_size_mobile":0,"ocean_menu_typo_font_size_unit":"px","ocean_menu_typo_font_weight":"","ocean_menu_typo_font_weight_tablet":"","ocean_menu_typo_font_weight_mobile":"","ocean_menu_typo_transform":"","ocean_menu_typo_transform_tablet":"","ocean_menu_typo_transform_mobile":"","ocean_menu_typo_line_height":0,"ocean_menu_typo_line_height_tablet":0,"ocean_menu_typo_line_height_mobile":0,"ocean_menu_typo_line_height_unit":"","ocean_menu_typo_spacing":0,"ocean_menu_typo_spacing_tablet":0,"ocean_menu_typo_spacing_mobile":0,"ocean_menu_typo_spacing_unit":"","ocean_menu_link_color":"","ocean_menu_link_color_hover":"","ocean_menu_link_color_active":"","ocean_menu_link_background":"","ocean_menu_link_hover_background":"","ocean_menu_link_active_background":"","ocean_menu_social_links_bg":"","ocean_menu_social_hover_links_bg":"","ocean_menu_social_links_color":"","ocean_menu_social_hover_links_color":"","ocean_disable_title":"default","ocean_disable_heading":"default","ocean_post_title":"","ocean_post_subheading":"","ocean_post_title_style":"","ocean_post_title_background_color":"","ocean_post_title_background":0,"ocean_post_title_bg_image_position":"","ocean_post_title_bg_image_attachment":"","ocean_post_title_bg_image_repeat":"","ocean_post_title_bg_image_size":"","ocean_post_title_height":0,"ocean_post_title_bg_overlay":0.5,"ocean_post_title_bg_overlay_color":"","ocean_disable_breadcrumbs":"default","ocean_breadcrumbs_color":"","ocean_breadcrumbs_separator_color":"","ocean_breadcrumbs_links_color":"","ocean_breadcrumbs_links_hover_color":"","ocean_display_footer_widgets":"default","ocean_display_footer_bottom":"default","ocean_custom_footer_template":"","ocean_post_oembed":"","ocean_post_self_hosted_media":"","ocean_post_video_embed":"","ocean_link_format":"","ocean_link_format_target":"self","ocean_quote_format":"","ocean_quote_format_link":"post","ocean_gallery_link_images":"on","ocean_gallery_id":[],"footnotes":""},"categories":[1],"tags":[],"jetpack_featured_media_url":"","_links":{"self":[{"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/posts\/1157"}],"collection":[{"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/comments?post=1157"}],"version-history":[{"count":4,"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/posts\/1157\/revisions"}],"predecessor-version":[{"id":1229,"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/posts\/1157\/revisions\/1229"}],"wp:attachment":[{"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/media?parent=1157"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/categories?post=1157"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/sparkexample.com\/wp-json\/wp\/v2\/tags?post=1157"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}