Duomenys paimti iš tarptautinės filmų duomenų bazės, IMDb: https://datasets.imdbws.com/
import sys
!{sys.executable} -m pip install langid
!{sys.executable} -m pip install iso-639
!{sys.executable} -m pip install -U pandasql
import pandas as pd
import langid
from iso639 import languages
from pandasql import sqldf
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
films_original = pd.read_csv('C:\\Users\\rokas\\Desktop\\Jupyter\\Projektas\\Duomenys\\title.basics.tsv\\data.tsv', sep='\t', dtype='str', na_values=[r'\N'])
ratings_original = pd.read_csv('C:\\Users\\rokas\\Desktop\\Jupyter\\Projektas\\Duomenys\\title.ratings.tsv\\data.tsv', sep='\t', na_values=[r'\N'])
films_original
| tconst | titleType | primaryTitle | originalTitle | isAdult | startYear | endYear | runtimeMinutes | genres | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | tt0000001 | short | Carmencita | Carmencita | 0 | 1894 | NaN | 1 | Documentary,Short |
| 1 | tt0000002 | short | Le clown et ses chiens | Le clown et ses chiens | 0 | 1892 | NaN | 5 | Animation,Short |
| 2 | tt0000003 | short | Pauvre Pierrot | Pauvre Pierrot | 0 | 1892 | NaN | 4 | Animation,Comedy,Romance |
| 3 | tt0000004 | short | Un bon bock | Un bon bock | 0 | 1892 | NaN | 12 | Animation,Short |
| 4 | tt0000005 | short | Blacksmith Scene | Blacksmith Scene | 0 | 1893 | NaN | 1 | Comedy,Short |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8578739 | tt9916848 | tvEpisode | Episode #3.17 | Episode #3.17 | 0 | 2010 | NaN | NaN | Action,Drama,Family |
| 8578740 | tt9916850 | tvEpisode | Episode #3.19 | Episode #3.19 | 0 | 2010 | NaN | NaN | Action,Drama,Family |
| 8578741 | tt9916852 | tvEpisode | Episode #3.20 | Episode #3.20 | 0 | 2010 | NaN | NaN | Action,Drama,Family |
| 8578742 | tt9916856 | short | The Wind | The Wind | 0 | 2015 | NaN | 27 | Short |
| 8578743 | tt9916880 | tvEpisode | Horrid Henry Knows It All | Horrid Henry Knows It All | 0 | 2014 | NaN | 10 | Adventure,Animation,Comedy |
8578744 rows × 9 columns
ratings_original
| tconst | averageRating | numVotes | |
|---|---|---|---|
| 0 | tt0000001 | 5.7 | 1846 |
| 1 | tt0000002 | 6.0 | 238 |
| 2 | tt0000003 | 6.5 | 1611 |
| 3 | tt0000004 | 6.1 | 154 |
| 4 | tt0000005 | 6.2 | 2436 |
| ... | ... | ... | ... |
| 1201926 | tt9916682 | 5.6 | 5 |
| 1201927 | tt9916690 | 6.6 | 5 |
| 1201928 | tt9916720 | 5.5 | 164 |
| 1201929 | tt9916766 | 6.7 | 18 |
| 1201930 | tt9916778 | 7.3 | 33 |
1201931 rows × 3 columns
films_original['titleType'].unique()
array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie',
'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
dtype=object)
films = films_original[films_original['titleType'].isin (['movie', 'tvMovie'])]
films = pd.merge(films[['tconst', 'originalTitle', 'startYear', 'runtimeMinutes', 'genres']], ratings_original, on = 'tconst')
films = films[films['numVotes'] >= 100]
films
| tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | |
|---|---|---|---|---|---|---|---|
| 1 | tt0000574 | The Story of the Kelly Gang | 1906 | 70 | Action,Adventure,Biography | 6.1 | 744 |
| 34 | tt0001258 | Den hvide slavehandel | 1910 | 45 | Drama | 5.7 | 139 |
| 60 | tt0001892 | Den sorte drøm | 1911 | 53 | Drama | 5.9 | 220 |
| 68 | tt0002101 | Cleopatra | 1912 | 100 | Drama,History | 5.1 | 524 |
| 69 | tt0002130 | L'Inferno | 1911 | 71 | Adventure,Drama,Fantasy | 7.0 | 2751 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 321422 | tt9914286 | Sokagin Çocuklari | 2019 | 98 | Drama,Family | 7.6 | 218 |
| 321426 | tt9914942 | La vida sense la Sara Amat | 2019 | 74 | Drama | 6.9 | 136 |
| 321436 | tt9916190 | Safeguard | 2020 | 90 | Action,Adventure,Thriller | 3.5 | 227 |
| 321438 | tt9916270 | Il talento del calabrone | 2020 | 84 | Thriller | 5.8 | 1260 |
| 321439 | tt9916362 | Akelarre | 2020 | 92 | Drama,History | 6.4 | 4226 |
122480 rows × 7 columns
vertimas = []
l = len(films)
for i in range(l):
vertimas.append(langid.classify(films.iloc[i, 1])[0])
films['language'] = vertimas
films
| tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | language | |
|---|---|---|---|---|---|---|---|---|
| 1 | tt0000574 | The Story of the Kelly Gang | 1906 | 70 | Action,Adventure,Biography | 6.1 | 744 | en |
| 34 | tt0001258 | Den hvide slavehandel | 1910 | 45 | Drama | 5.7 | 139 | da |
| 60 | tt0001892 | Den sorte drøm | 1911 | 53 | Drama | 5.9 | 220 | nb |
| 68 | tt0002101 | Cleopatra | 1912 | 100 | Drama,History | 5.1 | 524 | sk |
| 69 | tt0002130 | L'Inferno | 1911 | 71 | Adventure,Drama,Fantasy | 7.0 | 2751 | en |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 321422 | tt9914286 | Sokagin Çocuklari | 2019 | 98 | Drama,Family | 7.6 | 218 | tr |
| 321426 | tt9914942 | La vida sense la Sara Amat | 2019 | 74 | Drama | 6.9 | 136 | es |
| 321436 | tt9916190 | Safeguard | 2020 | 90 | Action,Adventure,Thriller | 3.5 | 227 | en |
| 321438 | tt9916270 | Il talento del calabrone | 2020 | 84 | Thriller | 5.8 | 1260 | it |
| 321439 | tt9916362 | Akelarre | 2020 | 92 | Drama,History | 6.4 | 4226 | et |
122480 rows × 8 columns
films['language'] = films['language'].apply(lambda x: languages.get(alpha2=x).name)
films
| tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | language | |
|---|---|---|---|---|---|---|---|---|
| 1 | tt0000574 | The Story of the Kelly Gang | 1906 | 70 | Action,Adventure,Biography | 6.1 | 744 | English |
| 34 | tt0001258 | Den hvide slavehandel | 1910 | 45 | Drama | 5.7 | 139 | Danish |
| 60 | tt0001892 | Den sorte drøm | 1911 | 53 | Drama | 5.9 | 220 | Norwegian Bokmål |
| 68 | tt0002101 | Cleopatra | 1912 | 100 | Drama,History | 5.1 | 524 | Slovak |
| 69 | tt0002130 | L'Inferno | 1911 | 71 | Adventure,Drama,Fantasy | 7.0 | 2751 | English |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 321422 | tt9914286 | Sokagin Çocuklari | 2019 | 98 | Drama,Family | 7.6 | 218 | Turkish |
| 321426 | tt9914942 | La vida sense la Sara Amat | 2019 | 74 | Drama | 6.9 | 136 | Spanish |
| 321436 | tt9916190 | Safeguard | 2020 | 90 | Action,Adventure,Thriller | 3.5 | 227 | English |
| 321438 | tt9916270 | Il talento del calabrone | 2020 | 84 | Thriller | 5.8 | 1260 | Italian |
| 321439 | tt9916362 | Akelarre | 2020 | 92 | Drama,History | 6.4 | 4226 | Estonian |
122480 rows × 8 columns
films.to_csv('C:\\Users\\rokas\\Desktop\\Jupyter\\Projektas\\Duomenys\\movies_final.csv')
rating0 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films group by language order by average_rating desc")
rating0.index += 1
rating0.head(20)
| language | average_rating | movie_count | |
|---|---|---|---|
| 1 | Faroese | 7.066667 | 3 |
| 2 | Lao | 6.650000 | 2 |
| 3 | Japanese | 6.642857 | 7 |
| 4 | Croatian | 6.560582 | 378 |
| 5 | Walloon | 6.532493 | 357 |
| 6 | Icelandic | 6.515476 | 84 |
| 7 | Zulu | 6.498333 | 60 |
| 8 | Javanese | 6.478107 | 169 |
| 9 | Irish | 6.424324 | 111 |
| 10 | Slovak | 6.411358 | 405 |
| 11 | Occitan (post 1500) | 6.402542 | 118 |
| 12 | Sinhala | 6.400000 | 1 |
| 13 | Modern Greek (1453-) | 6.400000 | 4 |
| 14 | Galician | 6.373418 | 158 |
| 15 | Aragonese | 6.356250 | 32 |
| 16 | Vietnamese | 6.347315 | 298 |
| 17 | Kinyarwanda | 6.340741 | 216 |
| 18 | Bosnian | 6.320755 | 53 |
| 19 | Albanian | 6.319767 | 86 |
| 20 | Catalan | 6.315769 | 260 |
en0 = rating0.copy()
en0 = en0[en0['language'] == 'English']
en0['percentage'] = round(en0['movie_count']/rating0['movie_count'].sum()*100, 1)
en0
| language | average_rating | movie_count | percentage | |
|---|---|---|---|---|
| 60 | English | 5.92134 | 72173 | 58.9 |
films[(films['language'] == 'English')]
| tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | language | |
|---|---|---|---|---|---|---|---|---|
| index | ||||||||
| 1 | tt0000574 | The Story of the Kelly Gang | 1906 | 70 | Action,Adventure,Biography | 6.1 | 744 | English |
| 69 | tt0002130 | L'Inferno | 1911 | 71 | Adventure,Drama,Fantasy | 7.0 | 2751 | English |
| 72 | tt0002199 | From the Manger to the Cross; or, Jesus of Naz... | 1912 | 60 | Biography,Drama | 5.8 | 576 | English |
| 80 | tt0002423 | Madame DuBarry | 1919 | 85 | Biography,Drama,Romance | 6.7 | 869 | English |
| 83 | tt0002452 | Independenta Romaniei | 1912 | 120 | History,War | 6.5 | 237 | English |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 321364 | tt9902160 | Herself | 2020 | 97 | Drama | 7.0 | 3025 | English |
| 321375 | tt9904802 | Enemy Lines | 2020 | 92 | War | 4.5 | 1477 | English |
| 321386 | tt9906262 | The Changin' Times of Ike White | 2019 | 77 | Documentary,Music | 6.7 | 202 | English |
| 321392 | tt9907782 | Eight for Silver | 2021 | 113 | Horror | 6.0 | 285 | English |
| 321436 | tt9916190 | Safeguard | 2020 | 90 | Action,Adventure,Thriller | 3.5 | 227 | English |
72173 rows × 8 columns
films[(films['language'] == 'Lithuanian')]
| tconst | originalTitle | startYear | runtimeMinutes | genres | averageRating | numVotes | language | |
|---|---|---|---|---|---|---|---|---|
| index | ||||||||
| 82 | tt0002445 | Quo Vadis? | 1913 | 120 | Drama,History | 6.2 | 351 | Lithuanian |
| 1020 | tt0010208 | Harakiri | 1919 | 80 | Drama | 5.6 | 690 | Lithuanian |
| 3093 | tt0018199 | Nevada | 1927 | 70 | Western | 6.2 | 121 | Lithuanian |
| 3865 | tt0020232 | Oblomok imperii | 1929 | 96 | Drama | 7.3 | 397 | Lithuanian |
| 4798 | tt0022048 | Shukujo to hige | 1931 | 75 | Comedy,Romance | 6.4 | 412 | Lithuanian |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 319606 | tt9581076 | Di jiu tianchang | 2019 | 185 | Drama | 7.7 | 3867 | Lithuanian |
| 319657 | tt9592116 | Aatos ja Amine | 2019 | 73 | Documentary | 7.5 | 208 | Lithuanian |
| 319757 | tt9614416 | Blackia | 2019 | 141 | Drama | 6.6 | 137 | Lithuanian |
| 319900 | tt9635540 | Darbar | 2020 | 159 | Action,Crime,Thriller | 6.1 | 5817 | Lithuanian |
| 320847 | tt9806192 | J'ai perdu mon corps | 2019 | 81 | Animation,Drama,Fantasy | 7.6 | 30605 | Lithuanian |
777 rows × 8 columns
rating1 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=1000 group by language order by average_rating desc")
rating1.index += 1
rating1.head(20)
| language | average_rating | movie_count | |
|---|---|---|---|
| 1 | Faroese | 8.100000 | 1 |
| 2 | Northern Sami | 7.600000 | 1 |
| 3 | Malagasy | 7.122222 | 45 |
| 4 | Walloon | 7.041284 | 109 |
| 5 | Croatian | 7.023333 | 90 |
| 6 | Irish | 7.004348 | 23 |
| 7 | Amharic | 7.000000 | 1 |
| 8 | Galician | 6.993548 | 31 |
| 9 | Vietnamese | 6.939130 | 92 |
| 10 | Kinyarwanda | 6.933929 | 56 |
| 11 | Javanese | 6.929412 | 51 |
| 12 | Icelandic | 6.922222 | 27 |
| 13 | Catalan | 6.911940 | 67 |
| 14 | Swahili (macrolanguage) | 6.832967 | 182 |
| 15 | Haitian | 6.807692 | 13 |
| 16 | Latin | 6.784211 | 19 |
| 17 | Basque | 6.768919 | 148 |
| 18 | Xhosa | 6.757447 | 47 |
| 19 | Breton | 6.751351 | 37 |
| 20 | Aragonese | 6.736364 | 11 |
en1 = rating1.copy()
en1 = en1[en1['language'] == 'English']
en1['percentage'] = round(en1['movie_count']/rating1['movie_count'].sum()*100, 1)
en1
| language | average_rating | movie_count | percentage | |
|---|---|---|---|---|
| 62 | English | 6.231188 | 24910 | 64.6 |
rating2 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=10000 group by language order by average_rating desc")
rating2.index += 1
rating2.head(20)
| language | average_rating | movie_count | |
|---|---|---|---|
| 1 | Icelandic | 8.500000 | 2 |
| 2 | Vietnamese | 7.753333 | 15 |
| 3 | Azerbaijani | 7.700000 | 1 |
| 4 | Xhosa | 7.657143 | 7 |
| 5 | Chinese | 7.633333 | 3 |
| 6 | Luxembourgish | 7.600000 | 1 |
| 7 | Latin | 7.600000 | 4 |
| 8 | Breton | 7.570000 | 10 |
| 9 | Slovak | 7.533333 | 12 |
| 10 | Norwegian Nynorsk | 7.460000 | 10 |
| 11 | Galician | 7.385714 | 7 |
| 12 | Javanese | 7.383333 | 6 |
| 13 | Walloon | 7.320000 | 15 |
| 14 | Lithuanian | 7.305405 | 37 |
| 15 | Esperanto | 7.300000 | 7 |
| 16 | Malagasy | 7.250000 | 4 |
| 17 | Romanian | 7.203846 | 26 |
| 18 | Volapük | 7.200000 | 1 |
| 19 | Portuguese | 7.181081 | 37 |
| 20 | Haitian | 7.175000 | 4 |
en2 = rating2.copy()
en2 = en2[en2['language'] == 'English']
en2['percentage'] = round(en2['movie_count']/rating2['movie_count'].sum()*100, 1)
en2
| language | average_rating | movie_count | percentage | |
|---|---|---|---|---|
| 50 | English | 6.568707 | 7334 | 75.3 |
rating3 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=100000 group by language order by average_rating desc")
rating3.index += 1
rating3.head(20)
| language | average_rating | movie_count | |
|---|---|---|---|
| 1 | Slovak | 8.400000 | 1 |
| 2 | Latvian | 8.350000 | 2 |
| 3 | Portuguese | 8.275000 | 4 |
| 4 | Walloon | 8.200000 | 1 |
| 5 | Chinese | 8.200000 | 2 |
| 6 | Vietnamese | 8.100000 | 2 |
| 7 | Tagalog | 8.100000 | 1 |
| 8 | Catalan | 8.100000 | 1 |
| 9 | Galician | 8.000000 | 1 |
| 10 | Norwegian | 7.950000 | 2 |
| 11 | Norwegian Nynorsk | 7.700000 | 2 |
| 12 | Indonesian | 7.540000 | 10 |
| 13 | Slovenian | 7.520000 | 5 |
| 14 | Lithuanian | 7.520000 | 5 |
| 15 | Welsh | 7.500000 | 1 |
| 16 | Kinyarwanda | 7.500000 | 1 |
| 17 | French | 7.480000 | 25 |
| 18 | Maltese | 7.400000 | 5 |
| 19 | Esperanto | 7.300000 | 1 |
| 20 | Italian | 7.292683 | 41 |
en3 = rating3.copy()
en3 = en3[en3['language'] == 'English']
en3['percentage'] = round(en3['movie_count']/rating3['movie_count'].sum()*100, 1)
en3
| language | average_rating | movie_count | percentage | |
|---|---|---|---|---|
| 30 | English | 7.021573 | 1678 | 80.5 |
rating4 = sqldf("select language, avg(averageRating) as average_rating, count(tconst) as movie_count from films where numVotes>=1000000 group by language order by average_rating desc")
rating4.index += 1
rating4.head(20)
| language | average_rating | movie_count | |
|---|---|---|---|
| 1 | Italian | 8.700000 | 1 |
| 2 | Maltese | 8.600000 | 1 |
| 3 | English | 8.535897 | 39 |
| 4 | French | 8.500000 | 1 |
| 5 | German | 8.450000 | 2 |
| 6 | Chinese | 8.400000 | 1 |
| 7 | Spanish | 8.150000 | 2 |
| 8 | Polish | 8.100000 | 2 |
| 9 | Norwegian Nynorsk | 8.100000 | 1 |
en4 = rating4.copy()
en4 = en4[en4['language'] == 'English']
en4['percentage'] = round(en4['movie_count']/rating4['movie_count'].sum()*100, 1)
en4
| language | average_rating | movie_count | percentage | |
|---|---|---|---|---|
| 3 | English | 8.535897 | 39 | 78.0 |
Kalbų atpažinimo algoritmas nėra tikslus (ypač mažesnėms kalboms, pvz. lietuvių), bet galime daryti išvadas apie bendras tendencijas, susijusias su angliškais filmais.
historical = films.groupby('startYear', as_index=False)['averageRating'].mean()
historical.tail(20)
| startYear | averageRating | |
|---|---|---|
| 95 | 2003 | 5.968877 |
| 96 | 2004 | 5.973783 |
| 97 | 2005 | 5.913279 |
| 98 | 2006 | 5.857472 |
| 99 | 2007 | 5.911971 |
| 100 | 2008 | 5.854835 |
| 101 | 2009 | 5.829136 |
| 102 | 2010 | 5.838862 |
| 103 | 2011 | 5.842653 |
| 104 | 2012 | 5.854820 |
| 105 | 2013 | 5.852284 |
| 106 | 2014 | 5.875215 |
| 107 | 2015 | 5.850643 |
| 108 | 2016 | 5.882517 |
| 109 | 2017 | 5.879383 |
| 110 | 2018 | 5.865897 |
| 111 | 2019 | 5.894913 |
| 112 | 2020 | 5.753712 |
| 113 | 2021 | 6.069348 |
| 114 | 2022 | 6.466667 |
fig, ax = plt.subplots()
ax.set_title('Vidutinis filmų reitingas 1900 - 2021', fontdict = {'fontsize' : 24})
ax.set_xlim(1900, 2021)
ax.xaxis.set_major_locator(MultipleLocator(10))
ax.plot(historical['startYear'], historical['averageRating'])
fig.set_size_inches(18.5, 10.5)
plt.xlabel('Metai', fontdict = {'fontsize' : 16})
plt.ylabel('Reitingas', fontdict = {'fontsize' : 16})
plt.show()
historical2 = historical
historical2= historical2[historical2['startYear'] >= 2010]
fig, ax = plt.subplots()
ax.set_title('Vidutinis filmų reitingas 2010 - 2021', fontdict = {'fontsize' : 24})
ax.set_xlim(2010, 2021)
ax.plot(historical2['startYear'], historical2['averageRating'])
fig.set_size_inches(18.5, 10.5)
plt.xlabel('Metai', fontdict = {'fontsize' : 16})
plt.ylabel('Reitingas', fontdict = {'fontsize' : 16})
plt.show()